diff --git a/megablocks/layers/testing.py b/tests/layers/architectures.py
similarity index 84%
rename from megablocks/layers/testing.py
rename to tests/layers/architectures.py
index 4cd9500..da1c595 100644
--- a/megablocks/layers/testing.py
+++ b/tests/layers/architectures.py
@@ -7,15 +7,6 @@
 from megablocks.layers.arguments import Arguments
 
 
-def allclose(x, y, pct=0.5):
-    mask = torch.isclose(x, y, rtol=5e-2)
-    pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
-    if pct_diff > pct:
-        print('{:.2f}% of values not close.'.format(pct_diff))
-        return False
-    return True
-
-
 class FFN(torch.nn.Module):
 
     def __init__(self, args: Arguments):
diff --git a/tests/layers/dmoe_test.py b/tests/layers/dmoe_test.py
index a737ef4..3d6565c 100644
--- a/tests/layers/dmoe_test.py
+++ b/tests/layers/dmoe_test.py
@@ -7,8 +7,10 @@
 import torch
 
 from megablocks import grouped_gemm_util as gg
-from megablocks.layers import dmoe, moe, testing
 from megablocks.layers.arguments import Arguments
+from megablocks.layers.dmoe import dMoE
+from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
+from tests.layers.architectures import FFN
 
 # min size: (1, 2, 128, 2, 1)
 _FORWARD_TESTS_DEFAULT = (
@@ -64,9 +66,9 @@ def construct_moes(
         bf16=True,
     )
 
-    mlp = testing.FFN(args)
-    moe_mlp = moe.MoE(args)
-    dmoe_mlp = dmoe.dMoE(args)
+    mlp = FFN(args)
+    moe_mlp = MoE(args)
+    dmoe_mlp = dMoE(args)
 
     mlp.cuda(torch.cuda.current_device()).to(torch.bfloat16)
     moe_mlp.cuda(torch.cuda.current_device()).to(torch.bfloat16)
@@ -106,7 +108,7 @@ def test_dmoe_forward(
 
     out, _ = layer(x)
     assert out.shape == x.shape
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -132,12 +134,12 @@ def test_dmoe_forward_backward(
     out, _ = layer(x)
     assert out.shape == x.shape
 
-    loss = out.sum() + moe.batched_load_balancing_loss(args)
+    loss = out.sum() + batched_load_balancing_loss(args)
     loss.backward()
     assert x.grad is not None
     layer.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
diff --git a/tests/layers/glu_test.py b/tests/layers/glu_test.py
index d89af89..1e031de 100644
--- a/tests/layers/glu_test.py
+++ b/tests/layers/glu_test.py
@@ -7,8 +7,9 @@
 import stk
 import torch
 
-from megablocks.layers import dmlp_registry, testing
+from megablocks.layers import dmlp_registry
 from megablocks.layers.arguments import Arguments
+from tests.layers.architectures import GLU
 
 _DENSE_TESTS = (
     (16, 1024, 512),
@@ -36,7 +37,7 @@ def construct_dmoe_glu(
         bf16=True,
     )
 
-    glu = testing.GLU(args)
+    glu = GLU(args)
     dmoe_glu = dmlp_registry.get(args)
 
     dmoe_glu.cuda(torch.cuda.current_device()).to(torch.bfloat16)
diff --git a/tests/layers/moe_test.py b/tests/layers/moe_test.py
index dd40ef9..ffd32cb 100644
--- a/tests/layers/moe_test.py
+++ b/tests/layers/moe_test.py
@@ -6,8 +6,9 @@
 import pytest
 import torch
 
-from megablocks.layers import moe, testing
 from megablocks.layers.arguments import Arguments
+from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
+from tests.layers.architectures import FFN
 
 _FORWARD_TESTS = (
     (16, 1024, 512, 1, 1),
@@ -48,8 +49,8 @@ def construct_moe(
         init_method=init_method,
     )
 
-    mlp = testing.FFN(args)
-    moe_mlp = moe.MoE(args)
+    mlp = FFN(args)
+    moe_mlp = MoE(args)
 
     mlp.cuda(torch.cuda.current_device()).half()
     moe_mlp.cuda(torch.cuda.current_device()).half()
@@ -76,7 +77,7 @@ def test_moe_forward(bs: int, sl: int, hs: int, num_experts: int, top_k: int):
 
     out, _ = layer(x)
     assert out.shape == x.shape
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -101,11 +102,11 @@ def test_moe_forward_backward(
     out, _ = layer(x)
     assert out.shape == x.shape
 
-    loss = out.sum() + moe.batched_load_balancing_loss(args)
+    loss = out.sum() + batched_load_balancing_loss(args)
     loss.backward()
     layer.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -119,7 +120,7 @@ def test_moe_forward_vs_dense(bs: int, sl: int, hs: int):
     out, _ = moe_mlp(x)
     assert out.shape == x.shape == expected_out.shape
     assert torch.allclose(out, expected_out)
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -137,7 +138,7 @@ def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
     w2_grad = moe_mlp.experts.mlp.w2.grad.detach().squeeze()
    moe_mlp.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
     expected_out = mlp(x)
     expected_loss = expected_out.sum()
@@ -152,4 +153,4 @@ def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
     assert w2_grad.shape == expected_w2_grad.shape
     assert torch.allclose(w1_grad, expected_w1_grad)
     assert torch.allclose(w2_grad, expected_w2_grad)
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
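
Note on the deleted helper: the tests above now call torch.allclose directly, which requires every element to match within tolerance, whereas the removed allclose helper tolerated a configurable percentage of mismatched elements. If a test needs that looser check again, a minimal standalone sketch equivalent to the removed code follows (the name mostly_close and its placement in a test module are illustrative, not part of this patch):

    import torch

    def mostly_close(x, y, pct=0.5, rtol=5e-2):
        # Allow up to `pct` percent of elements to fall outside the
        # relative tolerance, mirroring the helper removed from
        # megablocks/layers/testing.py in this patch.
        mask = torch.isclose(x, y, rtol=rtol)
        pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
        if pct_diff > pct:
            print('{:.2f}% of values not close.'.format(pct_diff))
            return False
        return True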