diff --git a/megablocks/layers/testing.py b/tests/layers/architectures.py
similarity index 84%
rename from megablocks/layers/testing.py
rename to tests/layers/architectures.py
index 4cd9500..da1c595 100644
--- a/megablocks/layers/testing.py
+++ b/tests/layers/architectures.py
@@ -7,15 +7,6 @@
 from megablocks.layers.arguments import Arguments
 
 
-def allclose(x, y, pct=0.5):
-    mask = torch.isclose(x, y, rtol=5e-2)
-    pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
-    if pct_diff > pct:
-        print('{:.2f}% of values not close.'.format(pct_diff))
-        return False
-    return True
-
-
 class FFN(torch.nn.Module):
 
     def __init__(self, args: Arguments):
diff --git a/tests/layers/dmoe_test.py b/tests/layers/dmoe_test.py
index a737ef4..3d6565c 100644
--- a/tests/layers/dmoe_test.py
+++ b/tests/layers/dmoe_test.py
@@ -7,8 +7,10 @@
 import torch
 
 from megablocks import grouped_gemm_util as gg
-from megablocks.layers import dmoe, moe, testing
 from megablocks.layers.arguments import Arguments
+from megablocks.layers.dmoe import dMoE
+from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
+from tests.layers.architectures import FFN
 
 # min size: (1, 2, 128, 2, 1)
 _FORWARD_TESTS_DEFAULT = (
@@ -64,9 +66,9 @@ def construct_moes(
         bf16=True,
     )
 
-    mlp = testing.FFN(args)
-    moe_mlp = moe.MoE(args)
-    dmoe_mlp = dmoe.dMoE(args)
+    mlp = FFN(args)
+    moe_mlp = MoE(args)
+    dmoe_mlp = dMoE(args)
 
     mlp.cuda(torch.cuda.current_device()).to(torch.bfloat16)
     moe_mlp.cuda(torch.cuda.current_device()).to(torch.bfloat16)
@@ -106,7 +108,7 @@ def test_dmoe_forward(
 
     out, _ = layer(x)
     assert out.shape == x.shape
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -132,12 +134,12 @@ def test_dmoe_forward_backward(
     out, _ = layer(x)
     assert out.shape == x.shape
 
-    loss = out.sum() + moe.batched_load_balancing_loss(args)
+    loss = out.sum() + batched_load_balancing_loss(args)
     loss.backward()
     assert x.grad is not None
     layer.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
diff --git a/tests/layers/glu_test.py b/tests/layers/glu_test.py
index d89af89..1e031de 100644
--- a/tests/layers/glu_test.py
+++ b/tests/layers/glu_test.py
@@ -7,8 +7,9 @@
 import stk
 import torch
 
-from megablocks.layers import dmlp_registry, testing
+from megablocks.layers import dmlp_registry
 from megablocks.layers.arguments import Arguments
+from tests.layers.architectures import GLU
 
 _DENSE_TESTS = (
     (16, 1024, 512),
@@ -36,7 +37,7 @@ def construct_dmoe_glu(
         bf16=True,
     )
 
-    glu = testing.GLU(args)
+    glu = GLU(args)
     dmoe_glu = dmlp_registry.get(args)
 
     dmoe_glu.cuda(torch.cuda.current_device()).to(torch.bfloat16)
diff --git a/tests/layers/moe_test.py b/tests/layers/moe_test.py
index dd40ef9..ffd32cb 100644
--- a/tests/layers/moe_test.py
+++ b/tests/layers/moe_test.py
@@ -6,8 +6,9 @@
 import pytest
 import torch
 
-from megablocks.layers import moe, testing
 from megablocks.layers.arguments import Arguments
+from megablocks.layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
+from tests.layers.architectures import FFN
 
 _FORWARD_TESTS = (
     (16, 1024, 512, 1, 1),
@@ -48,8 +49,8 @@ def construct_moe(
         init_method=init_method,
     )
 
-    mlp = testing.FFN(args)
-    moe_mlp = moe.MoE(args)
+    mlp = FFN(args)
+    moe_mlp = MoE(args)
 
     mlp.cuda(torch.cuda.current_device()).half()
     moe_mlp.cuda(torch.cuda.current_device()).half()
@@ -76,7 +77,7 @@ def test_moe_forward(bs: int, sl: int, hs: int, num_experts: int, top_k: int):
 
     out, _ = layer(x)
     assert out.shape == x.shape
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -101,11 +102,11 @@ def test_moe_forward_backward(
     out, _ = layer(x)
     assert out.shape == x.shape
 
-    loss = out.sum() + moe.batched_load_balancing_loss(args)
+    loss = out.sum() + batched_load_balancing_loss(args)
     loss.backward()
     layer.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -119,7 +120,7 @@ def test_moe_forward_vs_dense(bs: int, sl: int, hs: int):
     out, _ = moe_mlp(x)
     assert out.shape == x.shape == expected_out.shape
     assert torch.allclose(out, expected_out)
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
 
 @pytest.mark.gpu
@@ -137,7 +138,7 @@ def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
     w2_grad = moe_mlp.experts.mlp.w2.grad.detach().squeeze()
    moe_mlp.zero_grad(set_to_none=True)
     x.grad = None
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
 
     expected_out = mlp(x)
     expected_loss = expected_out.sum()
@@ -152,4 +153,4 @@ def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
     assert w2_grad.shape == expected_w2_grad.shape
     assert torch.allclose(w1_grad, expected_w1_grad)
     assert torch.allclose(w2_grad, expected_w2_grad)
-    moe.clear_load_balancing_loss()
+    clear_load_balancing_loss()
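
Note on the deleted helper: the tests above now call torch.allclose directly, which requires every element to match within tolerance, whereas the removed allclose helper tolerated a configurable percentage of mismatched elements. If a test needs that looser check again, a minimal standalone sketch equivalent to the removed code follows (the name mostly_close and its placement in a test module are illustrative, not part of this patch):

    import torch

    def mostly_close(x, y, pct=0.5, rtol=5e-2):
        # Allow up to `pct` percent of elements to fall outside the
        # relative tolerance, mirroring the helper removed from
        # megablocks/layers/testing.py in this patch.
        mask = torch.isclose(x, y, rtol=rtol)
        pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
        if pct_diff > pct:
            print('{:.2f}% of values not close.'.format(pct_diff))
            return False
        return True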