diff --git a/examples/02-ops.rs b/examples/02-ops.rs
index bba44fa61..240663dad 100644
--- a/examples/02-ops.rs
+++ b/examples/02-ops.rs
@@ -3,7 +3,7 @@
 use dfdx::{
     shapes::{Rank0, Rank1, Rank2},
     tensor::{AsArray, AutoDevice, SampleTensor, Tensor},
-    tensor_ops::{MeanTo, TryMatMul},
+    tensor_ops::{MeanTo, TryStaticMatMul},
 };
 
 fn main() {
diff --git a/examples/03-nn.rs b/examples/03-nn.rs
index c3a532471..97c633064 100644
--- a/examples/03-nn.rs
+++ b/examples/03-nn.rs
@@ -33,7 +33,7 @@ fn main() {
 
     // Even dynamic size is supported;
     let batch_size = 3;
-    let _: Tensor<(usize, Const<2>), f32, _> = m.forward(dev.zeros_like(&(batch_size, Const)));
+    let _: Tensor<(usize, Const<2>), f32, _> = m.forward(dev.zeros_like(&(batch_size, Const::<2>)));
 
     // you can also combine multiple modules with tuples
     type Mlp = (Linear<4, 2>, ReLU, Linear<2, 1>);
diff --git a/examples/04-gradients.rs b/examples/04-gradients.rs
index b7357a18c..1975d9fb0 100644
--- a/examples/04-gradients.rs
+++ b/examples/04-gradients.rs
@@ -4,7 +4,7 @@ use dfdx::{
     nn::ZeroGrads,
     shapes::{Rank0, Rank2},
     tensor::{AsArray, AutoDevice, Gradients, NoneTape, OwnedTape, SampleTensor, Tensor, Trace},
-    tensor_ops::{Backward, MeanTo, TryMatMul},
+    tensor_ops::{Backward, MeanTo, TryStaticMatMul},
 };
 
 fn main() {
diff --git a/src/nn/linear.rs b/src/nn/linear.rs
index f666bd590..5e95537ae 100644
--- a/src/nn/linear.rs
+++ b/src/nn/linear.rs
@@ -20,6 +20,59 @@ where
     }
 }
 
+pub trait AssertLayerMatch<Rhs: Shape> {
+    const TYPE_CHECK: ();
+    fn assert_dim_eq(&self);
+}
+
+impl<const I: usize, const M: usize> AssertLayerMatch<(Const<I>, Const<M>)>
+    for Rank1<M>
+{
+    const TYPE_CHECK: () = assert!(
+        M == I,
+        "You are trying to stack tensors, whose outgoing and ingoing dimensions do not match",
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as AssertLayerMatch<(Const<I>, Const<M>)>>::TYPE_CHECK;
+    }
+}
+
+impl<IN: Dim, const OUT: usize, const I: usize, const M: usize>
+    AssertLayerMatch<(Const<I>, Const<M>)> for (IN, Const<OUT>)
+{
+    const TYPE_CHECK: () = assert!(
+        OUT == I,
+        "You are trying to stack tensors, whose outgoing and ingoing dimensions do not match",
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as AssertLayerMatch<(Const<I>, Const<M>)>>::TYPE_CHECK;
+    }
+}
+
+impl<B: Dim, IN: Dim, const OUT: usize, const I: usize, const M: usize>
+    AssertLayerMatch<(Const<I>, Const<M>)> for (B, IN, Const<OUT>)
+{
+    const TYPE_CHECK: () = assert!(
+        OUT == I,
+        "You are trying to stack tensors, whose outgoing and ingoing dimensions do not match",
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as AssertLayerMatch<(Const<I>, Const<M>)>>::TYPE_CHECK;
+    }
+}
+
+// impl<const OUT: usize, const I: usize, const O: usize> AssertLayerMatch<Rank2<I, O>>
+//     for Rank2<OUT, O>
+// {
+//     const TYPE_CHECK: () = assert!(
+//         OUT == I,
+//         "You are trying to stack tensors, whose outgoing and ingoing dimensions do not match {I}",
+//     );
+//     fn assert_dim_eq(&self) {
+//         let _ = <Self as AssertLayerMatch<Rank2<I, O>>>::TYPE_CHECK;
+//     }
+// }
+
 /// A linear transformation of the form `weight * x + bias`, where `weight` is a matrix, `x` is a vector or matrix,
 /// and `bias` is a vector.
 ///
@@ -92,8 +145,12 @@ impl<const I: usize, const O: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for Linear<I, O, E, D>
 impl<const I: usize, const O: usize, E: Dtype, D: Device<E>, T> Module<T> for Linear<I, O, E, D>
 where
-    T: SplitTape + TryMatMul<Tensor<Rank2<I, O>, E, D, T::Tape>> + HasErr<Err = D::Err>,
+    T: SplitTape
+        + TryStaticMatMul<Tensor<Rank2<I, O>, E, D, T::Tape>>
+        + HasErr<Err = D::Err>
+        + HasShape,
     T::Tape: Tape<E, D>,
+    T::Shape: AssertLayerMatch<Rank2<I, O>>,
     for<'a> Bias1D<'a, O, E, D>: Module<T::Output, Output = T::Output, Error = D::Err>,
 {
     type Output = T::Output;
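The `AssertLayerMatch` impls above rely on a const-evaluation idiom: an associated `const TYPE_CHECK: ()` whose initializer is a const-evaluable `assert!` over const generics. The constant is only evaluated when something references it, which is exactly what `assert_dim_eq` forces, so a dimension mismatch surfaces as a post-monomorphization compile error instead of a runtime panic. Below is a minimal, self-contained sketch of the idiom; `DimCheck` and `Mat` are hypothetical stand-ins, not dfdx types.

pub trait DimCheck<Rhs> {
    const TYPE_CHECK: ();
    fn assert_dim_eq(&self);
}

/// Stand-in for a matrix shape known at compile time.
pub struct Mat<const R: usize, const C: usize>;

impl<const R: usize, const K1: usize, const K2: usize, const C: usize>
    DimCheck<Mat<K2, C>> for Mat<R, K1>
{
    // Evaluated only when this impl is instantiated and the const is used,
    // so `K1 != K2` becomes a compile-time error.
    const TYPE_CHECK: () = assert!(K1 == K2, "inner dimensions do not match");

    fn assert_dim_eq(&self) {
        // Referencing the associated const forces its evaluation.
        let _ = <Self as DimCheck<Mat<K2, C>>>::TYPE_CHECK;
    }
}

fn main() {
    // Compiles: inner dimensions agree (3 == 3).
    <Mat<2, 3> as DimCheck<Mat<3, 4>>>::assert_dim_eq(&Mat::<2, 3>);
    // Fails to compile if uncommented: 3 != 5 trips the const assert.
    // <Mat<2, 3> as DimCheck<Mat<5, 4>>>::assert_dim_eq(&Mat::<2, 3>);
}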
@@ -101,6 +158,7 @@
     /// 1d forward using [matmul()] and [add()].
     fn try_forward(&self, x: T) -> Result<Self::Output, D::Err> {
+        x.shape().assert_dim_eq();
         let o = x.try_matmul(self.weight.retaped::<T::Tape>().try_permute()?)?;
         Bias1D { beta: &self.bias }.try_forward(o)
     }
diff --git a/src/nn/transformer/mha.rs b/src/nn/transformer/mha.rs
index a8ffbc952..3433cc2b9 100644
--- a/src/nn/transformer/mha.rs
+++ b/src/nn/transformer/mha.rs
@@ -127,11 +127,11 @@
 
         // Get weights
         let scalar: E = E::ONE / E::from_usize(K / H).unwrap().sqrt();
-        let weights = q.try_matmul(k)?.try_mul(scalar)?;
+        let weights = q.try_dynamic_matmul(k)?.try_mul(scalar)?;
         let weights = weights.try_softmax::<Axis<2>>()?;
 
         // Get new tokens
-        let tokens = weights.try_matmul(v)?;
+        let tokens = weights.try_dynamic_matmul(v)?;
         let tokens = tokens.try_permute::<_, Axes3<1, 0, 2>>()?;
         let tokens = tokens.try_reshape_like(&(s1, Const::<V>)).unwrap()?;
@@ -187,11 +187,11 @@
 
         // Get weights
         let scalar: E = E::ONE / E::from_usize(K / H).unwrap().sqrt();
-        let weights = q.try_matmul(k)?.try_mul(scalar)?;
+        let weights = q.try_dynamic_matmul(k)?.try_mul(scalar)?;
         let weights = weights.try_softmax::<Axis<3>>()?;
 
         // Get new tokens
-        let tokens = weights.try_matmul(v)?;
+        let tokens = weights.try_dynamic_matmul(v)?;
         let tokens = tokens.try_permute::<_, Axes4<0, 2, 1, 3>>()?;
         let tokens = tokens.try_reshape_like(&(b, s1, Const::<V>)).unwrap()?;
diff --git a/src/nn/unbiased_linear.rs b/src/nn/unbiased_linear.rs
index 50f1b120a..ce494356e 100644
--- a/src/nn/unbiased_linear.rs
+++ b/src/nn/unbiased_linear.rs
@@ -79,7 +79,7 @@ impl<const I: usize, const O: usize, E: Dtype, D: Device<E>, T> Module<T> for UnbiasedLinear<I, O, E, D>
 where
-    T: SplitTape + TryMatMul<Tensor<Rank2<I, O>, E, D, T::Tape>> + HasErr<Err = D::Err>,
+    T: SplitTape + TryStaticMatMul<Tensor<Rank2<I, O>, E, D, T::Tape>> + HasErr<Err = D::Err>,
     T::Tape: Tape<E, D>,
 {
     type Output = T::Output;
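Both mha.rs hunks above compute standard scaled dot-product attention: with per-head width d = K / H, weights = softmax(q k^T / sqrt(d)) and output = weights v. Switching those two products to `try_dynamic_matmul` is what lets the sequence lengths stay runtime `usize` values. For reference, a dependency-free sketch of the scoring step (f32 slices, hypothetical function name, not dfdx code):

// scores[i][j] = dot(q[i], k[j]) / sqrt(d), then softmax over j.
fn attention_scores(q: &[Vec<f32>], k: &[Vec<f32>], d: usize) -> Vec<Vec<f32>> {
    let scale = 1.0 / (d as f32).sqrt();
    q.iter()
        .map(|qi| {
            let logits: Vec<f32> = k
                .iter()
                .map(|kj| qi.iter().zip(kj).map(|(a, b)| a * b).sum::<f32>() * scale)
                .collect();
            // Numerically stable softmax over j.
            let max = logits.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
            let exps: Vec<f32> = logits.iter().map(|l| (l - max).exp()).collect();
            let z: f32 = exps.iter().sum();
            exps.iter().map(|e| e / z).collect()
        })
        .collect()
}

fn main() {
    let q = vec![vec![1.0, 0.0]];
    let k = vec![vec![1.0, 0.0], vec![0.0, 1.0]];
    let w = attention_scores(&q, &k, 2);
    // Each row of attention weights sums to 1.
    assert!((w[0][0] + w[0][1] - 1.0).abs() < 1e-6);
}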
diff --git a/src/tensor_ops/matmul/cpu_kernel.rs b/src/tensor_ops/matmul/cpu_kernel.rs
index e8e235392..800bca9db 100644
--- a/src/tensor_ops/matmul/cpu_kernel.rs
+++ b/src/tensor_ops/matmul/cpu_kernel.rs
@@ -17,6 +17,8 @@ use cblas_sys::{
 ))]
 use matrixmultiply::{dgemm, sgemm};
 
+use super::MulStaticDimCheck;
+
 #[cfg(not(any(
     feature = "cpu-seq-matmul",
     feature = "cpu-par-matmul",
@@ -221,10 +223,10 @@ impl<E: Dtype> super::VecMatKernel<E> for Cpu
 where
     Self: MatMulImpl<E>,
 {
-    fn forward<K: Dim, N: Dim>(
+    fn forward<LeftK: Dim + MulStaticDimCheck<(RightK, N)>, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(K,), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
+        lhs: &Tensor<(LeftK,), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
     ) -> Result<Tensor<(N,), E, Self>, Self::Err> {
         let (k, n) = rhs.shape;
         let mut out = self.try_zeros_like(&(n,))?;
@@ -239,11 +241,11 @@
         );
         Ok(out)
     }
-    fn backward<K: Dim, N: Dim>(
+    fn backward<LeftK: Dim + MulStaticDimCheck<(RightK, N)>, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(K,), E, Self>,
+        lhs: &Tensor<(LeftK,), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
     ) -> Result<(), Self::Err> {
@@ -271,15 +273,16 @@
     }
 }
 
-impl<E: Dtype> super::MatMatKernel<E> for Cpu
+impl<E: Dtype> super::StaticMatMatKernel<E> for Cpu
 where
     Self: MatMulImpl<E>,
 {
-    fn forward<M: Dim, K: Dim, N: Dim>(
+    fn forward<M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(M, K), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
+        lhs: &Tensor<(M, LeftK), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
     ) -> Result<Tensor<(M, N), E, Self>, Self::Err> {
+        // assert_eq!(lhs.shape.1.size(), rhs.shape.0.size());
         let (m, k) = lhs.shape;
         let n = rhs.shape.1;
         let mut out = self.try_zeros_like(&(m, n))?;
@@ -294,14 +297,15 @@
         );
         Ok(out)
     }
-    fn backward<M: Dim, K: Dim, N: Dim>(
+    fn backward<M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(M, K), E, Self>,
+        lhs: &Tensor<(M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
     ) -> Result<(), Self::Err> {
+        // assert_eq!(lhs.shape.1.size(), rhs.shape.0.size());
         let (m, k) = lhs.shape;
         let n = rhs.shape.1;
         let strides = (m, n).strides();
@@ -331,10 +335,10 @@ impl<E: Dtype> super::MatMatBrKernel<E> for Cpu
 where
     Self: MatMulImpl<E>,
 {
-    fn forward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn forward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
     ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err> {
         let (batch, m, k) = lhs.shape;
         let n = rhs.shape.1;
@@ -353,11 +357,11 @@
         }
         Ok(out)
     }
-    fn backward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
     ) -> Result<(), Self::Err> {
@@ -388,14 +392,14 @@
     }
 }
 
-impl<E: Dtype> super::MatMatBatch3Kernel<E> for Cpu
+impl<E: Dtype> super::StaticMatMatBatch3Kernel<E> for Cpu
 where
     Self: MatMulImpl<E>,
 {
-    fn forward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn forward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
-        rhs: &Tensor<(B, K, N), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
+        rhs: &Tensor<(B, RightK, N), E, Self>,
     ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err> {
         let (b, m, k) = lhs.shape;
         let n = rhs.shape.2;
@@ -416,11 +420,11 @@
         }
         Ok(out)
     }
-    fn backward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(B, K, N), E, Self>,
+        rhs: &Tensor<(B, RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
     ) -> Result<(), Self::Err> {
@@ -451,14 +455,140 @@
     }
 }
 
-impl<E: Dtype> super::MatMatBatch4Kernel<E> for Cpu
+impl<E: Dtype> super::DynamicMatMatBatch3Kernel<E> for Cpu
 where
     Self: MatMulImpl<E>,
 {
-    fn forward<B: Dim, S: Dim, M: Dim, K: Dim, N: Dim>(
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
         &self,
-        lhs: &Tensor<(B, S, M, K), E, Self>,
-        rhs: &Tensor<(B, S, K, N), E, Self>,
+        lhs: &Tensor<(B, S1, usize), E, Self>,
+        rhs: &Tensor<(B, usize, S2), E, Self>,
+    ) -> Result<Tensor<(B, S1, S2), E, Self>, Self::Err> {
+        let (b, m, k) = lhs.shape;
+        let n = rhs.shape.2;
+        let mut out = self.try_zeros_like(&(b, m, n))?;
+        let ap = lhs.data.as_ref();
+        let bp = rhs.data.as_ref();
+        let cp = Arc::get_mut(&mut out.data).unwrap();
+        for i in 0..b.size() {
+            Self::matmul(
+                (m, k, n),
+                ap[i * lhs.strides[0]..].as_ptr(),
+                [lhs.strides[1], lhs.strides[2]],
+                bp[i * rhs.strides[0]..].as_ptr(),
+                [rhs.strides[1], rhs.strides[2]],
+                cp[i * out.strides[0]..].as_mut_ptr(),
+                [out.strides[1], out.strides[2]],
+            )
+        }
+        Ok(out)
+    }
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, S1, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, usize, S2), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err> {
+        let (b, m, k) = lhs.shape;
+        let n = rhs.shape.2;
+        let strides = (b, m, n).strides();
+        for i in 0..b.size() {
+            Self::matmul(
+                (m, n, k),
+                grad_out[i * strides[0]..].as_ptr(),
+                [strides[1], strides[2]],
+                rhs.data[i * rhs.strides[0]..].as_ptr(),
+                [rhs.strides[2], rhs.strides[1]],
+                grad_lhs[i * lhs.strides[0]..].as_mut_ptr(),
+                [lhs.strides[1], lhs.strides[2]],
+            );
+            Self::matmul(
+                (k, m, n),
+                lhs.data[i * lhs.strides[0]..].as_ptr(),
+                [lhs.strides[2], lhs.strides[1]],
+                grad_out[i * strides[0]..].as_ptr(),
+                [strides[1], strides[2]],
+                grad_rhs[i * rhs.strides[0]..].as_mut_ptr(),
+                [rhs.strides[1], rhs.strides[2]],
+            );
+        }
+        Ok(())
+    }
+}
+
+impl<E: Dtype> super::DynamicMatMatBatch3Kernel1<E> for Cpu
+where
+    Self: MatMulImpl<E>,
+{
+    fn forward<S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(usize, S1, usize), E, Self>,
+        rhs: &Tensor<(usize, usize, S2), E, Self>,
+    ) -> Result<Tensor<(usize, S1, S2), E, Self>, Self::Err> {
+        let (b, m, k) = lhs.shape;
+        let n = rhs.shape.2;
+        let mut out = self.try_zeros_like(&(b, m, n))?;
+        let ap = lhs.data.as_ref();
+        let bp = rhs.data.as_ref();
+        let cp = Arc::get_mut(&mut out.data).unwrap();
+        for i in 0..b.size() {
+            Self::matmul(
+                (m, k, n),
+                ap[i * lhs.strides[0]..].as_ptr(),
+                [lhs.strides[1], lhs.strides[2]],
+                bp[i * rhs.strides[0]..].as_ptr(),
+                [rhs.strides[1], rhs.strides[2]],
+                cp[i * out.strides[0]..].as_mut_ptr(),
+                [out.strides[1], out.strides[2]],
+            )
+        }
+        Ok(out)
+    }
+    fn backward<S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(usize, S1, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(usize, usize, S2), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err> {
+        let (b, m, k) = lhs.shape;
+        let n = rhs.shape.2;
+        let strides = (b, m, n).strides();
+        for i in 0..b.size() {
+            Self::matmul(
+                (m, n, k),
+                grad_out[i * strides[0]..].as_ptr(),
+                [strides[1], strides[2]],
+                rhs.data[i * rhs.strides[0]..].as_ptr(),
+                [rhs.strides[2], rhs.strides[1]],
+                grad_lhs[i * lhs.strides[0]..].as_mut_ptr(),
+                [lhs.strides[1], lhs.strides[2]],
+            );
+            Self::matmul(
+                (k, m, n),
+                lhs.data[i * lhs.strides[0]..].as_ptr(),
+                [lhs.strides[2], lhs.strides[1]],
+                grad_out[i * strides[0]..].as_ptr(),
+                [strides[1], strides[2]],
+                grad_rhs[i * rhs.strides[0]..].as_mut_ptr(),
+                [rhs.strides[1], rhs.strides[2]],
+            );
+        }
+        Ok(())
+    }
+}
+
+impl<E: Dtype> super::StaticMatMatBatch4Kernel<E> for Cpu
+where
+    Self: MatMulImpl<E>,
+{
+    fn forward<B: Dim, S: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(B, S, M, LeftK), E, Self>,
+        rhs: &Tensor<(B, S, RightK, N), E, Self>,
     ) -> Result<Tensor<(B, S, M, N), E, Self>, Self::Err> {
         let (b, s, m, k) = lhs.shape;
         let n = rhs.shape.3;
@@ -479,11 +609,141 @@
         }
         Ok(out)
     }
-    fn backward<B: Dim, S: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, S: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(B, S, M, LeftK), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, S, RightK, N), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err> {
+        let (b, s, m, k) = lhs.shape;
+        let n = rhs.shape.3;
+        let strides = (b, s, m, n).strides();
+        for i in 0..b.size() {
+            for j in 0..s.size() {
+                Self::matmul(
+                    (m, n, k),
+                    grad_out[i * strides[0] + j * strides[1]..].as_ptr(),
+                    [strides[2], strides[3]],
+                    rhs.data[i * rhs.strides[0] + j * rhs.strides[1]..].as_ptr(),
+                    [rhs.strides[3], rhs.strides[2]],
+                    grad_lhs[i * lhs.strides[0] + j * lhs.strides[1]..].as_mut_ptr(),
+                    [lhs.strides[2], lhs.strides[3]],
+                );
+                Self::matmul(
+                    (k, m, n),
+                    lhs.data[i * lhs.strides[0] + j * lhs.strides[1]..].as_ptr(),
+                    [lhs.strides[3], lhs.strides[2]],
+                    grad_out[i * strides[0] + j * strides[1]..].as_ptr(),
+                    [strides[2], strides[3]],
+                    grad_rhs[i * rhs.strides[0] + j * rhs.strides[1]..].as_mut_ptr(),
+                    [rhs.strides[2], rhs.strides[3]],
+                );
+            }
+        }
+        Ok(())
+    }
+}
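Every dynamic batch kernel here has the same skeleton: one gemm (`Self::matmul`) call per batch entry, with the entry located by multiplying the loop index by the batch stride, and the backward pass reusing the same call with a swapped stride pair to express the transposed operand. A minimal sketch of the forward loop in plain Rust, assuming row-major contiguous buffers and a hypothetical `batched_matmul` name (not dfdx code):

fn batched_matmul(
    b: usize, m: usize, k: usize, n: usize,
    lhs: &[f32],     // laid out as [b, m, k]
    rhs: &[f32],     // laid out as [b, k, n]
    out: &mut [f32], // laid out as [b, m, n]
) {
    let (ls, rs, os) = (m * k, k * n, m * n); // batch strides
    for i in 0..b {
        // Offset each operand by index * batch stride, then do one matmul.
        let a = &lhs[i * ls..(i + 1) * ls];
        let x = &rhs[i * rs..(i + 1) * rs];
        let c = &mut out[i * os..(i + 1) * os];
        for r in 0..m {
            for col in 0..n {
                let mut acc = 0.0;
                for kk in 0..k {
                    acc += a[r * k + kk] * x[kk * n + col];
                }
                c[r * n + col] = acc;
            }
        }
    }
}

fn main() {
    // Two 1x2 @ 2x1 products in one call.
    let lhs = [1.0, 2.0, 3.0, 4.0];
    let rhs = [10.0, 20.0, 10.0, 20.0];
    let mut out = [0.0; 2];
    batched_matmul(2, 1, 2, 1, &lhs, &rhs, &mut out);
    assert_eq!(out, [50.0, 110.0]);
}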
+
+impl<E: Dtype> super::DynamicMatMatBatch4Kernel<E> for Cpu
+where
+    Self: MatMulImpl<E>,
+{
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, usize), E, Self>,
+        rhs: &Tensor<(B, usize, usize, S2), E, Self>,
+    ) -> Result<Tensor<(B, usize, S1, S2), E, Self>, Self::Err> {
+        let (b, s, m, k) = lhs.shape;
+        let n = rhs.shape.3;
+        let mut out = self.try_zeros_like(&(b, s, m, n))?;
+        let cp = Arc::get_mut(&mut out.data).unwrap();
+        for i in 0..b.size() {
+            for j in 0..s.size() {
+                Self::matmul(
+                    (m, k, n),
+                    lhs.data[i * lhs.strides[0] + j * lhs.strides[1]..].as_ptr(),
+                    [lhs.strides[2], lhs.strides[3]],
+                    rhs.data[i * rhs.strides[0] + j * rhs.strides[1]..].as_ptr(),
+                    [rhs.strides[2], rhs.strides[3]],
+                    cp[i * out.strides[0] + j * out.strides[1]..].as_mut_ptr(),
+                    [out.strides[2], out.strides[3]],
+                );
+            }
+        }
+        Ok(out)
+    }
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, usize, usize, S2), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err> {
+        let (b, s, m, k) = lhs.shape;
+        let n = rhs.shape.3;
+        let strides = (b, s, m, n).strides();
+        for i in 0..b.size() {
+            for j in 0..s.size() {
+                Self::matmul(
+                    (m, n, k),
+                    grad_out[i * strides[0] + j * strides[1]..].as_ptr(),
+                    [strides[2], strides[3]],
+                    rhs.data[i * rhs.strides[0] + j * rhs.strides[1]..].as_ptr(),
+                    [rhs.strides[3], rhs.strides[2]],
+                    grad_lhs[i * lhs.strides[0] + j * lhs.strides[1]..].as_mut_ptr(),
+                    [lhs.strides[2], lhs.strides[3]],
+                );
+                Self::matmul(
+                    (k, m, n),
+                    lhs.data[i * lhs.strides[0] + j * lhs.strides[1]..].as_ptr(),
+                    [lhs.strides[3], lhs.strides[2]],
+                    grad_out[i * strides[0] + j * strides[1]..].as_ptr(),
+                    [strides[2], strides[3]],
+                    grad_rhs[i * rhs.strides[0] + j * rhs.strides[1]..].as_mut_ptr(),
+                    [rhs.strides[2], rhs.strides[3]],
+                );
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<E: Dtype> super::DynamicMatMatBatch4Kernel1<E> for Cpu
+where
+    Self: MatMulImpl<E>,
+{
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, S2), E, Self>,
+        rhs: &Tensor<(B, usize, S2, usize), E, Self>,
+    ) -> Result<Tensor<(B, usize, S1, usize), E, Self>, Self::Err> {
+        let (b, s, m, k) = lhs.shape;
+        let n = rhs.shape.3;
+        let mut out = self.try_zeros_like(&(b, s, m, n))?;
+        let cp = Arc::get_mut(&mut out.data).unwrap();
+        for i in 0..b.size() {
+            for j in 0..s.size() {
+                Self::matmul(
+                    (m, k, n),
+                    lhs.data[i * lhs.strides[0] + j * lhs.strides[1]..].as_ptr(),
+                    [lhs.strides[2], lhs.strides[3]],
+                    rhs.data[i * rhs.strides[0] + j * rhs.strides[1]..].as_ptr(),
+                    [rhs.strides[2], rhs.strides[3]],
+                    cp[i * out.strides[0] + j * out.strides[1]..].as_mut_ptr(),
+                    [out.strides[2], out.strides[3]],
+                );
+            }
+        }
+        Ok(out)
+    }
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
         &self,
-        lhs: &Tensor<(B, S, M, K), E, Self>,
+        lhs: &Tensor<(B, usize, S1, S2), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(B, S, K, N), E, Self>,
+        rhs: &Tensor<(B, usize, S2, usize), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
     ) -> Result<(), Self::Err> {
diff --git a/src/tensor_ops/matmul/mod.rs b/src/tensor_ops/matmul/mod.rs
index 7ed4669f9..480ed0bfb 100644
--- a/src/tensor_ops/matmul/mod.rs
+++ b/src/tensor_ops/matmul/mod.rs
@@ -6,6 +6,7 @@ pub(super) mod cpu_kernel;
 pub(super) mod cuda_kernel;
 
 use crate::{
+    prelude::{Const, Rank1},
    shapes::{Dim, Dtype, Shape},
     tensor::{DeviceStorage, HasErr, Merge, PutTape, SplitTape, Tape, Tensor},
 };
@@ -60,13 +61,13 @@
 ///
 pub fn matmul<Lhs, Rhs>(lhs: Lhs, rhs: Rhs) -> Lhs::Output
 where
-    Lhs: TryMatMul<Rhs>,
+    Lhs: TryStaticMatMul<Rhs>,
 {
     lhs.matmul(rhs)
 }
 
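The renames below split the single `TryMatMul` entry point into a statically checked trait and two runtime-checked ones. A usage sketch, assuming this diff is applied (the shapes are illustrative):

use dfdx::prelude::*;
use dfdx::tensor_ops::{TryDynamicMatMul, TryStaticMatMul};

fn main() {
    let dev: Cpu = Default::default();

    // Static path: the inner dimension (4) is in the type, so a mismatch
    // trips MulStaticDimCheck::TYPE_CHECK at compile time.
    let a: Tensor<Rank2<3, 4>, f32, _> = dev.zeros();
    let b: Tensor<Rank2<4, 2>, f32, _> = dev.zeros();
    let _c: Tensor<Rank2<3, 2>, f32, _> = a.try_matmul(b).unwrap();

    // Dynamic path: the inner dimension is a runtime usize, so the check
    // is the assert_eq! inside MulDynamicDimCheck::assert_dim_eq.
    let k: usize = 5;
    let x: Tensor<(Const<3>, usize), f32, _> = dev.zeros_like(&(Const, k));
    let y: Tensor<(usize, Const<2>), f32, _> = dev.zeros_like(&(k, Const));
    let _z: Tensor<(Const<3>, Const<2>), f32, _> = x.try_dynamic_matmul(y).unwrap();
}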
 /// Fallible matrix multiplication. See [matmul] for examples.
-pub trait TryMatMul<Rhs>: HasErr {
+pub trait TryStaticMatMul<Rhs>: HasErr {
     type Output;
     fn matmul(self, rhs: Rhs) -> Self::Output {
         self.try_matmul(rhs).unwrap()
@@ -74,6 +75,22 @@ pub trait TryMatMul<Rhs>: HasErr {
     }
     fn try_matmul(self, rhs: Rhs) -> Result<Self::Output, Self::Err>;
 }
 
+pub trait TryDynamicMatMul<Rhs>: HasErr {
+    type Output;
+    fn matmul(self, rhs: Rhs) -> Self::Output {
+        self.try_dynamic_matmul(rhs).unwrap()
+    }
+    fn try_dynamic_matmul(self, rhs: Rhs) -> Result<Self::Output, Self::Err>;
+}
+
+pub trait TryDynamicMatMul1<Rhs>: HasErr {
+    type Output;
+    fn matmul(self, rhs: Rhs) -> Self::Output {
+        self.try_dynamic1_matmul(rhs).unwrap()
+    }
+    fn try_dynamic1_matmul(self, rhs: Rhs) -> Result<Self::Output, Self::Err>;
+}
+
 #[rustfmt::skip]
 fn try_binary_op<
     Lhs: Shape,
@@ -106,6 +123,162 @@
     Ok(out.put_tape(tape))
 }
 
+pub trait MulStaticDimCheck<Rhs> {
+    const TYPE_CHECK: ();
+    fn assert_dim_eq(&self);
+}
+
+impl<const L: usize, const R: usize> MulStaticDimCheck<Rank1<R>> for Rank1<L> {
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply vectors whose dimensions don't match."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<Rank1<R>>>::TYPE_CHECK;
+    }
+}
+
+impl<const L: usize, const R: usize, N: Dim> MulStaticDimCheck<(Const<R>, N)> for Const<L> {
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply a vector to a matrix whose row dimension does not match the dimension of the vector."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<(Const<R>, N)>>::TYPE_CHECK;
+    }
+}
+
+impl<M: Dim, const L: usize, const R: usize, N: Dim> MulStaticDimCheck<(Const<R>, N)>
+    for (M, Const<L>)
+{
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply matrices where the column dimension of the first does not match the row dimension of the second."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<(Const<R>, N)>>::TYPE_CHECK;
+    }
+}
+
+// impl<const L: usize, const R: usize> MulDimCheck<(Const<R>, usize)> for (usize, Const<L>) {
+//     const TYPE_CHECK: () = assert!(
+//         L == R,
+//         "You are trying to multiply matrices where the column dimension of the first does not match the row dimension of the second."
+//     );
+//     fn assert_dim_eq(&self) {
+//         let _ = <Self as MulDimCheck<(Const<R>, usize)>>::TYPE_CHECK;
+//     }
+// }
+
+impl<B: Dim, M: Dim, const L: usize, const R: usize, N: Dim> MulStaticDimCheck<(Const<R>, N)>
+    for (B, M, Const<L>)
+{
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply a tensor of rank 3 to a matrix where the last dimension of the first does not match the first dimension of the second."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<(Const<R>, N)>>::TYPE_CHECK;
+    }
+}
+
+impl<B: Dim, M: Dim, const L: usize, const R: usize, N: Dim> MulStaticDimCheck<(B, Const<R>, N)>
+    for (B, M, Const<L>)
+{
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply two tensors of rank 3 for Batch3Mul where the last dimension of the first does not match the second dimension of the second."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<(B, Const<R>, N)>>::TYPE_CHECK;
+    }
+}
+
+// impl<const L: usize, const R: usize> MulDimCheck<(usize, Const<R>, usize)>
+//     for (usize, usize, Const<L>)
+// {
+//     const TYPE_CHECK: () = assert!(
+//         L == R,
+//         "You are trying to multiply two tensors of rank 3 for Batch3Mul where the last dimension of the first does not match the second dimension of the second."
+//     );
+//     fn assert_dim_eq(&self) {
+//         let _ = <Self as MulDimCheck<(usize, Const<R>, usize)>>::TYPE_CHECK;
+//     }
+// }
+
+impl<B: Dim, S: Dim, M: Dim, const L: usize, const R: usize, N: Dim>
+    MulStaticDimCheck<(B, S, Const<R>, N)> for (B, S, M, Const<L>)
+{
+    const TYPE_CHECK: () = assert!(
+        L == R,
+        "You are trying to multiply two tensors of rank 4 for Batch4Mul where the last dimension of the first does not match the second to last dimension of the second."
+    );
+    fn assert_dim_eq(&self) {
+        let _ = <Self as MulStaticDimCheck<(B, S, Const<R>, N)>>::TYPE_CHECK;
+    }
+}
+
+// impl<const L: usize, const R: usize> MulDimCheck<(usize, usize, Const<R>, usize)>
+//     for (usize, usize, usize, Const<L>)
+// {
+//     const TYPE_CHECK: () = assert!(
+//         L == R,
+//         "You are trying to multiply two tensors of rank 4 for Batch4Mul where the last dimension of the first does not match the second to last dimension of the second."
+//     );
+//     fn assert_dim_eq(&self) {
+//         let _ = <Self as MulDimCheck<(usize, usize, Const<R>, usize)>>::TYPE_CHECK;
+//     }
+// }
+
+pub trait MulDynamicDimCheck<Rhs> {
+    fn assert_dim_eq(&self, rhs: &Rhs);
+}
+
+impl<M: Dim, N: Dim> MulDynamicDimCheck<(usize, N)> for (M, usize) {
+    fn assert_dim_eq(&self, rhs: &(usize, N)) {
+        assert_eq!(self.1, rhs.0);
+    }
+}
+
+// impl<B: Dim, const K: usize> MulDynamicDimCheck<(B, usize, usize, Const<K>)>
+//     for (B, usize, Const<K>, usize)
+// {
+//     fn assert_dim_eq(&self, rhs: &(B, usize, usize, Const<K>)) {
+//         assert_eq!(self.3, rhs.2);
+//     }
+// }
+
+impl<B: Dim, S1: Dim, S2: Dim> MulDynamicDimCheck<(B, usize, S2)> for (B, S1, usize) {
+    fn assert_dim_eq(&self, rhs: &(B, usize, S2)) {
+        assert_eq!(self.2, rhs.1);
+    }
+}
+
+// impl<S1: Dim, S2: Dim> MulDynamicDimCheck<(usize, usize, S2)> for (usize, S1, usize) {
+//     fn assert_dim_eq(&self, rhs: &(usize, usize, S2)) {
+//         assert_eq!(self.0, rhs.0);
+//         assert_eq!(self.2, rhs.1);
+//     }
+// }
+
+// impl<S1: Dim, S2: Dim> MulDynamicDimCheck<(usize, usize, S2)> for (usize, S1, S2) {
+//     fn assert_dim_eq(&self, rhs: &(usize, usize, S2)) {
+//         assert_eq!(self.2, rhs.1);
+//     }
+// }
+
+impl<B: Dim, S1: Dim, S2: Dim> MulDynamicDimCheck<(B, usize, usize, S2)> for (B, usize, S1, usize) {
+    fn assert_dim_eq(&self, rhs: &(B, usize, usize, S2)) {
+        assert_eq!(self.3, rhs.2);
+    }
+}
+
+// impl<B: Dim, S1: Dim, S2: Dim> MulDynamicDimCheck<(B, usize, S2, usize)> for (B, usize, S1, S2) {
+//     fn assert_dim_eq(&self, rhs: &(B, usize, S1, S2)) {
+//         assert_eq!(self.1, rhs.1);
+//     }
+// }
+
 pub trait VecVecKernel<E: Dtype>: DeviceStorage {
     fn forward<M: Dim, N: Dim>(
         &self,
@@ -124,7 +297,7 @@
 }
 
 impl<M: Dim, N: Dim, E: Dtype, D: VecVecKernel<E>, T: Tape<E, D> + Merge<R>, R: Tape<E, D>>
-    TryMatMul<Tensor<(N,), E, D, R>> for Tensor<(M,), E, D, T>
+    TryStaticMatMul<Tensor<(N,), E, D, R>> for Tensor<(M,), E, D, T>
 {
     type Output = Tensor<(M, N), E, D, T>;
     fn try_matmul(self, rhs: Tensor<(N,), E, D, R>) -> Result<Self::Output, Self::Err> {
@@ -133,54 +306,120 @@
     }
 }
 
 pub trait VecMatKernel<E: Dtype>: DeviceStorage {
-    fn forward<K: Dim, N: Dim>(
+    fn forward<LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(K,), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
-    ) -> Result<Tensor<(N,), E, Self>, Self::Err>;
+        lhs: &Tensor<(LeftK,), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
+    ) -> Result<Tensor<(N,), E, Self>, Self::Err>
+    where
+        LeftK: MulStaticDimCheck<(RightK, N)>;
 
-    fn backward<K: Dim, N: Dim>(
+    fn backward<LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(K,), E, Self>,
+        lhs: &Tensor<(LeftK,), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
-    ) -> Result<(), Self::Err>;
+    ) -> Result<(), Self::Err>
+    where
+        LeftK: MulStaticDimCheck<(RightK, N)>;
 }
 
-impl<K: Dim, N: Dim, E: Dtype, D: VecMatKernel<E>, T: Tape<E, D> + Merge<R>, R: Tape<E, D>>
-    TryMatMul<Tensor<(K, N), E, D, R>> for Tensor<(K,), E, D, T>
+impl<
+        LeftK: Dim,
+        RightK: Dim,
+        N: Dim,
+        E: Dtype,
+        D: VecMatKernel<E>,
+        T: Tape<E, D> + Merge<R>,
+        R: Tape<E, D>,
+    > TryStaticMatMul<Tensor<(RightK, N), E, D, R>> for Tensor<(LeftK,), E, D, T>
+where
+    LeftK: MulStaticDimCheck<(RightK, N)>,
 {
     type Output = Tensor<(N,), E, D, T>;
-    fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result<Self::Output, Self::Err> {
-        assert_eq!(self.shape.0, rhs.shape.0);
+    fn try_matmul(self, rhs: Tensor<(RightK, N), E, D, R>) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
         try_binary_op(self, rhs, D::forward, D::backward)
     }
 }
 
-pub trait MatMatKernel<E: Dtype>: DeviceStorage {
-    fn forward<M: Dim, K: Dim, N: Dim>(
+pub trait StaticMatMatKernel<E: Dtype>: DeviceStorage {
+    fn forward<M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(M, K), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
-    ) -> Result<Tensor<(M, N), E, Self>, Self::Err>;
+        lhs: &Tensor<(M, LeftK), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
+    ) -> Result<Tensor<(M, N), E, Self>, Self::Err>
+    where
+        (M, LeftK): MulStaticDimCheck<(RightK, N)>;
 
-    fn backward<M: Dim, K: Dim, N: Dim>(
+    fn backward<M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(M, K), E, Self>,
+        lhs: &Tensor<(M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
-    ) -> Result<(), Self::Err>;
+    ) -> Result<(), Self::Err>
+    where
+        (M, LeftK): MulStaticDimCheck<(RightK, N)>;
+}
+
+impl<M: Dim, LeftK: Dim, RightK: Dim, N: Dim, E: Dtype, D: StaticMatMatKernel<E>, T, R>
+    TryStaticMatMul<Tensor<(RightK, N), E, D, R>> for Tensor<(M, LeftK), E, D, T>
+where
+    T: Tape<E, D> + Merge<R>,
+    R: Tape<E, D>,
+    (M, LeftK): MulStaticDimCheck<(RightK, N)>,
+{
+    type Output = Tensor<(M, N), E, D, T>;
+    /// ```compile_fail
+    /// # use dfdx::prelude::*;
+    /// # let dev: Cpu = Default::default();
+    /// let x: Tensor<Rank2<3, 2>, f32, _> = dev.zeros();
+    /// let y: Tensor<Rank2<3, 4>, f32, _> = dev.zeros();
+    /// let _: Tensor<Rank2<3, 4>, f32, _> = x.try_matmul(y);
+    /// ```
+    fn try_matmul(self, rhs: Tensor<(RightK, N), E, D, R>) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.1.size(), rhs.shape.0.size());
+        self.shape.assert_dim_eq();
+        // println!(
+        //     "Left {:?} Right {:?}",
+        //     self.shape.1.size(),
+        //     rhs.shape.0.size()
+        // );
+        try_binary_op(self, rhs, D::forward, D::backward)
+    }
 }
 
-impl<M: Dim, K: Dim, N: Dim, E: Dtype, D: MatMatKernel<E>, T, R> TryMatMul<Tensor<(K, N), E, D, R>>
-    for Tensor<(M, K), E, D, T>
+pub trait DynamicMatMatKernel<E: Dtype>: DeviceStorage {
+    fn forward<M: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(M, usize), E, Self>,
+        rhs: &Tensor<(usize, N), E, Self>,
+    ) -> Result<Tensor<(M, N), E, Self>, Self::Err>
+    where
+        (M, usize): MulDynamicDimCheck<(usize, N)>;
+
+    fn backward<M: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(M, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(usize, N), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err>
+    where
+        (M, usize): MulDynamicDimCheck<(usize, N)>;
+}
+
+impl<M: Dim, N: Dim, E: Dtype, D: DynamicMatMatKernel<E>, T, R>
+    TryDynamicMatMul<Tensor<(usize, N), E, D, R>> for Tensor<(M, usize), E, D, T>
 where
     T: Tape<E, D> + Merge<R>,
     R: Tape<E, D>,
+    (M, usize): MulDynamicDimCheck<(usize, N)>,
 {
     type Output = Tensor<(M, N), E, D, T>;
     /// ```compile_fail
@@ -190,34 +429,48 @@
     /// # use dfdx::prelude::*;
     /// # let dev: Cpu = Default::default();
     /// let x: Tensor<Rank2<3, 2>, f32, _> = dev.zeros();
     /// let y: Tensor<Rank2<3, 4>, f32, _> = dev.zeros();
     /// let _: Tensor<Rank2<3, 4>, f32, _> = x.try_matmul(y);
     /// ```
-    fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result<Self::Output, Self::Err> {
-        assert_eq!(self.shape.1, rhs.shape.0);
+    fn try_dynamic_matmul(
+        self,
+        rhs: Tensor<(usize, N), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.1.size(), rhs.shape.0.size());
+        self.shape.assert_dim_eq(&rhs.shape);
+        // println!(
+        //     "Left {:?} Right {:?}",
+        //     self.shape.1.size(),
+        //     rhs.shape.0.size()
+        // );
         try_binary_op(self, rhs, D::forward, D::backward)
     }
 }
 
 pub trait MatMatBrKernel<E: Dtype>: DeviceStorage {
-    fn forward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn forward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
-        rhs: &Tensor<(K, N), E, Self>,
-    ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err>;
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
+    ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err>
+    where
+        (B, M, LeftK): MulStaticDimCheck<(RightK, N)>;
 
-    fn backward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(K, N), E, Self>,
+        rhs: &Tensor<(RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
-    ) -> Result<(), Self::Err>;
+    ) -> Result<(), Self::Err>
+    where
+        (B, M, LeftK): MulStaticDimCheck<(RightK, N)>;
 }
 
-impl<B: Dim, M: Dim, K: Dim, N: Dim, E: Dtype, D: MatMatBrKernel<E>, T, R>
-    TryMatMul<Tensor<(K, N), E, D, R>> for Tensor<(B, M, K), E, D, T>
+impl<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim, E: Dtype, D: MatMatBrKernel<E>, T, R>
+    TryStaticMatMul<Tensor<(RightK, N), E, D, R>> for Tensor<(B, M, LeftK), E, D, T>
 where
     T: Tape<E, D> + Merge<R>,
     R: Tape<E, D>,
+    (B, M, LeftK):
+        MulStaticDimCheck<(RightK, N)>,
 {
     type Output = Tensor<(B, M, N), E, D, T>;
     /// ```compile_fail
@@ -227,35 +480,41 @@
     /// # use dfdx::prelude::*;
     /// # let dev: Cpu = Default::default();
     /// let x: Tensor<Rank3<1, 3, 2>, f32, _> = dev.zeros();
     /// let y: Tensor<Rank2<3, 4>, f32, _> = dev.zeros();
     /// let _: Tensor<Rank3<1, 3, 4>, f32, _> = x.try_matmul(y);
     /// ```
-    fn try_matmul(self, rhs: Tensor<(K, N), E, D, R>) -> Result<Self::Output, Self::Err> {
-        assert_eq!(self.shape.2, rhs.shape.0);
+    fn try_matmul(self, rhs: Tensor<(RightK, N), E, D, R>) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.2, rhs.shape.0);
+        self.shape.assert_dim_eq();
         try_binary_op(self, rhs, D::forward, D::backward)
     }
 }
 
-pub trait MatMatBatch3Kernel<E: Dtype>: DeviceStorage {
-    fn forward<B: Dim, M: Dim, K: Dim, N: Dim>(
+pub trait StaticMatMatBatch3Kernel<E: Dtype>: DeviceStorage {
+    fn forward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
-        rhs: &Tensor<(B, K, N), E, Self>,
-    ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err>;
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
+        rhs: &Tensor<(B, RightK, N), E, Self>,
+    ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err>
+    where
+        (B, M, LeftK): MulStaticDimCheck<(B, RightK, N)>;
 
-    fn backward<B: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
         &self,
-        lhs: &Tensor<(B, M, K), E, Self>,
+        lhs: &Tensor<(B, M, LeftK), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(B, K, N), E, Self>,
+        rhs: &Tensor<(B, RightK, N), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
-    ) -> Result<(), Self::Err>;
+    ) -> Result<(), Self::Err>
+    where
+        (B, M, LeftK): MulStaticDimCheck<(B, RightK, N)>;
 }
 
-impl<B: Dim, M: Dim, K: Dim, N: Dim, E: Dtype, D, T, R> TryMatMul<Tensor<(B, K, N), E, D, R>>
-    for Tensor<(B, M, K), E, D, T>
+impl<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim, E: Dtype, D, T, R>
+    TryStaticMatMul<Tensor<(B, RightK, N), E, D, R>> for Tensor<(B, M, LeftK), E, D, T>
 where
-    D: MatMatBatch3Kernel<E>,
+    D: StaticMatMatBatch3Kernel<E>,
     T: Tape<E, D> + Merge<R>,
     R: Tape<E, D>,
+    (B, M, LeftK): MulStaticDimCheck<(B, RightK, N)>,
 {
     type Output = Tensor<(B, M, N), E, D, T>;
     /// ```compile_fail
@@ -265,36 +524,186 @@
     /// # use dfdx::prelude::*;
     /// # let dev: Cpu = Default::default();
     /// let x: Tensor<Rank3<1, 3, 2>, f32, _> = dev.zeros();
     /// let y: Tensor<Rank3<1, 3, 4>, f32, _> = dev.zeros();
     /// let _: Tensor<Rank3<1, 3, 4>, f32, _> = x.try_matmul(y);
     /// ```
-    fn try_matmul(self, rhs: Tensor<(B, K, N), E, D, R>) -> Result<Self::Output, Self::Err> {
-        assert_eq!(self.shape.0, rhs.shape.0);
-        assert_eq!(self.shape.2, rhs.shape.1);
+    fn try_matmul(self, rhs: Tensor<(B, RightK, N), E, D, R>) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.2, rhs.shape.1);
+        self.shape.assert_dim_eq();
         try_binary_op(self, rhs, D::forward, D::backward)
     }
 }
 
-pub trait MatMatBatch4Kernel<E: Dtype>: DeviceStorage {
-    fn forward<B: Dim, S: Dim, M: Dim, K: Dim, N: Dim>(
+pub trait DynamicMatMatBatch3Kernel<E: Dtype>: DeviceStorage {
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
         &self,
-        lhs: &Tensor<(B, S, M, K), E, Self>,
-        rhs: &Tensor<(B, S, K, N), E, Self>,
-    ) -> Result<Tensor<(B, S, M, N), E, Self>, Self::Err>;
+        lhs: &Tensor<(B, S1, usize), E, Self>,
+        rhs: &Tensor<(B, usize, S2), E, Self>,
+    ) -> Result<Tensor<(B, S1, S2), E, Self>, Self::Err>
+    where
+        (B, S1, usize): MulDynamicDimCheck<(B, usize, S2)>;
 
-    fn backward<B: Dim, S: Dim, M: Dim, K: Dim, N: Dim>(
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
         &self,
-        lhs: &Tensor<(B, S, M, K), E, Self>,
+        lhs: &Tensor<(B, S1, usize), E, Self>,
         grad_lhs: &mut Self::Vec,
-        rhs: &Tensor<(B, S, K, N), E, Self>,
+        rhs: &Tensor<(B, usize, S2), E, Self>,
         grad_rhs: &mut Self::Vec,
         grad_out: &Self::Vec,
-    ) -> Result<(), Self::Err>;
+    ) -> Result<(), Self::Err>
+    where
+        (B, S1, usize): MulDynamicDimCheck<(B, usize, S2)>;
+}
+
+impl<B: Dim, S1: Dim, S2: Dim, E: Dtype, D, T, R> TryDynamicMatMul<Tensor<(B, usize, S2), E, D, R>>
+    for Tensor<(B, S1, usize), E, D, T>
+where
+    D: DynamicMatMatBatch3Kernel<E>,
+    T: Tape<E, D> + Merge<R>,
+    R: Tape<E, D>,
+    (B, S1, usize): MulDynamicDimCheck<(B, usize, S2)>,
+{
+    type Output = Tensor<(B, S1, S2), E, D, T>;
+    /// ```compile_fail
+    /// # use dfdx::prelude::*;
+    /// # let dev: Cpu = Default::default();
+    /// let x: Tensor<Rank3<1, 3, 2>, f32, _> = dev.zeros();
+    /// let y: Tensor<Rank3<1, 3, 4>, f32, _> = dev.zeros();
+    /// let _: Tensor<Rank3<1, 3, 4>, f32, _> = x.try_matmul(y);
+    /// ```
+    fn try_dynamic_matmul(
+        self,
+        rhs: Tensor<(B, usize, S2), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.2, rhs.shape.1);
+        self.shape.assert_dim_eq(&rhs.shape);
+        try_binary_op(self, rhs, D::forward, D::backward)
+    }
+}
+
+pub trait DynamicMatMatBatch3Kernel1<E: Dtype>: DeviceStorage {
+    fn forward<S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(usize, S1, usize), E, Self>,
+        rhs: &Tensor<(usize, usize, S2), E, Self>,
+    ) -> Result<Tensor<(usize, S1, S2), E, Self>, Self::Err>
+    where
+        (usize, S1, usize): MulDynamicDimCheck<(usize, usize, S2)>;
+
+    fn backward<S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(usize, S1, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(usize, usize, S2), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err>
+    where
+        (usize, S1, usize): MulDynamicDimCheck<(usize, usize, S2)>;
+}
+
+impl<S1: Dim, S2: Dim, E: Dtype, D, T, R> TryDynamicMatMul1<Tensor<(usize, usize, S2), E, D, R>>
+    for Tensor<(usize, S1, usize), E, D, T>
+where
+    D: DynamicMatMatBatch3Kernel1<E>,
+    T: Tape<E, D> + Merge<R>,
+    R: Tape<E, D>,
+    (usize, S1, usize): MulDynamicDimCheck<(usize, usize, S2)>,
+{
+    type Output = Tensor<(usize, S1, S2), E, D, T>;
+    /// ```compile_fail
+    /// # use dfdx::prelude::*;
+    /// # let dev: Cpu = Default::default();
+    /// let x: Tensor<Rank3<1, 3, 2>, f32, _> = dev.zeros();
+    /// let y: Tensor<Rank3<1, 3, 4>, f32, _> = dev.zeros();
+    /// let _: Tensor<Rank3<1, 3, 4>, f32, _> = x.try_matmul(y);
+    /// ```
+    fn try_dynamic1_matmul(
+        self,
+        rhs: Tensor<(usize, usize, S2), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.2, rhs.shape.1);
+        self.shape.assert_dim_eq(&rhs.shape);
+        try_binary_op(self, rhs, D::forward, D::backward)
+    }
+}
+
+// pub trait MatMatBatch3Kernel<E: Dtype>: DeviceStorage {
+//     fn forward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+//         &self,
+//         lhs: &Tensor<(B, M, LeftK), E, Self>,
+//         rhs: &Tensor<(B, RightK, N), E, Self>,
+//     ) -> Result<Tensor<(B, M, N), E, Self>, Self::Err>;
+//     // where
+//     //     (B, M, LeftK): MulStaticDimCheck<(B, RightK, N)>;

+//     fn backward<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+//         &self,
+//         lhs: &Tensor<(B, M, LeftK), E, Self>,
+//         grad_lhs: &mut Self::Vec,
+//         rhs: &Tensor<(B, RightK, N), E, Self>,
+//         grad_rhs: &mut Self::Vec,
+//         grad_out: &Self::Vec,
+//     ) -> Result<(), Self::Err>;
+//     // where
+//     //     (B, M, LeftK): MulStaticDimCheck<(B, RightK, N)>;
+// }

+// impl<B: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim, E: Dtype, D, T, R>
+//     TryDynamicMatMul<Tensor<(B, RightK, N), E, D, R>> for Tensor<(B, M, LeftK), E, D, T>
+// where
+//     D: MatMatBatch3Kernel<E>,
+//     T: Tape<E, D> + Merge<R>,
+//     R: Tape<E, D>,
+//     (B, M, LeftK): MulDynamicDimCheck<(B, RightK, N)>,
+// {
+//     type Output = Tensor<(B, M, N), E, D, T>;
+//     /// ```compile_fail
+//     /// # use dfdx::prelude::*;
+//     /// # let dev: Cpu = Default::default();
+//     /// let x: Tensor<Rank3<1, 3, 2>, f32, _> = dev.zeros();
+//     /// let y: Tensor<Rank3<1, 3, 4>, f32, _> = dev.zeros();
+//     /// let _: Tensor<Rank3<1, 3, 4>, f32, _> = x.try_matmul(y);
+//     /// ```
+//     fn try_dynamic_matmul(
+//         self,
+//         rhs: Tensor<(B, RightK, N), E, D, R>,
+//     ) -> Result<Self::Output, Self::Err> {
+//         // assert_eq!(self.shape.0, rhs.shape.0);
+//         // assert_eq!(self.shape.2, rhs.shape.1);
+//         // self.shape.assert_dim_eq();
+//         try_binary_op(self, rhs, D::forward, D::backward)
+//     }
+// }

+pub trait StaticMatMatBatch4Kernel<E: Dtype>: DeviceStorage {
+    fn forward<B: Dim, S: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(B, S, M, LeftK), E, Self>,
+        rhs: &Tensor<(B, S, RightK, N), E, Self>,
+    ) -> Result<Tensor<(B, S, M, N), E, Self>, Self::Err>
+    where
+        (B, S, M, LeftK): MulStaticDimCheck<(B, S, RightK, N)>;
+
+    fn backward<B: Dim, S: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim>(
+        &self,
+        lhs: &Tensor<(B, S, M, LeftK), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, S, RightK, N), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err>
+    where
+        (B, S, M, LeftK): MulStaticDimCheck<(B, S, RightK, N)>;
 }
 
-impl<B: Dim, S: Dim, M: Dim, K: Dim, N: Dim, E: Dtype, D, T, R>
-    TryMatMul<Tensor<(B, S, K, N), E, D, R>> for Tensor<(B, S, M, K), E, D, T>
+impl<B: Dim, S: Dim, M: Dim, LeftK: Dim, RightK: Dim, N: Dim, E: Dtype, D, T, R>
+    TryStaticMatMul<Tensor<(B, S, RightK, N), E, D, R>> for Tensor<(B, S, M, LeftK), E, D, T>
 where
-    D: MatMatBatch4Kernel<E>,
+    D: StaticMatMatBatch4Kernel<E>,
     T: Tape<E, D> + Merge<R>,
     R: Tape<E, D>,
+    (B, S, M, LeftK): MulStaticDimCheck<(B, S, RightK, N)>,
 {
     type Output = Tensor<(B, S, M, N), E, D, T>;
     /// ```compile_fail
@@ -304,10 +713,114 @@
     /// # use dfdx::prelude::*;
     /// # let dev: Cpu = Default::default();
     /// let x: Tensor<Rank4<1, 5, 3, 2>, f32, _> = dev.zeros();
     /// let y: Tensor<Rank4<1, 5, 3, 4>, f32, _> = dev.zeros();
     /// let _: Tensor<Rank4<1, 5, 3, 4>, f32, _> = x.try_matmul(y);
     /// ```
-    fn try_matmul(self, rhs: Tensor<(B, S, K, N), E, D, R>) -> Result<Self::Output, Self::Err> {
-        assert_eq!(self.shape.0, rhs.shape.0);
-        assert_eq!(self.shape.1, rhs.shape.1);
-        assert_eq!(self.shape.3, rhs.shape.2);
+    fn try_matmul(
+        self,
+        rhs: Tensor<(B, S, RightK, N), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.1, rhs.shape.1);
+        // assert_eq!(self.shape.3, rhs.shape.2);
+        self.shape.assert_dim_eq();
+        try_binary_op(self, rhs, D::forward, D::backward)
+    }
+}
+
+pub trait DynamicMatMatBatch4Kernel<E: Dtype>: DeviceStorage {
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, usize), E, Self>,
+        rhs: &Tensor<(B, usize, usize, S2), E, Self>,
+    ) -> Result<Tensor<(B, usize, S1, S2), E, Self>, Self::Err>;
+    // where
+    //     (usize, usize, usize, LeftK): MulDynamicDimCheck<(usize, usize, RightK, usize)>;
+
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, usize), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, usize, usize, S2), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err>;
+    // where
+    //     (usize, usize, usize, LeftK): MulDynamicDimCheck<(usize, usize, RightK, usize)>;
+}
+
+impl<B: Dim, S1: Dim, S2: Dim, E: Dtype, D, T, R>
+    TryDynamicMatMul<Tensor<(B, usize, usize, S2), E, D, R>>
+    for Tensor<(B, usize, S1, usize), E, D, T>
+where
+    D: DynamicMatMatBatch4Kernel<E>,
+    T: Tape<E, D> + Merge<R>,
+    R: Tape<E, D>,
+    // (usize, usize, usize, LeftK): MulDynamicDimCheck<(usize, usize, RightK, usize)>,
+{
+    type Output = Tensor<(B, usize, S1, S2), E, D, T>;
+    /// ```compile_fail
+    /// # use dfdx::prelude::*;
+    /// # let dev: Cpu = Default::default();
+    /// let x: Tensor<Rank4<1, 5, 3, 2>, f32, _> = dev.zeros();
+    /// let y: Tensor<Rank4<1, 5, 3, 4>, f32, _> = dev.zeros();
+    /// let _: Tensor<Rank4<1, 5, 3, 4>, f32, _> = x.try_matmul(y);
+    /// ```
+    fn try_dynamic_matmul(
+        self,
+        rhs: Tensor<(B, usize, usize, S2), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.1, rhs.shape.1);
+        // assert_eq!(self.shape.3, rhs.shape.2);
+        self.shape.assert_dim_eq(&rhs.shape);
+        try_binary_op(self, rhs, D::forward, D::backward)
+    }
+}
+
+pub trait DynamicMatMatBatch4Kernel1<E: Dtype>: DeviceStorage {
+    fn forward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, S2), E, Self>,
+        rhs: &Tensor<(B, usize, S2, usize), E, Self>,
+    ) -> Result<Tensor<(B, usize, S1, usize), E, Self>, Self::Err>;
+    // where
+    //     (usize, usize, usize, LeftK): MulDynamicDimCheck<(usize, usize, RightK, usize)>;
+
+    fn backward<B: Dim, S1: Dim, S2: Dim>(
+        &self,
+        lhs: &Tensor<(B, usize, S1, S2), E, Self>,
+        grad_lhs: &mut Self::Vec,
+        rhs: &Tensor<(B, usize, S2, usize), E, Self>,
+        grad_rhs: &mut Self::Vec,
+        grad_out: &Self::Vec,
+    ) -> Result<(), Self::Err>;
+    // where
+    //     (usize, usize, usize, LeftK): MulDynamicDimCheck<(usize, usize, RightK, usize)>;
+}
+
+impl<B: Dim, S1: Dim, S2: Dim, E: Dtype, D, T, R>
+    TryDynamicMatMul1<Tensor<(B, usize, S2, usize), E, D, R>>
+    for Tensor<(B, usize, S1, S2), E, D, T>
+where
+    D: DynamicMatMatBatch4Kernel1<E>,
+    T: Tape<E, D> + Merge<R>,
+    R: Tape<E, D>,
+    (B, usize, S1, S2): MulDynamicDimCheck<(B, usize, S2, usize)>,
+{
+    type Output = Tensor<(B, usize, S1, usize), E, D, T>;
+    /// ```compile_fail
+    /// # use dfdx::prelude::*;
+    /// # let dev: Cpu = Default::default();
+    /// let x: Tensor<Rank4<1, 5, 3, 2>, f32, _> = dev.zeros();
+    /// let y: Tensor<Rank4<1, 5, 3, 4>, f32, _> = dev.zeros();
+    /// let _: Tensor<Rank4<1, 5, 3, 4>, f32, _> = x.try_matmul(y);
+    /// ```
+    fn try_dynamic1_matmul(
+        self,
+        rhs: Tensor<(B, usize, S2, usize), E, D, R>,
+    ) -> Result<Self::Output, Self::Err> {
+        // assert_eq!(self.shape.0, rhs.shape.0);
+        // assert_eq!(self.shape.1, rhs.shape.1);
+        // assert_eq!(self.shape.3, rhs.shape.2);
+        self.shape.assert_dim_eq(&rhs.shape);
         try_binary_op(self, rhs, D::forward, D::backward)
     }
 }
@@ -350,14 +863,14 @@ mod tests {
 
         {
             let a: Tensor<Rank1<3>, TestDtype, _> = dev.zeros();
             let b: Tensor<Rank2<3, 2>, TestDtype, _> = dev.zeros();
-            let _: Tensor<Rank1<2>, TestDtype, _> = a.matmul(b);
+            let _: Tensor<Rank1<2>, TestDtype, _> = matmul::TryStaticMatMul::matmul(a, b);
         }
 
-        {
-            let a: Tensor<Rank2<5, 3>, TestDtype, _> = dev.zeros();
-            let b: Tensor<Rank2<3, 2>, TestDtype, _> = dev.zeros();
-            let _: Tensor<Rank2<5, 2>, TestDtype, _> = a.matmul(b);
-        }
+        // {
+        //     let a: Tensor<Rank2<5, 3>, TestDtype, _> = dev.zeros();
+        //     let b: Tensor<Rank2<3, 2>, TestDtype, _> = dev.zeros();
+        //     let _: Tensor<Rank2<5, 2>, TestDtype, _> = matmul::TryStaticMatMul::matmul(a, b);
+        // }
 
         {
             let a: Tensor<Rank3<1, 5, 3>, TestDtype, _> = dev.zeros();
diff --git a/src/tensor_ops/mod.rs b/src/tensor_ops/mod.rs
index 8b99e110a..fbb606f2d 100644
--- a/src/tensor_ops/mod.rs
+++ b/src/tensor_ops/mod.rs
@@ -221,7 +221,7 @@ pub use huber_error::huber_error;
 pub use ln::ln;
 pub use log_softmax::log_softmax;
 pub use logsumexp_to::LogSumExpTo;
-pub use matmul::{matmul, TryMatMul};
+pub use matmul::{matmul, TryDynamicMatMul, TryDynamicMatMul1, TryStaticMatMul};
 pub use max_to::MaxTo;
 pub use maximum::maximum;
 pub use mean_to::MeanTo;
diff --git a/src/tensor_ops/utilities/device.rs b/src/tensor_ops/utilities/device.rs
index 09ed30dce..9f38c8e7e 100644
--- a/src/tensor_ops/utilities/device.rs
+++ b/src/tensor_ops/utilities/device.rs
@@ -42,11 +42,14 @@ pub trait Device<E: Dtype>:
 
     // matmuls
     + super::super::matmul::VecMatKernel<E>
-    + super::super::matmul::MatMatKernel<E>
+    + super::super::matmul::StaticMatMatKernel<E>
     + super::super::matmul::VecVecKernel<E>
     + super::super::matmul::MatMatBrKernel<E>
-    + super::super::matmul::MatMatBatch3Kernel<E>
-    + super::super::matmul::MatMatBatch4Kernel<E>
+    + super::super::matmul::StaticMatMatBatch3Kernel<E>
+    + super::super::matmul::DynamicMatMatBatch3Kernel<E>
+    + super::super::matmul::StaticMatMatBatch4Kernel<E>
+    + super::super::matmul::DynamicMatMatBatch4Kernel<E>
+    + super::super::matmul::DynamicMatMatBatch4Kernel1<E>
 
     // scalar arithmetic
     + UnaryKernel<ops::ScalarAddKernelOp<E>, E>
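The device.rs change simply grows the `Device` umbrella trait by the new kernel supertraits, so every backend must provide them and generic code can bound on the one name. For reference, a minimal, self-contained sketch of that capability-bundle pattern, with illustrative names rather than dfdx's:

trait GemmKernel {
    fn gemm(&self);
}
trait SoftmaxKernel {
    fn softmax(&self);
}

// The umbrella trait adds no methods of its own; it only bundles capabilities.
trait Device: GemmKernel + SoftmaxKernel {}

// Blanket impl: anything providing every kernel is automatically a Device.
impl<D: GemmKernel + SoftmaxKernel> Device for D {}

struct Cpu;
impl GemmKernel for Cpu {
    fn gemm(&self) {}
}
impl SoftmaxKernel for Cpu {
    fn softmax(&self) {}
}

// Callers bound on the single umbrella name instead of listing every kernel.
fn run<D: Device>(dev: &D) {
    dev.gemm();
    dev.softmax();
}

fn main() {
    run(&Cpu);
}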