Commit cd53802

Check N & K % 32 == 0; update UT
1 parent 953ac13 commit cd53802

File tree

3 files changed (+59, -7 lines)


test/quantization/test_dynamic_float8_linear_cpu.py

Lines changed: 53 additions & 4 deletions
@@ -36,9 +36,8 @@ def __init__(self, K=64, N=32, bias=False):
 
     def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
         return (
-            torch.randn(
-                batch_size, self.linear1.in_features, dtype=dtype, device=device
-            ),
+            torch.rand(batch_size, self.linear1.in_features, dtype=dtype, device=device)
+            * 0.1,
         )
 
     def forward(self, x):
@@ -88,7 +87,7 @@ def test_dynamic_float8_linear_cpu(self, dtype, x_dim, bias, bs):
         )
         torch._dynamo.reset()  # may segfault without this
         y2 = torch.compile(m2, fullgraph=True, dynamic=True)(*example_inputs)
-        atol, rtol = 1e-6, 1e-6
+        atol, rtol = 1e-4, 1e-6
         if dtype == torch.bfloat16:
             atol, rtol = 1.6e-2, 3e-3
         elif dtype == torch.half:
@@ -102,6 +101,56 @@ def test_dynamic_float8_linear_cpu(self, dtype, x_dim, bias, bs):
         assert torch.allclose(dqw1, dqw1_ref)
         assert torch.allclose(dqw2, dqw2_ref)
 
+    @unittest.skipIf(
+        "CPU" not in torch._C._dispatch_dump("torchao::float8_linear_cpu"),
+        reason="cpp kernels not built",
+    )
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
+    @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
+    @common_utils.parametrize("x_dim", [2, 3])
+    @common_utils.parametrize("bias", [True, False])
+    def test_dynamic_float8_linear_ref_cpu(self, dtype, x_dim, bias):
+        device = "cpu"
+        # the shape is not supported by cpp kernel, so the ref path will be used.
+        m = ToyLinearModel(120, 120, bias=bias).eval().to(dtype).to(device)
+        m2 = copy.deepcopy(m)
+        bs = 4
+        example_inputs = m.example_inputs(batch_size=bs, dtype=dtype, device=device)
+        if x_dim == 3:
+            example_inputs = (example_inputs[0].unsqueeze(0),)
+
+        with torch.no_grad():
+            quantize_(
+                m,
+                Float8DynamicActivationFloat8WeightConfig(
+                    granularity=PerRow(),
+                    layout=Float8DynamicActFloat8WeightCPULayout(),
+                ),
+            )
+            y, code = torch._inductor.utils.run_and_get_code(
+                torch.compile(m, fullgraph=True, dynamic=True),
+                *example_inputs,
+            )
+            # ensure the op is not in the code
+            assert "torch.ops.torchao.float8_linear_cpu.default" not in code[0]
+            quantize_(
+                m2,
+                Float8DynamicActivationFloat8WeightConfig(
+                    granularity=PerRow(),
+                    layout=PlainLayout(),
+                ),
+            )
+            torch._dynamo.reset()  # may segfault without this
+            y2 = torch.compile(m2, fullgraph=True, dynamic=True)(*example_inputs)
+            assert torch.allclose(y, y2)
+            # Test get_plain by dequantize()
+            dqw1 = m.linear1.weight.original_weight_tensor.dequantize()
+            dqw2 = m.linear2.weight.original_weight_tensor.dequantize()
+            dqw1_ref = m2.linear1.weight.original_weight_tensor.dequantize()
+            dqw2_ref = m2.linear2.weight.original_weight_tensor.dequantize()
+            assert torch.allclose(dqw1, dqw1_ref)
+            assert torch.allclose(dqw2, dqw2_ref)
+
 
 common_utils.instantiate_parametrized_tests(TestDynamicFloat8Linear)
 
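Note: the new ref-path test deliberately uses N = K = 120, which is not a multiple of 32, so the packed CPU layout is skipped and the compiled model never calls torch.ops.torchao.float8_linear_cpu. A minimal sketch of that shape gate, assuming block_n == 32 (the value implied by the "N should be divisible by 32" check below); uses_cpp_kernel is a hypothetical helper, not part of the codebase:

    def uses_cpp_kernel(N: int, K: int, block_n: int = 32) -> bool:
        # Packed [N / block_n, K / block_k, ...] layout needs both dims to be multiples of 32.
        return N % block_n == 0 and K % block_n == 0

    assert uses_cpp_kernel(32, 64)        # ToyLinearModel() default (K=64, N=32): packed path
    assert not uses_cpp_kernel(120, 120)  # shape used by test_dynamic_float8_linear_ref_cpu: ref path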

torchao/csrc/cpu/float8_linear.cpp

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@ float8_linear_prepack_impl(
   int N = weight.size(0);
   int K = weight.size(1);
   int G = scales.size(1);
+  TORCH_CHECK(K % G == 0, "K should be divisible by num_groups");
   int group_size = K / G;
   int block_k = group_size > 128 ? 128 : group_size;
   while (K % block_k != 0) {
@@ -52,6 +53,7 @@ float8_linear_prepack_impl(
   TORCH_CHECK(block_k > 0 && block_k <= group_size,
       "Float8 linear CPU: Invalid block_k size, should be in (0, group_size]");
   constexpr int block_n = BLOCK_N;
+  TORCH_CHECK(N % block_n == 0, "N should be divisible by 32");
   int Nc = N / block_n;
   int Kc = K / block_k;
 
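For context, the two new TORCH_CHECKs sit inside the prepack routine that derives the blocking from the weight and scale shapes. A rough Python rendering of that logic, as a sketch only: BLOCK_N = 32 is inferred from the error message, and the body of the while loop (not shown in the hunk) is assumed to shrink block_k until it divides K:

    BLOCK_N = 32  # assumed value of the BLOCK_N constant

    def check_prepack_shapes(N, K, G):
        # New check: per-group scales must tile K evenly.
        assert K % G == 0, "K should be divisible by num_groups"
        group_size = K // G
        block_k = 128 if group_size > 128 else group_size
        while K % block_k != 0:  # loop body assumed: halve block_k until it divides K
            block_k //= 2
        assert 0 < block_k <= group_size, "Invalid block_k size, should be in (0, group_size]"
        # New check: N must tile into block_n columns.
        assert N % BLOCK_N == 0, "N should be divisible by 32"
        return N // BLOCK_N, K // block_k  # Nc, Kc

    check_prepack_shapes(N=32, K=64, G=1)      # per-row scales: OK
    # check_prepack_shapes(N=120, K=120, G=1)  # would fail the N % 32 check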

torchao/dtypes/floatx/dyn_float8_act_float8_wei_cpu_layout.py

Lines changed: 4 additions & 3 deletions
@@ -84,7 +84,7 @@ def __tensor_unflatten__(
             tensor_data_dict["packed_weight"],
             tensor_data_dict["scales"],
         )
-        (_layout, transposed) = tensor_attributes
+        (transposed, _layout) = tensor_attributes
         return cls(packed_weight, scales, transposed, _layout)
 
     @classmethod
@@ -103,8 +103,9 @@ def from_plain(
         scale.unsqueeze_(-1)
         scale = scale.to(torch.float)
 
+        N = data.size(0)
         K = data.size(-1)
-        if K % 32 == 0:
+        if N % 32 == 0 and K % 32 == 0:
             # Pack weight from [N, K] to [N / block_n, K / block_k, block_k, block_n].
             # Pack inner blocks [block_k, block_n] to VNNI layout if AMX is available.
             # Pack scales from [N, num_groups] to [N / block_n, num_groups, block_n].
@@ -178,7 +179,7 @@ def block_size(self):
         return (1, group_size)
 
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        if self._layout == PlainLayout:
+        if isinstance(self._layout, PlainLayout):
             # If the layout is PlainLayout, return the packed weight and scales directly
             return (
                 self.packed_weight,
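The get_plain change fixes a comparison bug rather than a cosmetic issue: self._layout == PlainLayout compares a layout instance against the class object, which is always False, so the PlainLayout branch could never be taken; isinstance is the check that was intended. A toy illustration (the class below is a stand-in, not the real torchao PlainLayout):

    class PlainLayout:  # stand-in for torchao.dtypes.PlainLayout
        pass

    layout = PlainLayout()
    print(layout == PlainLayout)            # False: instance compared with the class itself
    print(isinstance(layout, PlainLayout))  # True: matches any instance of the class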
