pytorch · jwfromm · Jan 6, 2025
diff --git a/fbgemm_gpu/experimental/gen_ai/bench/profile_grouped_gemm.py b/fbgemm_gpu/experimental/gen_ai/bench/profile_grouped_gemm.py
diff --git a/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py b/fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -469,7 +469,7 @@
    """

    def quantize_fixed_nk(self, x, w):
        group_size = len(x)
        m_values = [i.shape[0] for i in x]
        # Inputs for fixed nk mode must be contiguous, however in the benchmark
        # script they typically are not. Do a little special processing to make them
@@ -486,11 +486,6 @@
         # Apply quantization.
         xq, x_scale = quantize_fp8_row(xq)
         wq, w_scale = quantize_fp8_row(wq)
-        # View these unified tensors as lists of tensors.
-        xq = [x.squeeze() for x in xq.split(1, dim=0)]
-        wq = [w.squeeze() for w in wq.split(1, dim=0)]
-        x_scale = [xs.squeeze() for xs in x_scale.view(group_size, -1).split(1, dim=0)]
-        w_scale = [ws.squeeze() for ws in w_scale.view(group_size, -1).split(1, dim=0)]
 
         # Return processed tensors.
         return (
@@ -520,14 +515,13 @@
         m_values = None
         return xq, wq, x_scale, w_scale, m_values
 
-    def compute(self, xq, wq, x_scale, w_scale, m_values, kernel_name=None):
+    def compute(self, xq, wq, x_scale, w_scale, m_values):
         if m_values is None:
             return torch.ops.fbgemm.f8f8bf16_rowwise_grouped(
                 xq,
                 wq,
                 x_scale,
                 w_scale,
-                kernel_name=kernel_name,
             )
         else:
             return torch.ops.fbgemm.f8f8bf16_rowwise_grouped_dynamic(
@@ -536,7 +530,6 @@
                 x_scale,
                 w_scale,
                 zero_start_index_M=m_values,
-                kernel_name=kernel_name,
             )
 
     def quantize_and_compute(self, x, w):