2025-01-17 nightly release (21d1260)
pytorchbot committed Jan 17, 2025
1 parent 738ba45 commit 19abfe5
Showing 17 changed files with 454 additions and 27 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -125,7 +125,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -166,13 +166,13 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
submodules: true

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -125,7 +125,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -165,13 +165,13 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
submodules: true

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -116,7 +116,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -167,7 +167,7 @@ jobs:

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -166,7 +166,7 @@ jobs:
git config --global --add safe.directory '*'
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v4
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -130,7 +130,7 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -119,7 +119,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV release cuda

- name: Upload Built Wheel as GHA Artifact
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -151,10 +151,10 @@ jobs:

steps:
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_genai.yml
@@ -119,7 +119,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV release genai

- name: Upload Built Wheel as GHA Artifact
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_release_genai_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -151,10 +151,10 @@ jobs:

steps:
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_release_genai_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

34 changes: 34 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -716,6 +716,40 @@ def cuda(self) -> bool:
return True


@register_quantize_op
class FP8LiteGemm(QuantizeOpBase):
"""
FP8 lite matmul for memory-bound workloads.
"""

def quantize(self, x, w):
# Quantize both input tensors.
xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
return xq, wq, x_scale, w_scale

def compute(self, xq, wq, x_scale, w_scale):
return torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)

def quantize_and_compute(self, x, w):
xq, wq, x_scale, w_scale = self.quantize(x, w)
return self.compute(xq, wq, x_scale, w_scale)

@property
def name(self) -> str:
return "cuda_lite"

@property
def hip(self) -> bool:
# Need to add support for a better quantize kernel.
# May also have an issue with CUDA graphs.
return False

@property
def cuda(self) -> bool:
return True


@register_quantize_op
class TritonFP8RowwiseGemm(QuantizeOpBase):
"""
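The new FP8LiteGemm op follows the QuantizeOpBase pattern used by the other entries in this file: quantize() converts both operands to FP8 with per-tensor scales, compute() invokes the kernel, and quantize_and_compute() chains the two. A minimal usage sketch, assuming only the interface shown above (the shapes, dtypes, and no-argument constructor are illustrative, not part of this diff):

    import torch

    op = FP8LiteGemm()
    # Decode-style problem shape; illustrative only.
    x = torch.randn(16, 5120, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(1024, 5120, device="cuda", dtype=torch.bfloat16)

    # Per-tensor FP8 quantization of both operands.
    xq, wq, x_scale, w_scale = op.quantize(x, w)
    # f8f8bf16_lite consumes the combined scale and returns a bf16 result.
    y = op.compute(xq, wq, x_scale, w_scale)
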
@@ -40,13 +40,13 @@ struct IntTupleHash {
static const std::unordered_map<std::tuple<int, int, int>, RowwiseKernel, IntTupleHash> rowwise_lookup_dispatch = {
// Support for decode for [1024, 5120]
{{16, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8},
{{32, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8},
{{64, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2},
{{128, 1024, 5120},
-fp8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2},
// Support for decode for [5120, 1024]
{{16, 5120, 1024},
fp8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2},
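The rowwise_lookup_dispatch table maps an exact (M, N, K) decode shape to a pre-tuned CK kernel instance. This hunk reroutes the [1024, 5120] decode shapes: M of 16 and 32 move to the split-K variant added in this commit (the _8 suffix corresponds to KBatch = 8), while M of 64 and 128 move to an intrawave instance with a deeper K tile of 512. A hedged Python sketch of the lookup pattern (names and signatures are illustrative, not the real API):

    from typing import Callable, Dict, Tuple

    # Stand-in for a tuned kernel entry point.
    Kernel = Callable[..., object]

    def pick_kernel(table: Dict[Tuple[int, int, int], Kernel],
                    fallback: Kernel, m: int, n: int, k: int) -> Kernel:
        # Exact-shape lookup with a generic fallback, mirroring the C++
        # unordered_map keyed by a hashed (M, N, K) tuple.
        return table.get((m, n, k), fallback)
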
@@ -0,0 +1,38 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "fp8_rowwise_common.h"

at::Tensor
fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y) {
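  // Tuned CK configuration encoded in the kernel name: block size 128,
  // per-block tile M16 x N32 x K128, 16x16 wave tile, 1x1 wave mapping,
  // then the A/B transfer clusters and epilogue widths (a best-effort
  // reading; see DeviceGemmHelper in fp8_rowwise_common.h for the
  // authoritative parameter order).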
using DeviceGemmInstance = DeviceGemmHelper<
128,
16,
32,
128,
16,
16,
1,
1,
S<8, 16, 1>,
S<8, 16, 1>,
S<1, 16, 1, 8>,
S<4, 4, 1>,
1,
1,
ck::BlockGemmPipelineScheduler::Interwave,
ck::BlockGemmPipelineVersion::v2,
ck::tensor_operation::device::GemmSpecialization::Default>;
// Run kernel instance with KBatch = 8 (split-K), matching the _8 suffix in the kernel name.
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y, 8);
}
@@ -0,0 +1,38 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "fp8_rowwise_common.h"

at::Tensor
fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y) {
using DeviceGemmInstance = DeviceGemmHelper<
128,
16,
32,
512,
16,
16,
1,
1,
S<32, 4, 1>,
S<32, 4, 1>,
S<1, 16, 1, 8>,
S<4, 4, 1>,
1,
1,
ck::BlockGemmPipelineScheduler::Intrawave,
ck::BlockGemmPipelineVersion::v2,
ck::tensor_operation::device::GemmSpecialization::Default>;
// Run kernel instance with the default KBatch = 1 (no split-K).
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
}
@@ -160,7 +160,8 @@ at::Tensor f8f8bf16_rowwise_impl(
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
-at::Tensor Y) {
+at::Tensor Y,
+int KBatch = 1) {
// Get input information.
int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
int N = WQ.size(0);
@@ -194,7 +195,7 @@
StrideB,
std::array<ck::index_t, NumDTensor>{0, 0},
StrideE,
-1,
+KBatch,
a_element_op,
b_element_op,
cde_element_op);
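The new KBatch parameter is the split-K factor threaded through to the CK GEMM argument: the K-dimension reduction is partitioned into KBatch slices whose partial products are accumulated, which helps keep the GPU busy on small-M, memory-bound shapes. A pure-Python illustration of the arithmetic (a sketch, not the CK implementation):

    import numpy as np

    def gemm_split_k(a: np.ndarray, b: np.ndarray, k_batch: int = 1) -> np.ndarray:
        # a: (M, K), b: (K, N). Split the reduction axis into k_batch
        # slices and accumulate the partial GEMMs, as a split-K kernel
        # does across workgroups.
        y = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)
        for sl in np.array_split(np.arange(a.shape[1]), k_batch):
            y += a[:, sl] @ b[sl, :]
        return y
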
@@ -305,3 +305,19 @@ fp8_rowwise_256x256x128x128_32x32_4x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);

at::Tensor
fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);

at::Tensor
fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);