2025-01-17 nightly release (21d1260)
pytorchbot committed Jan 17, 2025
1 parent 738ba45 commit 19abfe5
Showing 17 changed files with 454 additions and 27 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -125,7 +125,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -166,13 +166,13 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
submodules: true

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -125,7 +125,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -165,13 +165,13 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
submodules: true

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -116,7 +116,7 @@ jobs:

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -167,7 +167,7 @@ jobs:

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -166,7 +166,7 @@ jobs:
git config --global --add safe.directory '*'
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
uses: actions/download-artifact@v4
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
@@ -130,7 +130,7 @@ jobs:
steps:
# Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -119,7 +119,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV release cuda

- name: Upload Built Wheel as GHA Artifact
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -151,10 +151,10 @@ jobs:

steps:
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_release_cuda_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_release_genai.yml
@@ -119,7 +119,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV release genai

- name: Upload Built Wheel as GHA Artifact
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: fbgemm_gpu_release_genai_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
@@ -151,10 +151,10 @@ jobs:

steps:
- name: Checkout the Repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4

- name: Download Wheel Artifact from GHA
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: fbgemm_gpu_release_genai_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

34 changes: 34 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -716,6 +716,40 @@ def cuda(self) -> bool:
return True


@register_quantize_op
class FP8LiteGemm(QuantizeOpBase):
"""
FP8 lite matmul for memory-bound workloads.
"""

def quantize(self, x, w):
# Quantize both input tensors.
xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
return xq, wq, x_scale, w_scale

def compute(self, xq, wq, x_scale, w_scale):
return torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)

def quantize_and_compute(self, x, w):
xq, wq, x_scale, w_scale = self.quantize(x, w)
return self.compute(xq, wq, x_scale, w_scale)

@property
def name(self) -> str:
return "cuda_lite"

@property
def hip(self) -> bool:
# Need to add support for a better quantize kernel.
# May also have an issue with CUDA graphs.
return False

@property
def cuda(self) -> bool:
return True


@register_quantize_op
class TritonFP8RowwiseGemm(QuantizeOpBase):
"""
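The new FP8LiteGemm op follows the QuantizeOpBase pattern used by the other entries in this file: quantize() converts both operands to FP8 with per-tensor scales, compute() invokes the kernel, and quantize_and_compute() chains the two. A minimal usage sketch, assuming only the interface shown above (the shapes, dtypes, and no-argument constructor are illustrative, not part of this diff):

    import torch

    op = FP8LiteGemm()
    # Decode-style problem shape; illustrative only.
    x = torch.randn(16, 5120, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(1024, 5120, device="cuda", dtype=torch.bfloat16)

    # Per-tensor FP8 quantization of both operands.
    xq, wq, x_scale, w_scale = op.quantize(x, w)
    # f8f8bf16_lite consumes the combined scale and returns a bf16 result.
    y = op.compute(xq, wq, x_scale, w_scale)
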
@@ -40,13 +40,13 @@ struct IntTupleHash {
static const std::unordered_map<std::tuple<int, int, int>, RowwiseKernel, IntTupleHash> rowwise_lookup_dispatch = {
// Support for decode for [1024, 5120]
{{16, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8},
{{32, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8},
{{64, 1024, 5120},
-fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2},
{{128, 1024, 5120},
-fp8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2},
+fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2},
// Support for decode for [5120, 1024]
{{16, 5120, 1024},
fp8_rowwise_128x32x16x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2x2x1_1x1_interwave_v2},
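The rowwise_lookup_dispatch table maps an exact (M, N, K) decode shape to a pre-tuned CK kernel instance. This hunk reroutes the [1024, 5120] decode shapes: M of 16 and 32 move to the split-K variant added in this commit (the _8 suffix corresponds to KBatch = 8), while M of 64 and 128 move to an intrawave instance with a deeper K tile of 512. A hedged Python sketch of the lookup pattern (names and signatures are illustrative, not the real API):

    from typing import Callable, Dict, Tuple

    # Stand-in for a tuned kernel entry point.
    Kernel = Callable[..., object]

    def pick_kernel(table: Dict[Tuple[int, int, int], Kernel],
                    fallback: Kernel, m: int, n: int, k: int) -> Kernel:
        # Exact-shape lookup with a generic fallback, mirroring the C++
        # unordered_map keyed by a hashed (M, N, K) tuple.
        return table.get((m, n, k), fallback)
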
@@ -0,0 +1,38 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "fp8_rowwise_common.h"

at::Tensor
fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y) {
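  // Tuned CK configuration encoded in the kernel name: block size 128,
  // per-block tile M16 x N32 x K128, 16x16 wave tile, 1x1 wave mapping,
  // then the A/B transfer clusters and epilogue widths (a best-effort
  // reading; see DeviceGemmHelper in fp8_rowwise_common.h for the
  // authoritative parameter order).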
using DeviceGemmInstance = DeviceGemmHelper<
128,
16,
32,
128,
16,
16,
1,
1,
S<8, 16, 1>,
S<8, 16, 1>,
S<1, 16, 1, 8>,
S<4, 4, 1>,
1,
1,
ck::BlockGemmPipelineScheduler::Interwave,
ck::BlockGemmPipelineVersion::v2,
ck::tensor_operation::device::GemmSpecialization::Default>;
// Run kernel instance with KBatch = 8 (split-K), matching the _8 suffix in the kernel name.
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y, 8);
}
@@ -0,0 +1,38 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "fp8_rowwise_common.h"

at::Tensor
fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y) {
using DeviceGemmInstance = DeviceGemmHelper<
128,
16,
32,
512,
16,
16,
1,
1,
S<32, 4, 1>,
S<32, 4, 1>,
S<1, 16, 1, 8>,
S<4, 4, 1>,
1,
1,
ck::BlockGemmPipelineScheduler::Intrawave,
ck::BlockGemmPipelineVersion::v2,
ck::tensor_operation::device::GemmSpecialization::Default>;
// Run kernel instance with the default KBatch = 1 (no split-K).
return f8f8bf16_rowwise_impl<DeviceGemmInstance>(XQ, WQ, x_scale, w_scale, Y);
}
@@ -160,7 +160,8 @@ at::Tensor f8f8bf16_rowwise_impl(
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
-at::Tensor Y) {
+at::Tensor Y,
+int KBatch = 1) {
// Get input information.
int M = size_to_dim_(XQ.dim() - 1, XQ.sizes());
int N = WQ.size(0);
@@ -194,7 +195,7 @@
StrideB,
std::array<ck::index_t, NumDTensor>{0, 0},
StrideE,
-1,
+KBatch,
a_element_op,
b_element_op,
cde_element_op);
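The new KBatch parameter is the split-K factor threaded through to the CK GEMM argument: the K-dimension reduction is partitioned into KBatch slices whose partial products are accumulated, which helps keep the GPU busy on small-M, memory-bound shapes. A pure-Python illustration of the arithmetic (a sketch, not the CK implementation):

    import numpy as np

    def gemm_split_k(a: np.ndarray, b: np.ndarray, k_batch: int = 1) -> np.ndarray:
        # a: (M, K), b: (K, N). Split the reduction axis into k_batch
        # slices and accumulate the partial GEMMs, as a split-K kernel
        # does across workgroups.
        y = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)
        for sl in np.array_split(np.arange(a.shape[1]), k_batch):
            y += a[:, sl] @ b[sl, :]
        return y
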
@@ -305,3 +305,19 @@ fp8_rowwise_256x256x128x128_32x32_4x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);

at::Tensor
fp8_rowwise_128x16x32x128_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4x4x1_1x1_interwave_v2_8(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);

at::Tensor
fp8_rowwise_128x16x32x512_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4x4x1_1x1_intrawave_v2(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
at::Tensor Y);