Merge branch 'main_perf' into tianxing/moe-gemm
Chi-Chu319 authored Dec 20, 2024
2 parents 9a43c1c + 4a7afd2 commit cefc74e
Showing 22 changed files with 2,209 additions and 512 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/amd_perf_kernel_Integration_tests.yml
@@ -49,7 +49,7 @@ jobs:
id: set-matrix
run: |
if [ x"${{ github.repository }}" == x"ROCm/triton" ]; then
-            echo '::set-output name=matrix-HIP::[["self-hosted", "rocm.gfx90a"]]'
+            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx942"]]'
else
echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
fi
@@ -100,7 +100,7 @@ jobs:
matrix:
runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
container:
-      image: rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.4
+      image: rocm/pytorch:latest
options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
steps:
- name: Checkout
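For local debugging, the workflow's container spec corresponds roughly to the following docker invocation. This is a sketch assembled from the `image` and `options` fields above, not a command from the repository; the checkout and test steps are not shown.

```shell
# Launch the ROCm PyTorch container with GPU device access, mirroring the
# workflow's container options (assumes a host with ROCm, /dev/kfd, /dev/dri).
docker run -it --rm \
  --device=/dev/kfd --device=/dev/dri \
  --security-opt seccomp=unconfined \
  --group-add video --user root \
  rocm/pytorch:latest /bin/bash
```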
4 changes: 2 additions & 2 deletions .github/workflows/amd_perf_kernel_postmerge_tests.yml
@@ -27,7 +27,7 @@ jobs:
id: set-matrix
run: |
if [ x"${{ github.repository }}" == x"ROCm/triton" ]; then
-            echo '::set-output name=matrix-HIP::[["self-hosted", "rocm.gfx90a"]]'
+            echo '::set-output name=matrix-HIP::[["self-hosted", "gfx942"]]'
else
echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
fi
@@ -41,7 +41,7 @@ jobs:
matrix:
runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
container:
-      image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
+      image: rocm/pytorch:latest
options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
steps:
- name: Checkout
19 changes: 19 additions & 0 deletions python/perf-kernels/README.md
@@ -42,9 +42,28 @@ This script contains the Flash Attention kernel with the following support
- Multi and Grouped Query attention
- ALiBi bias
- Matrix bias
- Persistent kernels, useful for small-to-moderate sequence lengths and especially for causal attention.
- Int8 quantization

These are currently supported for the forward kernel only.

### INT8 Quantization Support

1. `q_descale`, `k_descale`, and `v_descale` provided:
   - The first QK GEMM runs in INT8, then its output is dequantized to the specified `dtype`.
   - The second PV GEMM runs in the specified `dtype`.

2. `q_descale`, `k_descale`, `p_descale`, and `v_descale` provided:
   - Both the QK and PV GEMMs run in INT8.
   - The results are dequantized to the specified `dtype` after both GEMMs.

3. Only `k_descale` and `v_descale` provided:
   - K and V are dequantized before the QK and PV GEMMs, respectively.
   - Both GEMMs run in the specified `dtype`.

Note: the softmax operation is always performed in `fp32`.
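As a rough illustration of case 1 above, here is a plain-NumPy sketch of an INT8 QK GEMM with descale factors. The helper name and the per-tensor symmetric quantization scheme are illustrative assumptions, not the actual Triton kernel, which tiles this computation on the GPU.

```python
import numpy as np

def int8_qk_gemm(q_fp32, k_fp32):
    # Per-tensor symmetric quantization: map the max magnitude to 127.
    q_scale = np.abs(q_fp32).max() / 127.0
    k_scale = np.abs(k_fp32).max() / 127.0
    q_int8 = np.clip(np.round(q_fp32 / q_scale), -127, 127).astype(np.int8)
    k_int8 = np.clip(np.round(k_fp32 / k_scale), -127, 127).astype(np.int8)

    # INT8 GEMM accumulated in int32, then dequantized with the descale
    # factors (here q_descale = q_scale, k_descale = k_scale).
    acc_i32 = q_int8.astype(np.int32) @ k_int8.T.astype(np.int32)
    return acc_i32.astype(np.float32) * (q_scale * k_scale)

rng = np.random.default_rng(0)
q = rng.standard_normal((4, 64)).astype(np.float32)
k = rng.standard_normal((8, 64)).astype(np.float32)
approx = int8_qk_gemm(q, k)   # dequantized INT8 result
exact = q @ k.T               # full-precision reference
```

The dequantized result closely tracks the full-precision GEMM; the residual error is the quantization noise that the descale factors cannot recover.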


## `06-attention-decode.py`

This contains the Flash Decoding kernel.
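Flash Decoding splits the KV sequence into chunks, computes partial attention per chunk, and combines the partials using their running max and sum-of-exponentials statistics. The following NumPy sketch shows only that combine idea and is not the Triton kernel in this file.

```python
import numpy as np

def attn_chunk(q, k, v):
    # Partial attention over one KV chunk: output, max score, sum of exp.
    s = q @ k.T
    m = s.max()
    p = np.exp(s - m)
    return p @ v, m, p.sum()

def flash_decode(q, k, v, n_chunks=4):
    parts = [attn_chunk(q, kc, vc)
             for kc, vc in zip(np.array_split(k, n_chunks),
                               np.array_split(v, n_chunks))]
    outs, ms, ls = zip(*parts)
    m = max(ms)
    # Rescale each partial to the global max, then renormalize.
    scales = [np.exp(mi - m) for mi in ms]
    num = sum(o * s for o, s in zip(outs, scales))
    den = sum(l * s for l, s in zip(ls, scales))
    return num / den

rng = np.random.default_rng(1)
q = rng.standard_normal((1, 16))
k = rng.standard_normal((32, 16))
v = rng.standard_normal((32, 16))
s = q @ k.T
ref = (np.exp(s - s.max()) / np.exp(s - s.max()).sum()) @ v
out = flash_decode(q, k, v)
```

Because each partial is rescaled by `exp(m_i - m)` before summing, the chunked result is mathematically identical to computing softmax attention over the whole sequence at once.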
980 changes: 704 additions & 276 deletions python/perf-kernels/flash-attention.py

Large diffs are not rendered by default.

