Merge branch 'main' into user_sched_segmentation_mapping
rdspring1 authored Oct 30, 2024
2 parents 8310ace + bad9e50 commit 946403a
Showing 81 changed files with 2,107 additions and 849 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -195,6 +195,7 @@ list(APPEND NVFUSER_SRCS
    ${NVFUSER_SRCS_DIR}/preseg_passes/remove_bcast_squeeze.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
    ${NVFUSER_SRCS_DIR}/rng.cpp
    ${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
    ${NVFUSER_SRCS_DIR}/runtime/executor.cpp
5 changes: 5 additions & 0 deletions benchmarks/python/conftest.py
@@ -104,6 +104,11 @@ def pytest_collection_modifyitems(session, config, items):
    run_thunder = config.getoption("--benchmark-thunder")
    run_torchcompile = config.getoption("--benchmark-torchcompile")

    from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

    for item in items:
        item.obj = retry_on_oom_or_skip_test(item.obj)

    if not run_eager:
        skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run")
        for item in items:
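The hook added above wraps every collected benchmark in nvfuser.pytorch_utils.retry_on_oom_or_skip_test, which supersedes the per-benchmark clear_cuda_cache() calls removed elsewhere in this commit. The wrapper's implementation is not part of this diff; a minimal sketch of the idea, assuming it frees the caching allocator and retries once before skipping, might look like this:

# Hypothetical sketch in the spirit of nvfuser.pytorch_utils.retry_on_oom_or_skip_test;
# the real implementation may differ.
import functools

import pytest
import torch


def retry_on_oom_or_skip_test(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except torch.cuda.OutOfMemoryError:
            # Return cached blocks to the driver and retry once; if the retry
            # still runs out of memory, skip the test instead of failing.
            torch.cuda.empty_cache()
            try:
                return fn(*args, **kwargs)
            except torch.cuda.OutOfMemoryError:
                pytest.skip("CUDA out of memory even after clearing the cache")

    return wrapper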
8 changes: 1 addition & 7 deletions benchmarks/python/normalization.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
from nvfuser import FusionDefinition, DataType
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache
import numpy as np
@@ -206,8 +206,6 @@ def norm_fwd_nvf_benchmark(
    Common benchmark setup for batchnorm/instance forward call in training mode.
    """

    clear_cuda_cache()

    assert norm in ["batch_norm", "instance_norm"], NotImplementedError

    # Size is assumed to be in the order N, C, ...
@@ -293,8 +291,6 @@ def norm_bwd_nvf_benchmark(
    Common benchmark setup for batchnorm/instance forward call in training mode.
    """

    clear_cuda_cache()

    assert norm in ["batch_norm", "instance_norm"], NotImplementedError

    # Size is assumed to be in the order N, C, ...
@@ -440,7 +436,6 @@ def norm_fwd_baseline_benchmark(
    compile: bool,
    norm: str,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

@@ -475,7 +470,6 @@ def norm_bwd_baseline_benchmark(
    compile: bool,
    norm: str,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

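The clear_cuda_cache() calls deleted above reset CUDA memory state at the start of each benchmark; with the OOM-retry wrapper now installed globally in conftest.py, the explicit per-benchmark calls become unnecessary. The helper itself is not shown in this diff; assuming it was a thin wrapper over PyTorch's caching-allocator APIs, its effect was roughly:

# Rough stand-in for the removed clear_cuda_cache helper (assumed behavior;
# the real function lives in nvfuser.pytorch_utils and is not part of this diff).
import torch


def clear_cuda_cache() -> None:
    # Reset peak-memory statistics and return cached blocks to the driver so
    # each benchmark starts from a clean allocator state.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()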
137 changes: 137 additions & 0 deletions benchmarks/python/test_adaptive_layernorm_host.py
@@ -0,0 +1,137 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from .core import run_benchmark
import torch


def adaptive_layernorm_fwd_fusion(fd: FusionDefinition, eps: float = 1e-6) -> None:
    T0 = fd.define_tensor(
        shape=[-1, -1, -1],
        contiguity=[True, True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )
    T1 = fd.define_tensor(
        shape=[-1, -1],
        contiguity=[True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[1, 0],
    )
    T2 = fd.define_tensor(
        shape=[-1, -1],
        contiguity=[True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[1, 0],
    )
    T3 = fd.ops.cast(T0, dtype=DataType.Float)
    T4, T5 = fd.ops.var_mean(T3, dims=[2], correction=0, keepdim=False)
    T10 = fd.ops.broadcast_in_dim(
        T4, shape=[T0.size(0), T0.size(1), 1], broadcast_dims=[0, 1]
    )
    T15 = fd.ops.broadcast_in_dim(
        T5, shape=[T0.size(0), T0.size(1), 1], broadcast_dims=[0, 1]
    )
    S16 = fd.define_scalar(eps, dtype=DataType.Double)
    T17 = fd.ops.add(T10, S16)
    T22 = fd.ops.broadcast_in_dim(T15, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T23 = fd.ops.rsqrt(T17)
    T24 = fd.ops.sub(T3, T22)
    T29 = fd.ops.broadcast_in_dim(T23, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T30 = fd.ops.mul(T24, T29)
    T35 = fd.ops.reshape(T1, new_shape=[T1.size(0), 1, T1.size(1)])
    T36 = fd.ops.cast(T35, dtype=DataType.Float)
    S37 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T38 = fd.ops.add(S37, T36)
    T39 = fd.ops.cast(T38, dtype=DataType.Half)
    T44 = fd.ops.broadcast_in_dim(T39, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T45 = fd.ops.cast(T44, dtype=DataType.Float)
    T46 = fd.ops.mul(T30, T45)
    T51 = fd.ops.reshape(T2, new_shape=[T2.size(0), 1, T2.size(1)])
    T56 = fd.ops.broadcast_in_dim(T51, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T57 = fd.ops.cast(T56, dtype=DataType.Float)
    T58 = fd.ops.add(T46, T57)
    T59 = fd.ops.cast(T58, dtype=DataType.Half)
    fd.add_output(T5)
    fd.add_output(T23)
    fd.add_output(T59)


# This benchmark is to particularly track nvFuser host overhead for shape
# change (dynamic shape support) in the adaptive layernorm case. Running a
# new shape on this fusion without recompiling a new kernel can have significant overhead.
@pytest.mark.parametrize("host_bench_mode", ["compile", "steady", "dynamic"])
def test_adaptive_layernorm_fwd_benchmark(
    benchmark,
    host_bench_mode: str,
    disable_validation: bool,
    disable_benchmarking: bool,
):
    B = 1
    T = 30 * 1024
    D = 1024
    inputs = [
        torch.randn(B, T, D, device="cuda", dtype=torch.float16, requires_grad=True),
        torch.randn(B, D, device="cuda", dtype=torch.float16, requires_grad=True),
        torch.randn(B, D, device="cuda", dtype=torch.float16, requires_grad=True),
    ]

    # Generate multiple inputs to measure dynamic shape overhead.
    if host_bench_mode == "dynamic":
        inputs = []
        for B in range(1, 3, 1):
            for T in range(30 * 1024, 30 * 1024 + 5 * 128, 128):
                inputs.append(
                    [
                        torch.randn(
                            B,
                            T,
                            D,
                            device="cuda",
                            dtype=torch.float16,
                            requires_grad=True,
                        ),
                        torch.randn(
                            B, D, device="cuda", dtype=torch.float16, requires_grad=True
                        ),
                        torch.randn(
                            B, D, device="cuda", dtype=torch.float16, requires_grad=True
                        ),
                    ]
                )

    with FusionDefinition() as fd:
        adaptive_layernorm_fwd_fusion(fd)

    def validate(input):
        eps = 1e-6
        in_tensor, scale, shift = input
        norm_state = torch.nn.LayerNorm(D, elementwise_affine=False, eps=eps)
        norm_out = norm_state(in_tensor)
        mean = in_tensor.to(torch.float).mean(dim=-1)
        variance = in_tensor.to(torch.float).var(dim=-1, unbiased=False)
        invstd = (1.0 / torch.sqrt(variance + eps)).unsqueeze(-1)
        eager_output = norm_out * (1 + scale.view(-1, 1, D)) + shift.view(-1, 1, D)
        fd.validate(input, [mean, invstd, eager_output])

    if not disable_validation:
        if host_bench_mode == "dynamic":
            # Run validate for all input sizes.
            for input in inputs:
                validate(input)
        else:
            validate(inputs)

    if not disable_benchmarking:
        run_benchmark(
            benchmark,
            None,
            inputs,
            device=f"host:{host_bench_mode}",
            fusion_fn=adaptive_layernorm_fwd_fusion,
        )
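For reference, the fusion above encodes the same computation as the validate() helper: a layer norm over the last dimension without learned affine parameters, modulated by (1 + scale) and shifted by shift. An illustrative stand-alone PyTorch equivalent (the function name and signature are mine, not part of the benchmark):

# Illustrative eager reference for the adaptive layernorm forward fusion.
import torch
import torch.nn.functional as F


def adaptive_layernorm_ref(x, scale, shift, eps=1e-6):
    # x: [B, T, D]; scale, shift: [B, D]. Normalize over D without affine
    # parameters, then modulate with (1 + scale) and add shift.
    normed = F.layer_norm(x.float(), (x.shape[-1],), eps=eps)
    out = normed * (1 + scale.float().unsqueeze(1)) + shift.float().unsqueeze(1)
    return out.half()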
5 changes: 1 addition & 4 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -65,8 +65,6 @@ def test_bcast_add_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")

    input_shape = size if contiguous else (size[1], size[0])
@@ -105,7 +103,6 @@ def test_bcast_add_baseline_benchmark(
    contiguous: bool,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")
5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -149,8 +149,6 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -200,7 +198,6 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

4 changes: 1 addition & 3 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -111,7 +111,6 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()
    inputs = [
        torch.randn(size, device="cuda", dtype=dtype),
        torch.randn(size, device="cuda", dtype=dtype),
@@ -170,7 +169,6 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -135,8 +135,6 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -180,7 +178,6 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    dropout_p = 0.2
5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -111,8 +111,6 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype)
    input2 = torch.randn(size, device="cuda", dtype=dtype)
    weights = torch.randn(size[1], device="cuda", dtype=dtype)
@@ -156,7 +154,6 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    dropout_p = 0.2
5 changes: 1 addition & 4 deletions benchmarks/python/test_gelu_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -71,8 +71,6 @@ def test_gelu_bwd_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
    bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -99,7 +97,6 @@ def test_gelu_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
5 changes: 1 addition & 4 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -83,8 +83,6 @@ def test_gelu_bwd_reduction_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(*size, device="cuda", dtype=dtype)
    bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -116,7 +114,6 @@ def test_gelu_bwd_reduction_baseline_benchmark(
    reduction_axis: int,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)