Merge branch 'main' into user_sched_segmentation_mapping
rdspring1 authored Oct 30, 2024
2 parents 8310ace + bad9e50 commit 946403a
Showing 81 changed files with 2,107 additions and 849 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -195,6 +195,7 @@ list(APPEND NVFUSER_SRCS
    ${NVFUSER_SRCS_DIR}/preseg_passes/remove_bcast_squeeze.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
    ${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
    ${NVFUSER_SRCS_DIR}/rng.cpp
    ${NVFUSER_SRCS_DIR}/runtime/allocations.cpp
    ${NVFUSER_SRCS_DIR}/runtime/executor.cpp
5 changes: 5 additions & 0 deletions benchmarks/python/conftest.py
@@ -104,6 +104,11 @@ def pytest_collection_modifyitems(session, config, items):
    run_thunder = config.getoption("--benchmark-thunder")
    run_torchcompile = config.getoption("--benchmark-torchcompile")

    from nvfuser.pytorch_utils import retry_on_oom_or_skip_test

    for item in items:
        item.obj = retry_on_oom_or_skip_test(item.obj)

    if not run_eager:
        skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run")
        for item in items:
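The hook added above wraps every collected benchmark in nvfuser.pytorch_utils.retry_on_oom_or_skip_test, which supersedes the per-benchmark clear_cuda_cache() calls removed elsewhere in this commit. The wrapper's implementation is not part of this diff; a minimal sketch of the idea, assuming it frees the caching allocator and retries once before skipping, might look like this:

# Hypothetical sketch in the spirit of nvfuser.pytorch_utils.retry_on_oom_or_skip_test;
# the real implementation may differ.
import functools

import pytest
import torch


def retry_on_oom_or_skip_test(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except torch.cuda.OutOfMemoryError:
            # Return cached blocks to the driver and retry once; if the retry
            # still runs out of memory, skip the test instead of failing.
            torch.cuda.empty_cache()
            try:
                return fn(*args, **kwargs)
            except torch.cuda.OutOfMemoryError:
                pytest.skip("CUDA out of memory even after clearing the cache")

    return wrapper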
8 changes: 1 addition & 7 deletions benchmarks/python/normalization.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
from nvfuser import FusionDefinition, DataType
from .global_params import PROMOTE_DTYPES
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
import torch
from .core import run_benchmark, unary_bwd_torch, clear_dynamo_cache
import numpy as np
@@ -206,8 +206,6 @@ def norm_fwd_nvf_benchmark(
    Common benchmark setup for batchnorm/instance forward call in training mode.
    """

    clear_cuda_cache()

    assert norm in ["batch_norm", "instance_norm"], NotImplementedError

    # Size is assumed to be in the order N, C, ...
@@ -293,8 +291,6 @@ def norm_bwd_nvf_benchmark(
    Common benchmark setup for batchnorm/instance forward call in training mode.
    """

    clear_cuda_cache()

    assert norm in ["batch_norm", "instance_norm"], NotImplementedError

    # Size is assumed to be in the order N, C, ...
@@ -440,7 +436,6 @@ def norm_fwd_baseline_benchmark(
    compile: bool,
    norm: str,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

@@ -475,7 +470,6 @@ def norm_bwd_baseline_benchmark(
    compile: bool,
    norm: str,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

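The clear_cuda_cache() calls deleted above reset CUDA memory state at the start of each benchmark; with the OOM-retry wrapper now installed globally in conftest.py, the explicit per-benchmark calls become unnecessary. The helper itself is not shown in this diff; assuming it was a thin wrapper over PyTorch's caching-allocator APIs, its effect was roughly:

# Rough stand-in for the removed clear_cuda_cache helper (assumed behavior;
# the real function lives in nvfuser.pytorch_utils and is not part of this diff).
import torch


def clear_cuda_cache() -> None:
    # Reset peak-memory statistics and return cached blocks to the driver so
    # each benchmark starts from a clean allocator state.
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()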
137 changes: 137 additions & 0 deletions benchmarks/python/test_adaptive_layernorm_host.py
@@ -0,0 +1,137 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from .core import run_benchmark
import torch


def adaptive_layernorm_fwd_fusion(fd: FusionDefinition, eps: float = 1e-6) -> None:
    T0 = fd.define_tensor(
        shape=[-1, -1, -1],
        contiguity=[True, True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )
    T1 = fd.define_tensor(
        shape=[-1, -1],
        contiguity=[True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[1, 0],
    )
    T2 = fd.define_tensor(
        shape=[-1, -1],
        contiguity=[True, True],
        dtype=DataType.Half,
        is_cpu=False,
        stride_order=[1, 0],
    )
    T3 = fd.ops.cast(T0, dtype=DataType.Float)
    T4, T5 = fd.ops.var_mean(T3, dims=[2], correction=0, keepdim=False)
    T10 = fd.ops.broadcast_in_dim(
        T4, shape=[T0.size(0), T0.size(1), 1], broadcast_dims=[0, 1]
    )
    T15 = fd.ops.broadcast_in_dim(
        T5, shape=[T0.size(0), T0.size(1), 1], broadcast_dims=[0, 1]
    )
    S16 = fd.define_scalar(eps, dtype=DataType.Double)
    T17 = fd.ops.add(T10, S16)
    T22 = fd.ops.broadcast_in_dim(T15, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T23 = fd.ops.rsqrt(T17)
    T24 = fd.ops.sub(T3, T22)
    T29 = fd.ops.broadcast_in_dim(T23, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T30 = fd.ops.mul(T24, T29)
    T35 = fd.ops.reshape(T1, new_shape=[T1.size(0), 1, T1.size(1)])
    T36 = fd.ops.cast(T35, dtype=DataType.Float)
    S37 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T38 = fd.ops.add(S37, T36)
    T39 = fd.ops.cast(T38, dtype=DataType.Half)
    T44 = fd.ops.broadcast_in_dim(T39, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T45 = fd.ops.cast(T44, dtype=DataType.Float)
    T46 = fd.ops.mul(T30, T45)
    T51 = fd.ops.reshape(T2, new_shape=[T2.size(0), 1, T2.size(1)])
    T56 = fd.ops.broadcast_in_dim(T51, shape=T0.shape(), broadcast_dims=[0, 1, 2])
    T57 = fd.ops.cast(T56, dtype=DataType.Float)
    T58 = fd.ops.add(T46, T57)
    T59 = fd.ops.cast(T58, dtype=DataType.Half)
    fd.add_output(T5)
    fd.add_output(T23)
    fd.add_output(T59)


# This benchmark is to particularly track nvFuser host overhead for shape
# change (dynamic shape support) in the adaptive layernorm case. Running a
# new shape on this fusion without recompiling a new kernel can have significant overhead.
@pytest.mark.parametrize("host_bench_mode", ["compile", "steady", "dynamic"])
def test_adaptive_layernorm_fwd_benchmark(
    benchmark,
    host_bench_mode: str,
    disable_validation: bool,
    disable_benchmarking: bool,
):
    B = 1
    T = 30 * 1024
    D = 1024
    inputs = [
        torch.randn(B, T, D, device="cuda", dtype=torch.float16, requires_grad=True),
        torch.randn(B, D, device="cuda", dtype=torch.float16, requires_grad=True),
        torch.randn(B, D, device="cuda", dtype=torch.float16, requires_grad=True),
    ]

    # Generate multiple inputs to measure dynamic shape overhead.
    if host_bench_mode == "dynamic":
        inputs = []
        for B in range(1, 3, 1):
            for T in range(30 * 1024, 30 * 1024 + 5 * 128, 128):
                inputs.append(
                    [
                        torch.randn(
                            B,
                            T,
                            D,
                            device="cuda",
                            dtype=torch.float16,
                            requires_grad=True,
                        ),
                        torch.randn(
                            B, D, device="cuda", dtype=torch.float16, requires_grad=True
                        ),
                        torch.randn(
                            B, D, device="cuda", dtype=torch.float16, requires_grad=True
                        ),
                    ]
                )

    with FusionDefinition() as fd:
        adaptive_layernorm_fwd_fusion(fd)

    def validate(input):
        eps = 1e-6
        in_tensor, scale, shift = input
        norm_state = torch.nn.LayerNorm(D, elementwise_affine=False, eps=eps)
        norm_out = norm_state(in_tensor)
        mean = in_tensor.to(torch.float).mean(dim=-1)
        variance = in_tensor.to(torch.float).var(dim=-1, unbiased=False)
        invstd = (1.0 / torch.sqrt(variance + eps)).unsqueeze(-1)
        eager_output = norm_out * (1 + scale.view(-1, 1, D)) + shift.view(-1, 1, D)
        fd.validate(input, [mean, invstd, eager_output])

    if not disable_validation:
        if host_bench_mode == "dynamic":
            # Run validate for all input sizes.
            for input in inputs:
                validate(input)
        else:
            validate(inputs)

    if not disable_benchmarking:
        run_benchmark(
            benchmark,
            None,
            inputs,
            device=f"host:{host_bench_mode}",
            fusion_fn=adaptive_layernorm_fwd_fusion,
        )
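For reference, the fusion above encodes the same computation as the validate() helper: a layer norm over the last dimension without learned affine parameters, modulated by (1 + scale) and shifted by shift. An illustrative stand-alone PyTorch equivalent (the function name and signature are mine, not part of the benchmark):

# Illustrative eager reference for the adaptive layernorm forward fusion.
import torch
import torch.nn.functional as F


def adaptive_layernorm_ref(x, scale, shift, eps=1e-6):
    # x: [B, T, D]; scale, shift: [B, D]. Normalize over D without affine
    # parameters, then modulate with (1 + scale) and add shift.
    normed = F.layer_norm(x.float(), (x.shape[-1],), eps=eps)
    out = normed * (1 + scale.float().unsqueeze(1)) + shift.float().unsqueeze(1)
    return out.half()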
5 changes: 1 addition & 4 deletions benchmarks/python/test_broadcast_add_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -65,8 +65,6 @@ def test_bcast_add_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")

    input_shape = size if contiguous else (size[1], size[0])
@@ -105,7 +103,6 @@ def test_bcast_add_baseline_benchmark(
    contiguous: bool,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda")
5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_layernorm_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -149,8 +149,6 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -200,7 +198,6 @@ def test_dropout_layernorm_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

4 changes: 1 addition & 3 deletions benchmarks/python/test_dropout_layernorm_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -111,7 +111,6 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()
    inputs = [
        torch.randn(size, device="cuda", dtype=dtype),
        torch.randn(size, device="cuda", dtype=dtype),
@@ -170,7 +169,6 @@ def test_dropout_layernorm_fwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()

5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -135,8 +135,6 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    input2 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
@@ -180,7 +178,6 @@ def test_dropout_rmsnorm_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    dropout_p = 0.2
5 changes: 1 addition & 4 deletions benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import (
    run_benchmark,
    clear_dynamo_cache,
@@ -111,8 +111,6 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
    disable_benchmarking: bool,
    eps: float = 1e-5,
):
    clear_cuda_cache()

    input1 = torch.randn(size, device="cuda", dtype=dtype)
    input2 = torch.randn(size, device="cuda", dtype=dtype)
    weights = torch.randn(size[1], device="cuda", dtype=dtype)
@@ -156,7 +154,6 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    dropout_p = 0.2
5 changes: 1 addition & 4 deletions benchmarks/python/test_gelu_bwd.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -71,8 +71,6 @@ def test_gelu_bwd_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(size, device="cuda", dtype=dtype)
    bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -99,7 +97,6 @@ def test_gelu_bwd_baseline_benchmark(
    dtype: torch.dtype,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)
5 changes: 1 addition & 4 deletions benchmarks/python/test_gelu_bwd_reduction.py
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: BSD-3-Clause
import pytest
from nvfuser import FusionDefinition, DataType
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, clear_cuda_cache
from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
from .core import run_benchmark, clear_dynamo_cache
import torch
from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
@@ -83,8 +83,6 @@ def test_gelu_bwd_reduction_nvf_benchmark(
    disable_validation: bool,
    disable_benchmarking: bool,
):
    clear_cuda_cache()

    inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
    grads = torch.randn(*size, device="cuda", dtype=dtype)
    bias = torch.ones(size[-1], device="cuda", dtype=dtype)
@@ -116,7 +114,6 @@ def test_gelu_bwd_reduction_baseline_benchmark(
    reduction_axis: int,
    compile: bool,
):
    clear_cuda_cache()
    if compile:
        clear_dynamo_cache()
    inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True)