```python
# CUDA devices:
#  0: NVIDIA RTX 6000 Ada Generation
# torch version: 2.6.0a0+ecf3bae40a.nvInternal
# nvfuser version: 0.2.24+git28ae834
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id25(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T2 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T3 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T4 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 0, 1])
    T5 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 0, 1])
    T6 = fd.define_tensor(shape=[1, 28, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 0, 1])
    T7 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 0, 1])
    T8 = fd.define_tensor(shape=[1, 4, 4096, 128], contiguity=[None, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T9 = fd.define_tensor(shape=[1, 4096, 3584], contiguity=[None, True, True], dtype=DataType.Bool, is_cpu=False, stride_order=[2, 1, 0])
    T10 = fd.define_tensor(shape=[1, 4096, 512], contiguity=[None, True, True], dtype=DataType.Bool, is_cpu=False, stride_order=[2, 1, 0])
    T11 = fd.define_tensor(shape=[1, 4096, 512], contiguity=[None, True, True], dtype=DataType.Bool, is_cpu=False, stride_order=[2, 1, 0])
    T18 = fd.ops.reshape(T0, new_shape=[1, 4, 7, 4096, 128])
    T19 = fd.ops.cast(T18, dtype=DataType.Float)
    T20 = fd.ops.sum(T19, dims=[0, 2], keepdim=False, dtype=DataType.Null)
    T21 = fd.ops.cast(T20, dtype=DataType.BFloat16)
    T28 = fd.ops.broadcast_in_dim(T21, shape=[1, 4, 1, 4096, 128], broadcast_dims=[1, 3, 4])
    T29 = fd.ops.cast(T28, dtype=DataType.Float)
    T30 = fd.ops.sum(T29, dims=[0, 2], keepdim=False, dtype=DataType.Null)
    T31 = fd.ops.cast(T30, dtype=DataType.BFloat16)
    T37 = fd.ops.broadcast_in_dim(T31, shape=[1, 4, 4096, 128], broadcast_dims=[1, 2, 3])
    T38 = fd.ops.cast(T37, dtype=DataType.Float)
    T39 = fd.ops.cast(T1, dtype=DataType.Float)
    T40 = fd.ops.cast(T2, dtype=DataType.Float)
    T41 = fd.ops.add(T39, T38)
    T48 = fd.ops.reshape(T3, new_shape=[1, 4, 7, 4096, 128])
    T49 = fd.ops.mul(T4, T40)
    T50 = fd.ops.mul(T5, T41)
    T51 = fd.ops.cast(T48, dtype=DataType.Float)
    T52 = fd.ops.cast(T49, dtype=DataType.BFloat16)
    T53 = fd.ops.cast(T50, dtype=DataType.BFloat16)
    T54 = fd.ops.sum(T51, dims=[0, 2], keepdim=False, dtype=DataType.Null)
    T70 = fd.ops.slice(T52, start_indices=[0, 0, 0, 0], end_indices=[1, 28, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T86 = fd.ops.slice(T53, start_indices=[0, 0, 0, 0], end_indices=[1, 4, 4096, 64], strides=[1, 1, 1, 1], manual_normalization=0)
    T87 = fd.ops.cast(T54, dtype=DataType.BFloat16)
    T88 = fd.ops.cast(T70, dtype=DataType.Float)
    T89 = fd.ops.cast(T86, dtype=DataType.Float)
    T96 = fd.ops.broadcast_in_dim(T87, shape=[1, 4, 1, 4096, 128], broadcast_dims=[1, 3, 4])
    T97 = fd.ops.neg(T88)
    T98 = fd.ops.neg(T89)
    T99 = fd.ops.cast(T96, dtype=DataType.Float)
    T115 = fd.ops.slice(T52, start_indices=[0, 0, 0, 64], end_indices=[1, 28, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    T116 = fd.ops.cast(T97, dtype=DataType.BFloat16)
    T132 = fd.ops.slice(T53, start_indices=[0, 0, 0, 64], end_indices=[1, 4, 4096, 128], strides=[1, 1, 1, 1], manual_normalization=0)
    T133 = fd.ops.cast(T98, dtype=DataType.BFloat16)
    T134 = fd.ops.sum(T99, dims=[0, 2], keepdim=False, dtype=DataType.Null)
    S135 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T145 = fd.ops.pad(T115, [0, 64, 0, 0, 0, 0, 0, 0], S135)
    S146 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T156 = fd.ops.pad(T116, [64, 0, 0, 0, 0, 0, 0, 0], S146)
    S157 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T167 = fd.ops.pad(T132, [0, 64, 0, 0, 0, 0, 0, 0], S157)
    S168 = fd.define_scalar(0.00000, dtype=DataType.Double)
    T178 = fd.ops.pad(T133, [64, 0, 0, 0, 0, 0, 0, 0], S168)
    T179 = fd.ops.cast(T134, dtype=DataType.BFloat16)
    T180 = fd.ops.cast(T145, dtype=DataType.Float)
    T181 = fd.ops.cast(T156, dtype=DataType.Float)
    T182 = fd.ops.cast(T167, dtype=DataType.Float)
    T183 = fd.ops.cast(T178, dtype=DataType.Float)
    T189 = fd.ops.broadcast_in_dim(T179, shape=[1, 4, 4096, 128], broadcast_dims=[1, 2, 3])
    T190 = fd.ops.mul(T6, T40)
    T191 = fd.ops.add(T181, T180)
    T192 = fd.ops.mul(T7, T41)
    T193 = fd.ops.add(T183, T182)
    T194 = fd.ops.cast(T189, dtype=DataType.Float)
    T195 = fd.ops.cast(T8, dtype=DataType.Float)
    T196 = fd.ops.add(T191, T190)
    T197 = fd.ops.add(T193, T192)
    T198 = fd.ops.add(T195, T194)
    T199 = fd.ops.cast(T196, dtype=DataType.BFloat16)
    T200 = fd.ops.cast(T197, dtype=DataType.BFloat16)
    T201 = fd.ops.cast(T198, dtype=DataType.BFloat16)
    T202 = fd.ops.permute(T199, dims=[0, 2, 1, 3])
    T203 = fd.ops.permute(T200, dims=[0, 2, 1, 3])
    T204 = fd.ops.permute(T201, dims=[0, 2, 1, 3])
    T209 = fd.ops.reshape(T202, new_shape=[1, 4096, 3584])
    T214 = fd.ops.reshape(T203, new_shape=[1, 4096, 512])
    T219 = fd.ops.reshape(T204, new_shape=[1, 4096, 512])
    T220 = fd.ops.cast(T209, dtype=DataType.Float)
    T221 = fd.ops.cast(T214, dtype=DataType.Float)
    T222 = fd.ops.cast(T219, dtype=DataType.Float)
    S223 = fd.define_scalar(1.11111, dtype=DataType.Double)
    T224 = fd.ops.mul(S223, T220)
    T225 = fd.ops.cast(T9, dtype=DataType.Float)
    S226 = fd.define_scalar(1.11111, dtype=DataType.Double)
    T227 = fd.ops.mul(S226, T221)
    T228 = fd.ops.cast(T10, dtype=DataType.Float)
    S229 = fd.define_scalar(1.11111, dtype=DataType.Double)
    T230 = fd.ops.mul(S229, T222)
    T231 = fd.ops.cast(T11, dtype=DataType.Float)
    T232 = fd.ops.mul(T225, T224)
    T233 = fd.ops.mul(T228, T227)
    T234 = fd.ops.mul(T231, T230)
    S235 = fd.define_scalar(4.00000, dtype=DataType.Double)
    T236 = fd.ops.mul(S235, T232)
    S237 = fd.define_scalar(4.00000, dtype=DataType.Double)
    T238 = fd.ops.mul(S237, T233)
    S239 = fd.define_scalar(4.00000, dtype=DataType.Double)
    T240 = fd.ops.mul(S239, T234)
    T241 = fd.ops.cast(T236, dtype=DataType.BFloat16)
    T242 = fd.ops.cast(T238, dtype=DataType.BFloat16)
    T243 = fd.ops.cast(T240, dtype=DataType.BFloat16)
    T247 = fd.ops.reshape(T209, new_shape=[4096, 3584])
    T251 = fd.ops.reshape(T241, new_shape=[4096, 3584])
    T255 = fd.ops.reshape(T214, new_shape=[4096, 512])
    T259 = fd.ops.reshape(T242, new_shape=[4096, 512])
    T263 = fd.ops.reshape(T219, new_shape=[4096, 512])
    T267 = fd.ops.reshape(T243, new_shape=[4096, 512])
    fd.add_output(T267)
    fd.add_output(T263)
    fd.add_output(T259)
    fd.add_output(T255)
    fd.add_output(T251)
    fd.add_output(T247)

with FusionDefinition() as fd:
    nvfuser_fusion_id25(fd)

inputs = [
    torch.testing.make_tensor((1, 28, 4096, 128), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4, 4096, 128), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 28, 4096, 128), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 28, 4096, 128), dtype=torch.bfloat16, device='cuda:0'),
    torch.randn(14680064, dtype=torch.float32, device='cuda:0').as_strided((1, 28, 4096, 128), (14680064, 524288, 1, 4096)),
    torch.randn(2097152, dtype=torch.float32, device='cuda:0').as_strided((1, 4, 4096, 128), (2097152, 524288, 1, 4096)),
    torch.randn(14680064, dtype=torch.float32, device='cuda:0').as_strided((1, 28, 4096, 128), (14680064, 524288, 1, 4096)),
    torch.randn(2097152, dtype=torch.float32, device='cuda:0').as_strided((1, 4, 4096, 128), (2097152, 524288, 1, 4096)),
    torch.testing.make_tensor((1, 4, 4096, 128), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 3584), dtype=torch.bool, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 512), dtype=torch.bool, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 512), dtype=torch.bool, device='cuda:0'),
]
fd.execute(inputs)
```
The error only reproduces when the above script is launched with `CUDA_LAUNCH_BLOCKING=1`; otherwise it seems to fail elsewhere.

Potentially related: #3701
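For anyone else reproducing this: a minimal sketch of forcing synchronous launches from inside the script itself, equivalent to prefixing the command with `CUDA_LAUNCH_BLOCKING=1` (assumption: the variable is set before torch initializes its CUDA context, so it goes above the `import torch` line):

```python
# Sketch: make every CUDA kernel launch synchronous so the failure is
# reported at the offending launch rather than at a later, unrelated call.
# CUDA_LAUNCH_BLOCKING is read when the CUDA context is created, so set it
# before any CUDA work happens -- before `import torch` is the safe choice.
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # noqa: E402 -- deliberately imported after setting the env var
```

With blocking launches, a device-side error surfaces at the kernel that caused it; without them it propagates asynchronously and is raised by some later API call, which is consistent with the script appearing to fail elsewhere when the variable is unset.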
Verified that this is a duplicate of #3701 and is fixed after reverting the PR. I'm closing this one.