Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: broken enzyme tests
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 22, 2024
1 parent 8459414 commit 99fc6ac
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 7 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,18 @@ Compat = "4.15.0"
CpuId = "0.3"
DispatchDoctor = "0.4.12"
Enzyme = "0.13.1"
EnzymeCore = "0.8"
EnzymeCore = "0.8.1"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
Hwloc = "3.2"
KernelAbstractions = "0.9.22"
KernelAbstractions = "0.9.27"
LinearAlgebra = "1.10"
LoopVectorization = "0.12.171"
LuxCore = "1"
MKL = "0.7"
MLDataDevices = "1.1.1"
Markdown = "1.10"
NNlib = "0.9.21"
NNlib = "0.9.24"
Octavian = "0.3.28"
Polyester = "0.7.15"
Random = "1.10"
Expand Down
2 changes: 1 addition & 1 deletion ext/LuxLibEnzymeExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ using Static: True

Utils.is_extension_loaded(::Val{:Enzyme}) = True()

end
end
5 changes: 2 additions & 3 deletions test/common_ops/dense_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,13 @@ end
end

@testitem "Enzyme.Forward patch: dense" tags=[:dense] setup=[SharedTestSetup] skip=:(using LuxTestUtils; !LuxTestUtils.ENZYME_TESTING_ENABLED) begin
using LuxLib, Random, LuxTestUtils, Enzyme
using LuxLib, Random, ForwardDiff, Enzyme

x = rand(Float32, 2, 2)

f(x) = sum(abs2, LuxLib.Impl.matmul(x, x))

# Just test that we don't crash
@test length(Enzyme.gradient(Forward, f, x)) == 4
@test only(Enzyme.gradient(Forward, f, x)) ≈ ForwardDiff.gradient(f, x)
end

@testitem "Enzyme rules for fused dense" tags=[:dense] setup=[SharedTestSetup] skip=:(using LuxTestUtils; !LuxTestUtils.ENZYME_TESTING_ENABLED) begin
Expand Down

3 comments on commit 99fc6ac

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/115662

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.3.0 -m "<description of version>" 99fc6ac3b46e8538b7c8d5a0d8c6e7f4b78eeff6
git push origin v1.3.0

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 99fc6ac Previous: a6c4a16 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7000 ns 5666 ns 1.24
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5874.5 ns 5667 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8250 ns 7062.5 ns 1.17
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5625 ns 5541.5 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 88896 ns 117778 ns 0.75
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 400425 ns 404275 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9958 ns 9937.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9708 ns 10041 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9875 ns 10291 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9979.5 ns 9875 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 370778 ns 544239 ns 0.68
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 665927 ns 11501326 ns 0.0579000195281831
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1249.5 ns 1416.5 ns 0.88
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3000 ns 1479 ns 2.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1959 ns 1625 ns 1.21
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1687.5 ns 1542 ns 1.09
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 13908 ns 21518 ns 0.65
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 30060 ns 29030 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3959 ns 4250 ns 0.93
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4291 ns 4333 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 3875 ns 4313 ns 0.90
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4375 ns 4459 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 104640 ns 145904.5 ns 0.72
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 145602 ns 145511 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58042 ns 58625 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39708.5 ns 39750 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 40084 ns 40042 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82708 ns 83395.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 30831 ns 37436 ns 0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79190 ns 80685.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2061042 ns 2046125 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2079750 ns 2077896 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2084916 ns 2083625.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2001229 ns 1999104 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 181552 ns 229936 ns 0.79
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1440455 ns 1490545 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 148042 ns 162312.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 148000 ns 164083 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 155708 ns 174959 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 176313 ns 153854 ns 1.15
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168318 ns 166305 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203247.5 ns 198262 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1122729.5 ns 1121458.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1119625 ns 1114979 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1125833 ns 1119209 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1123854.5 ns 1123521 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 539424 ns 696644 ns 0.77
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 912000 ns 1026480.5 ns 0.89
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4625 ns 4875 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5084 ns 4916 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6125 ns 5875 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4125 ns 5375 ns 0.77
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 60787 ns 92112 ns 0.66
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67560 ns 69791 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8875 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8584 ns 8917 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8667 ns 8959 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8417 ns 8625 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 418528 ns 596620 ns 0.70
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 384969 ns 389954 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17542 ns 18312 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17542 ns 18104.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20458 ns 20021 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18770.5 ns 17771 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 59728.5 ns 67875.5 ns 0.88
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76240 ns 77581 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224208 ns 235917 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219500 ns 212458 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221312.5 ns 213667 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213000 ns 225292 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 293183.5 ns 353373 ns 0.83
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 463935 ns 470510 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 708 ns 0.94
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 625 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 916 ns 959 ns 0.96
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 729.5 ns 0.86
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 13248 ns 20362 ns 0.65
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 30930 ns 32440 ns 0.95
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1459 ns 1375 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1458 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1417 ns 1459 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1417 ns 1375 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 92361 ns 125347.5 ns 0.74
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 136232 ns 135651 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7417 ns 7458 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5292 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5416 ns 5458 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10375 ns 10416 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 18749 ns 24280.5 ns 0.77
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48581 ns 48481 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 231083 ns 256833 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 237166.5 ns 268834 ns 0.88
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 241042 ns 238167 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 255583 ns 213521 ns 1.20
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 154979 ns 190543 ns 0.81
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 646107 ns 644671.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4083 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4084 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4084 ns 4083 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 19985 ns 23269 ns 0.86
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 46780 ns 48260 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16458 ns 16542 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16500 ns 16542 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16625 ns 16833 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16791 ns 16583 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 176107 ns 195985.5 ns 0.90
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 175202 ns 174616.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 511792 ns 511667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 331959 ns 331875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 332000 ns 332042 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 865083 ns 865458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 116899.5 ns 113196 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 241233 ns 243182 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2275354 ns 2277833 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1753833 ns 1758208 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1758916 ns 1758041.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3193500 ns 3193625 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 203284.5 ns 242653 ns 0.84
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 738868 ns 741122 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7459 ns 6396 ns 1.17
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6854.5 ns 7021 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6895.5 ns 7583 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6459 ns 6084 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 84654 ns 90386 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 65201 ns 65841 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11604 ns 11812 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11125 ns 11729.5 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12083 ns 12250 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12021 ns 10125 ns 1.19
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 566453.5 ns 626387 ns 0.90
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 408354 ns 405759 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 542 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 20386 ns 23421 ns 0.87
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 47011 ns 46570 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2083 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2083 ns 2208 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2166 ns 2167 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2084 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 228468 ns 221475.5 ns 1.03
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 179272 ns 174101.5 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8250 ns 9041 ns 0.91
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8833 ns 9292 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9292 ns 10375 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8875 ns 9000 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 107454 ns 94379 ns 1.14
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 74891 ns 72281 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16812.5 ns 17375 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17750 ns 17729 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 19271 ns 19209 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17791.5 ns 17562.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 534728 ns 576225.5 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 378084 ns 378363 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 27220 ns 35667 ns 0.76
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 48461 ns 46061 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10021 ns 10687.5 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9125 ns 9083.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9584 ns 9750 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9729 ns 8666.5 ns 1.12
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 168737.5 ns 258995 ns 0.65
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 367733.5 ns 366948.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399000 ns 399292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215542 ns 215291 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215541 ns 215292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756208 ns 756083 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 110802 ns 113061 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 76450 ns 74731 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1398875 ns 1407958 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 858375 ns 860333 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 861479 ns 860854 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2355542 ns 2357500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 178308 ns 211180.5 ns 0.84
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 321323 ns 323393 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7354 ns 7125 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7042 ns 7542 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8666.5 ns 9000 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7563 ns 7250.5 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 114410.5 ns 143379.5 ns 0.80
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 65791 ns 66420 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13354.5 ns 15250 ns 0.88
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13542 ns 14959 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15667 ns 13687.5 ns 1.14
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14979 ns 12333.5 ns 1.21
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 689799.5 ns 942342 ns 0.73
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 423374 ns 425844 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25770.5 ns 24646 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25875 ns 28000 ns 0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 29083 ns 26666 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 27854 ns 28334 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 168075.5 ns 199235 ns 0.84
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114031 ns 114286.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 118417 ns 153084 ns 0.77
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 119041 ns 157166.5 ns 0.76
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 141458.5 ns 145958.5 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 155166 ns 153417 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 861211 ns 1075111 ns 0.80
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 582431 ns 585190.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74666 ns 76625 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75750 ns 76729 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 84875 ns 81229 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77084 ns 79750 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 169153 ns 206416.5 ns 0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 126942 ns 129541 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 278291 ns 307729 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 305021 ns 294250 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 305833 ns 290520.5 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 287270.5 ns 291458 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 972909 ns 1105738.5 ns 0.88
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 695847 ns 696697 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 16917 ns 16875 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17000 ns 16500 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 18354.5 ns 18375 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16458 ns 17584 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 113778 ns 145532.5 ns 0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 231482 ns 232517.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27604.5 ns 27125 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25875 ns 26750 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26958.5 ns 27208 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 28166.5 ns 26604 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 702837 ns 980431.5 ns 0.72
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 696858 ns 686517 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 10375 ns 11625 ns 0.89
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10875 ns 12250 ns 0.89
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13625 ns 13875 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10625 ns 10458 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 112473.5 ns 123683.5 ns 0.91
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 236187.5 ns 236852 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21583 ns 22709 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22396 ns 22063 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22250 ns 23083 ns 0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22041 ns 21833 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 556668 ns 703893 ns 0.79
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 670387 ns 673557 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 65542 ns 64250 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 64437.5 ns 69208 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66333 ns 65937.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66167 ns 63250 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 96734 ns 107264.5 ns 0.90
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232362 ns 232543 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 437459 ns 457334 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 479417 ns 450791 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 438167 ns 449333.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 498625 ns 488708 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 442769 ns 515904.5 ns 0.86
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 712032 ns 701456.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7562.5 ns 7333.5 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7625 ns 7750 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8125 ns 9208 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7250 ns 6979 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 113892.5 ns 144382.5 ns 0.79
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 69331 ns 65051 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14334 ns 14354.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14500 ns 15459 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16562 ns 15000 ns 1.10
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 11709 ns 15604 ns 0.75
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 675585.5 ns 949171 ns 0.71
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399579 ns 399874 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6158208 ns 6153958.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 3224959 ns 3225750 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 3225125 ns 3225687.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11921125 ns 11912750 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 347611.5 ns 350232.5 ns 0.99
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 322793 ns 320283 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19113166.5 ns 19165042 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 11081437.5 ns 11087125 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 11182250 ns 11132791 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36513062 ns 36531187.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1026355 ns 1015711 ns 1.01
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1162657.5 ns 1168797 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 958 ns 1000 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1041 ns 1000 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1000 ns 917 ns 1.09
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 20341 ns 23879 ns 0.85
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 206602 ns 206962 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3708 ns 3667 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3666 ns 3750 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 3709 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3709 ns 3667 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 243936 ns 284113 ns 0.86
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 622497 ns 623016 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8125 ns 8312.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8145.5 ns 8604.5 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10209 ns 10083 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7645.5 ns 8146 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 110001.5 ns 119881.5 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 64821 ns 71901 ns 0.90
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11417 ns 12166.5 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12146 ns 12145.5 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12625 ns 13313 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12083 ns 11395.5 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 533401.5 ns 642520 ns 0.83
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 351113 ns 357894 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 333 ns 291 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 20031 ns 22935 ns 0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 47010 ns 46631 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2875 ns 2917 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2917 ns 2917 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3125 ns 3167 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3042 ns 2958 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 139419 ns 206899.5 ns 0.67
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 160172 ns 161012 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11708 ns 12500 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11208 ns 11354 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12917 ns 13083 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11708 ns 10958.5 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 52993 ns 121271 ns 0.44
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 232812 ns 233822 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20666.5 ns 20291.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20208 ns 21083 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22458 ns 22187.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21187.5 ns 20104.5 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 249123.5 ns 597659.5 ns 0.42
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 648996.5 ns 638656 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4458 ns 4417 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4417 ns 4416 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 20585 ns 24156 ns 0.85
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 48820 ns 47331 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16375 ns 16167 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16250 ns 16375 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16458 ns 16333 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16208 ns 16333 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 169722 ns 333657 ns 0.51
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 209702 ns 207757 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1958 ns 2125 ns 0.92
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 1958 ns 2125 ns 0.92
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2084 ns 2084 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2042 ns 2041 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 28203 ns 36462 ns 0.77
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 202342 ns 202982 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17125 ns 17021 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 16791.5 ns 17625 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17542 ns 16667 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17209 ns 17083.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 147741 ns 296284 ns 0.50
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 682312 ns 684797 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59062 ns 59562.5 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 62416 ns 61667 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 61312.5 ns 61875 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 53875 ns 50958 ns 1.06
batchedmm(16, Bsize=512)/forward/GPU/CUDA 71192 ns 66679 ns 1.07
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 116711 ns 117392 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 202750.5 ns 190771 ns 1.06
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 98750 ns 149541 ns 0.66
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 118104 ns 116312.5 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 297958 ns 298166 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 170047 ns 219498 ns 0.77
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 616606 ns 614646 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 84208 ns 83166.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83646 ns 83395.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85166 ns 110041.5 ns 0.77
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 128334 ns 83020.5 ns 1.55
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 184384 ns 190710.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203702 ns 206032 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1889375 ns 1873645.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1916750 ns 1919416 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1919083 ns 1920792 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1899041 ns 1919291.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 379904 ns 533490 ns 0.71
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1068311 ns 1074210 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 18502 ns 21800 ns 0.85
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 41550.5 ns 43000 ns 0.97
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1750 ns 1792 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1791 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 145894.5 ns 256181.5 ns 0.57
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 181622 ns 182412 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8458 ns 8458 ns 1
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8937.5 ns 9958 ns 0.90
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11208.5 ns 11708 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8875 ns 7583 ns 1.17
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 51415 ns 119063.5 ns 0.43
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 232043 ns 234272.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9125 ns 9208 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8667 ns 9854 ns 0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10458.5 ns 9792 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9583 ns 8750 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 241818.5 ns 528065 ns 0.46
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 623402 ns 634101 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58604.5 ns 58208 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39333 ns 39375 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39792 ns 39959 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83417 ns 83291 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 32658 ns 39916.5 ns 0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79585.5 ns 79101 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1931459 ns 1906833 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1973750 ns 1969916.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1980958.5 ns 1979458 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1884875 ns 1901458 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 152863 ns 221725 ns 0.69
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1040311 ns 1161491.5 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 418333 ns 417125 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 418709 ns 420562.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 422000 ns 422103.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 418583.5 ns 417979 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 94366 ns 210226 ns 0.45
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 281763 ns 283213 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 673562.5 ns 680083.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 753812.5 ns 675125 ns 1.12
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 769958 ns 672375 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 751938 ns 672542 ns 1.12
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 470483 ns 1049720 ns 0.45
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 903129 ns 908698.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3419645.5 ns 3405187.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3437875 ns 3449917 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3451375 ns 3463646 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3429042 ns 3430687 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 140481 ns 170640 ns 0.82
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 441684 ns 450759.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6220250 ns 6244167 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6224937 ns 6219417 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6214292 ns 6254812 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6141041.5 ns 6201688 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 620637 ns 1001354 ns 0.62
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1629761.5 ns 1637156.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 474958 ns 474833 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 253000 ns 253792 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 253292 ns 253584 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 901709 ns 901250 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 43146 ns 47396 ns 0.91
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 241942.5 ns 241892 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2271000 ns 2269791 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1763792 ns 1760416 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1760167 ns 1763687.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3188958 ns 3197937.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 200260 ns 271388 ns 0.74
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 764328 ns 765898 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58125 ns 58541 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39334 ns 39292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39750 ns 39792 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83375 ns 84166 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 23268 ns 28606 ns 0.81
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 74721 ns 73921 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2035750 ns 2031396 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2088417 ns 2088958.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090333 ns 2084000 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1963541 ns 1977812.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 155158 ns 235137 ns 0.66
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1195637.5 ns 1110895.5 ns 1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58625 ns 58667 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39834 ns 39833 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 40083 ns 40000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83042 ns 83291 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41354 ns 49806.5 ns 0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77975.5 ns 76691 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1927125 ns 1930083.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1971541.5 ns 1967645.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1976833 ns 1961750 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1885312.5 ns 1797166 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 164726 ns 240260.5 ns 0.69
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1051246 ns 929734.5 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 416 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 26436 ns 35036 ns 0.75
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 46511 ns 46470 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7333 ns 7584 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 6875 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6917 ns 7458 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7834 ns 5916 ns 1.32
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 132779 ns 213960 ns 0.62
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 364088.5 ns 368994 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 30026 ns 33302 ns 0.90
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 40500 ns 36481 ns 1.11
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3250 ns 2959 ns 1.10
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2958 ns 3083 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3042 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2792 ns 2625 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 139460 ns 192793 ns 0.72
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 156362 ns 151232 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 453562 ns 420458.5 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 426854 ns 458333.5 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 424771 ns 443562.5 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 454396.5 ns 454625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128743 ns 138662 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 374513 ns 376564 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3812646 ns 3808250 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3818687.5 ns 3812458 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3824687.5 ns 3814333.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3809020.5 ns 3779687.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 467612 ns 712866 ns 0.66
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1414714 ns 1464519 ns 0.97
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49937813 ns 49902208 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 25988125 ns 26041000 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 26009646 ns 26000917 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97113375 ns 97099875 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1610536 ns 1600470 ns 1.01
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1049471 ns 1045150 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154792729.5 ns 154793291.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 89048958.5 ns 88667041.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 89207416 ns 89550541 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294786708.5 ns 294974291.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6494841 ns 6495543 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5562936 ns 5606170 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 18916.5 ns 18750 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 15584 ns 15666.5 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 14667 ns 14167 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15896 ns 15270.5 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 13971 ns 20352.5 ns 0.69
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 27630 ns 25851 ns 1.07
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 11291 ns 11041 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 7458.5 ns 7833 ns 0.95
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 7750 ns 7958 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17520.5 ns 17083 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 101782 ns 261162.5 ns 0.39
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 148192 ns 148401.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 9541.5 ns 8375 ns 1.14
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9125.5 ns 9083 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10333 ns 10583 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8542 ns 7916.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 53666.5 ns 113294.5 ns 0.47
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 235372 ns 234072 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9541 ns 10521 ns 0.91
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10209 ns 10416.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10458 ns 10042 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10250 ns 9666.5 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 269358 ns 615911 ns 0.44
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 652326 ns 655506 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9812.5 ns 9625 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9250 ns 9833 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10812.5 ns 12042 ns 0.90
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9562.5 ns 8479 ns 1.13
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 53391 ns 120314 ns 0.44
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71711 ns 71931 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 14333 ns 13083 ns 1.10
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 14083 ns 15021 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15167 ns 14542 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16625 ns 13417 ns 1.24
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 251184.5 ns 587303 ns 0.43
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 344093 ns 344908.5 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 458 ns 459 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 458 ns 583 ns 0.79
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 583 ns 459 ns 1.27
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 27208 ns 34757 ns 0.78
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 203792 ns 201632 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 7333.5 ns 1.18
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 9270.5 ns 0.88
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8604.5 ns 7833 ns 1.10
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8416.5 ns 7229.5 ns 1.16
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 147255 ns 231923.5 ns 0.63
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 656126 ns 657851 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16625 ns 15875 ns 1.05
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 14500 ns 14645.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 13354 ns 12167 ns 1.10
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10229 ns 10375 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 13896.5 ns 21214 ns 0.66
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 186472 ns 184672 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31750 ns 31375 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32000 ns 32416 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32042 ns 32270.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 31833 ns 31541 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 110682.5 ns 276539 ns 0.40
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 592116 ns 588126 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 450209 ns 444792 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 445500 ns 484417 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 444167 ns 448792 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 462958 ns 443250 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 188096.5 ns 194813 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367068.5 ns 367924 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3834209 ns 3843833 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3836666 ns 3831916.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3847459 ns 3838417 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3828250 ns 3835042 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 383846 ns 537386 ns 0.71
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1358354 ns 1358632 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 784152667 ns 784101083 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 416079687.5 ns 418358083 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 422584917 ns 418383604.5 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1509956229 ns 1504938187.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22771101.5 ns 22745060.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14743999 ns 14695345 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2524849666 ns 2524662875 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1511960000 ns 1518103167 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1536159417 ns 1524361625 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4778947333 ns 4741835375 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 119521542 ns 366822106 ns 0.33
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87915389 ns 88277685 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 78208.5 ns 76417 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 80271 ns 76792 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 82708 ns 80333 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77334 ns 77208 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 93705 ns 206105.5 ns 0.45
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 118801 ns 118901 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 291334 ns 191562.5 ns 1.52
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 210333 ns 287750 ns 0.73
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 261874.5 ns 209417 ns 1.25
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 202208.5 ns 253812.5 ns 0.80
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 458544 ns 1033097.5 ns 0.44
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 662017 ns 658411 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 200217604 ns 200015521 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 103846750 ns 103790000.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 104247042 ns 104076875 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389363833 ns 389226000 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5840254.5 ns 5819295 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3591326 ns 3575713 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 620550500 ns 621801500 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 352840416.5 ns 353125646 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 353679646 ns 354434874.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1181355417 ns 1181638875 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26562043 ns 26630294 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22008202.5 ns 22185623 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7167 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 5375 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5458 ns 5375 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 10500 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 20844 ns 27436 ns 0.76
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48671 ns 46631 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 245770.5 ns 212500 ns 1.16
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 243083 ns 220750 ns 1.10
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221208 ns 220458 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207979 ns 206104.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 137816.5 ns 220558 ns 0.62
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 523805 ns 523545 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8334 ns 10541.5 ns 0.79
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8166.5 ns 9541.5 ns 0.86
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11041 ns 10875 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9020.5 ns 8312 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 50777 ns 117824.5 ns 0.43
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 69381 ns 70451 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 7583.5 ns 1.17
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8583 ns 9792 ns 0.88
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8166 ns 8187.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10854.5 ns 7562.5 ns 1.44
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 245858 ns 515354.5 ns 0.48
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 312998.5 ns 318733 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 584 ns 459 ns 1.27
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 19411 ns 26054 ns 0.75
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 48630 ns 46610 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10333 ns 9083 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 11375 ns 9604 ns 1.18
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9770.5 ns 8958 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9708 ns 9166 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 120697 ns 252407.5 ns 0.48
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 388289 ns 388539 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 105500 ns 107458.5 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 85875 ns 84708 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 87000 ns 86000 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146333.5 ns 146750 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 16870 ns 23950.5 ns 0.70
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 190057 ns 191282 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 478500 ns 516625 ns 0.93
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 485458 ns 502312.5 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 481521 ns 478354.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 478833 ns 498167 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 117100 ns 232559 ns 0.50
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 608201.5 ns 606451 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5959 ns 5250 ns 1.14
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 6625 ns 6500 ns 1.02
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7479.5 ns 7749.5 ns 0.97
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 6229.5 ns 5687.5 ns 1.10
batchedmm(16, Bsize=32)/forward/GPU/CUDA 14736 ns 16126.5 ns 0.91
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 79970 ns 85781 ns 0.93
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 13500 ns 11625 ns 1.16
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 9750 ns 9917 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 10167 ns 10104.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17125 ns 16584 ns 1.03
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 109548 ns 215162.5 ns 0.51
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 366884 ns 378354 ns 0.97
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 40458 ns 38708 ns 1.05
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 50417 ns 51125 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51354 ns 52146 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13667 ns 14417 ns 0.95
batchedmm(16, Bsize=128)/forward/GPU/CUDA 20278.5 ns 19504 ns 1.04
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 85591 ns 93401 ns 0.92
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 37250 ns 36334 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 29541 ns 28167 ns 1.05
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 29875 ns 28625 ns 1.04
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57562.5 ns 56895.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 119274.5 ns 190765 ns 0.63
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 395964 ns 410848.5 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1833 ns 1666.5 ns 1.10
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1667 ns 2000 ns 0.83
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2291 ns 2167 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 2041.5 ns 1667 ns 1.22
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 13524 ns 20338 ns 0.66
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 32690 ns 32440 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2167 ns 2042 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2145.5 ns 2375 ns 0.90
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2395.5 ns 2417 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2312.5 ns 2083 ns 1.11
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 89460.5 ns 202489 ns 0.44
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 136351 ns 136411 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6104 ns 6750 ns 0.90
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4708.5 ns 4833 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6187.5 ns 5896 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5874.5 ns 4916.5 ns 1.19
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 58659.5 ns 142403 ns 0.41
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 67281 ns 69051 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9083.5 ns 8395.5 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9000 ns 8625 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8709 ns 8542 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8292 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 386636 ns 858082 ns 0.45
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 384884 ns 388048.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56916 ns 56834 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56833 ns 56916 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 56958 ns 56917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58291 ns 58291 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 29539 ns 37048 ns 0.80
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 203102.5 ns 204772 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 453791.5 ns 484583.5 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 466875 ns 475541.5 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465666.5 ns 465562.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 436208 ns 445666 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 167893 ns 263380 ns 0.64
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 823238 ns 819218 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3327646 ns 3332458 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1773958 ns 1767958 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 1770208 ns 1766125 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6318167 ns 6295583.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 203665 ns 206330 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 213597.5 ns 212392 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11522375 ns 11495438 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 6550792 ns 6565688 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 6579708.5 ns 6570438 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21256687.5 ns 21167562.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 761872 ns 737845 ns 1.03
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1057191 ns 1062630 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6667 ns 4833 ns 1.38
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4917 ns 5583 ns 0.88
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7000 ns 7333 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5166 ns 4500 ns 1.15
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 57961.5 ns 136011 ns 0.43
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56041 ns 56600 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 11458 ns 7125 ns 1.61
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8750 ns 7500 ns 1.17
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541 ns 7541.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8625 ns 7292 ns 1.18
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 382208 ns 746443 ns 0.51
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 361754 ns 370888 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 126917 ns 155000 ns 0.82
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 102541 ns 124709 ns 0.82
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 101792 ns 98541 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 98333 ns 98709 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127201 ns 150159 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 206327 ns 204262 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2039750.5 ns 2031188 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2028645.5 ns 2031500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2040937.5 ns 2037125 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1948458 ns 2033000 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 443232 ns 697162 ns 0.64
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1211817 ns 1208931 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33542 ns 33209 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 34416 ns 34833 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 34583 ns 33042 ns 1.05
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 625 ns 541 ns 1.16
batchedmm(2, Bsize=4)/forward/GPU/CUDA 13510 ns 15393 ns 0.88
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 79871 ns 79290 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3750 ns 2583 ns 1.45
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3209 ns 3083 ns 1.04
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3041 ns 3209 ns 0.95
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2333 ns 2125 ns 1.10
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 89708.5 ns 138753 ns 0.65
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 340203 ns 341213 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7209 ns 7250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 5416 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5417 ns 5416 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 10458 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 29375 ns 36086 ns 0.81
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49300 ns 49460 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222374.5 ns 213395.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221270.5 ns 227750 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221458 ns 220792 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206500 ns 205667 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 159760 ns 240787.5 ns 0.66
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 572920.5 ns 569246 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 18490 ns 21637 ns 0.85
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 43450 ns 42161 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14667 ns 14625 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14666 ns 14750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14709 ns 14667 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14708 ns 14625 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 165588 ns 307620 ns 0.54
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 197842 ns 192746.5 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 130708 ns 100834 ns 1.30
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 101313 ns 118500 ns 0.85
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 105000.5 ns 101833 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 106666.5 ns 102417 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125911 ns 136873 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 204662 ns 205777 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1925042 ns 1916625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1928041 ns 1916542 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1930583 ns 1926979 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1855291 ns 1898334 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 429902 ns 683667 ns 0.63
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1148786.5 ns 1215256.5 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18166 ns 19000 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18979 ns 19000 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22458 ns 22250 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18125 ns 16916 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 63187.5 ns 107183.5 ns 0.59
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79155.5 ns 78581 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 252792 ns 217813 ns 1.16
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 261875 ns 222833 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219958 ns 217417 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217125 ns 216770.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 279978 ns 512086.5 ns 0.55
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 475684 ns 476669.5 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24729.5 ns 24750 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 28125 ns 28937.5 ns 0.97
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 27000 ns 26875 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1375 ns 1083 ns 1.27
batchedmm(16, Bsize=4)/forward/GPU/CUDA 13843 ns 16054 ns 0.86
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 81051 ns 81581 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 5479.5 ns 4896.5 ns 1.12
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5167 ns 4917 ns 1.05
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5270.5 ns 5333 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4708 ns 4229 ns 1.11
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 110586.5 ns 206611 ns 0.54
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 379244 ns 377863 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 308792 ns 306208 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305625 ns 305084 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 307291 ns 309729.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 306834 ns 307625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102299 ns 224320 ns 0.46
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 272803 ns 274612 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 544417 ns 531959 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 575000 ns 543458 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 545958.5 ns 535333.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 538167 ns 542209 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 500049 ns 1058263 ns 0.47
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 849309 ns 853108 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22000 ns 22084 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21083 ns 21083 ns 1
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22042 ns 23542 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19667 ns 19459 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 64471.5 ns 112165.5 ns 0.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 78011 ns 78361 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 226000 ns 221750 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 245604 ns 217666.5 ns 1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215584 ns 224750 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212791 ns 222416 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 344357 ns 732048.5 ns 0.47
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 535535 ns 533125 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7542 ns 6958 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5791.5 ns 6958 ns 0.83
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8416 ns 9208 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7167 ns 6417 ns 1.12
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 63232 ns 137815 ns 0.46
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 65391 ns 65160 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13667 ns 9958 ns 1.37
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11916 ns 10792 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10125 ns 10541 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10041 ns 9875 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 396144.5 ns 815812 ns 0.49
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 386814 ns 385314 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6541.5 ns 4750 ns 1.38
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4666 ns 5208 ns 0.90
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6500 ns 6271 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7042 ns 5000 ns 1.41
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 64824 ns 141314 ns 0.46
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 68750 ns 66780 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8083 ns 7709 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8166 ns 7916 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 7875 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7583 ns 7959 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 423945 ns 775695 ns 0.55
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 394914 ns 388324 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14516708 ns 14550291 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 7713187.5 ns 7721375 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 7704854 ns 7712187.5 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27801334 ns 27857958 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 531151.5 ns 529799 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 393889 ns 389819 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46558771.5 ns 46686916.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 26529584 ns 26553583 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 26598312 ns 26597104.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85686792 ns 85700209 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 3208907 ns 2648481 ns 1.21
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3300533 ns 3297251 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 67833 ns 66125 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 65625 ns 68667 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69333.5 ns 70437.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67292 ns 66917 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 68650 ns 117160.5 ns 0.59
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232393 ns 233212 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 450333 ns 455375 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 453834 ns 452500 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 446417 ns 453833.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 441584 ns 441375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 394734 ns 721437 ns 0.55
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 788457.5 ns 786047 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 26112 ns 32085 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47140 ns 47371 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10542 ns 8667 ns 1.22
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9583 ns 9042 ns 1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9250 ns 10000 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10708 ns 8458 ns 1.27
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 152524.5 ns 282627 ns 0.54
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 373324 ns 375423.5 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9833 ns 9792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9792 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9833 ns 9792 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9833 ns 9833 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 20835 ns 22901 ns 0.91
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 208092 ns 208212 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 46333 ns 45625 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45833 ns 45958 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46000 ns 45875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45959 ns 45917 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 189222 ns 288260 ns 0.66
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 603691 ns 607426 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56334 ns 56625 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56375 ns 56833 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 56458 ns 56834 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57875 ns 58250 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 21828 ns 28250 ns 0.77
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 202032 ns 202042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 464834 ns 496854 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 474250.5 ns 504833 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465771 ns 482959 ns 0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 434770.5 ns 434145.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 162400 ns 242768 ns 0.67
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 877129 ns 877308 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 651104.5 ns 642729 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 683542 ns 659250 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 656292 ns 650437.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 616541.5 ns 609291.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 140209 ns 203473.5 ns 0.69
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 305778 ns 309673 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2262562.5 ns 2253979 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2231521 ns 2246042 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2245125 ns 2231375 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2244604.5 ns 2238292 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 644538 ns 956636.5 ns 0.67
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1307248 ns 1324473 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21625 ns 20292 ns 1.07
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20833 ns 23500 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23208 ns 24250 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20125 ns 19333 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 69407.5 ns 111824.5 ns 0.62
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 78811 ns 80571 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 233042 ns 271000 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 233125 ns 258000 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221333 ns 231875 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 224875 ns 221125 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 410361 ns 720921 ns 0.57
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 557581 ns 554706 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 18190 ns 22764 ns 0.80
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 47870 ns 47580 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9812.5 ns 9541 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 9625 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10042 ns 10208 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10000 ns 9333 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 136633 ns 264550 ns 0.52
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 397114 ns 398354 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8958 ns 10750 ns 0.83
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8438 ns 8875 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10750 ns 11125 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8084 ns 8917 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 64696.5 ns 117075.5 ns 0.55
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 71891 ns 69781 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7666 ns 7500 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7750 ns 0.94
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8417 ns 8083 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7708 ns 7750 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 292900 ns 498929 ns 0.59
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 318078 ns 322428 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1542 ns 1458 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1458 ns 1584 ns 0.92
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2208 ns 2000 ns 1.10
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1708 ns 1541 ns 1.11
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 13397 ns 20430 ns 0.66
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 188372 ns 188361 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3312.5 ns 3292 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3375 ns 3458 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3667 ns 3541 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3375 ns 3208 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 117821 ns 218522.5 ns 0.54
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 578906 ns 578345 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 147437.5 ns 148312.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 106312.5 ns 105937.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 107750 ns 108125 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 226021 ns 226084 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 16777 ns 23769 ns 0.71
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 40540 ns 40471 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 163417 ns 173291.5 ns 0.94
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 106833 ns 104500 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 98125 ns 105208 ns 0.93
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 251458 ns 287062 ns 0.88
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 141681 ns 215904 ns 0.66
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 266553 ns 268567 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7250 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5333 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 5416 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10209 ns 10416 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26669.5 ns 32778 ns 0.81
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48681 ns 48640 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 256208 ns 226583 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 258709 ns 229645.5 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231395.5 ns 238083 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 224896 ns 213229.5 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 185868.5 ns 258784 ns 0.72
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 589590.5 ns 595636 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 16125 ns 15375 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 14750 ns 15125 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 17000 ns 16959 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15375 ns 15083 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 76403.5 ns 137028 ns 0.56
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 230202 ns 230152 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24416 ns 23500 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23708 ns 24208 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23792 ns 24500 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23417 ns 24375 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 496390.5 ns 858623.5 ns 0.58
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 676296.5 ns 679476 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 10334 ns 9750 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9375 ns 10104.5 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11666.5 ns 11000 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9292 ns 9084 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 81566 ns 120301.5 ns 0.68
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 72771 ns 74161 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14333 ns 13875 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13666.5 ns 14646 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14729.5 ns 15000 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14750 ns 13958 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 412717 ns 655428 ns 0.63
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 362433 ns 366138.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8917 ns 10250 ns 0.87
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9750 ns 10625.5 ns 0.92
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11896 ns 11792 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9542 ns 9125 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 84716 ns 119866.5 ns 0.71
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 71721 ns 72421 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13250 ns 12208 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12521 ns 12791.5 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13542 ns 13084 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12875 ns 12875 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 346105 ns 541791 ns 0.64
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 338603.5 ns 341643 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 31041.5 ns 30750 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 32438 ns 32333 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 29625 ns 29792 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2167 ns 1625 ns 1.33
batchedmm(2, Bsize=128)/forward/GPU/CUDA 14504 ns 16024 ns 0.91
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 80601 ns 80551 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5250 ns 5042 ns 1.04
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 4750 ns 5458 ns 0.87
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5208 ns 5083 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6541 ns 6209 ns 1.05
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 107471 ns 139561 ns 0.77
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 370164 ns 368314 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 18911 ns 25032 ns 0.76
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 46920 ns 46980 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6542 ns 6167 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6292 ns 6666.5 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6958.5 ns 6958 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6708 ns 6125 ns 1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 135126 ns 184207 ns 0.73
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 386254 ns 388954 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 2000 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 1958 ns 2042 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2083 ns 2083 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2042 ns 1959 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 20048 ns 26042 ns 0.77
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 204122 ns 204582 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16937.5 ns 17083 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17042 ns 16875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17000 ns 16896 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15875 ns 16584 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 151188.5 ns 271146.5 ns 0.56
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 698796.5 ns 701017 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 150292 ns 147458 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 188375 ns 175562.5 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 152834 ns 153292 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 152750 ns 152541 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169794 ns 195620 ns 0.87
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 225092 ns 226692 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1328166 ns 1323500 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1339625 ns 1327791 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1339979 ns 1331125 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1321375 ns 1301042 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 738732.5 ns 891045 ns 0.83
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1067311 ns 1116140.5 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 26042 ns 25000 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25313 ns 24437.5 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28208 ns 28250 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25750 ns 25979.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 179072 ns 231362.5 ns 0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 113981 ns 115561 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 181083.5 ns 178562 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 169917 ns 126166 ns 1.35
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118875 ns 178437.5 ns 0.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 125563 ns 157500 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 736737.5 ns 1053949 ns 0.70
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 606996 ns 608216 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 334 ns 0.87
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 17782 ns 22518 ns 0.79
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 47020 ns 47580 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6917 ns 6416 ns 1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6500 ns 6834 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7270.5 ns 7020.5 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6958 ns 6417 ns 1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 149426 ns 200663 ns 0.74
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389994 ns 396354 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6209 ns 7062.5 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5708 ns 5874.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7666 ns 7791 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5958 ns 6791 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 100369 ns 142964.5 ns 0.70
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 231643 ns 231792 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10083 ns 10208.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9666.5 ns 10250 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 10500 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10125 ns 10333 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 656519 ns 887713 ns 0.74
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 676037 ns 669276 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 708 ns 667 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 20098 ns 22120 ns 0.91
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 205502 ns 205382 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4667 ns 4667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4584 ns 4833 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4875 ns 4833 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4709 ns 4584 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 183686.5 ns 224988.5 ns 0.82
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 577406 ns 575835.5 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8062 ns 8167 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8083 ns 8437 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10062 ns 9833 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7979.5 ns 7958 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 112521 ns 119167.5 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 75781 ns 74331 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 8416 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8750 ns 8938 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9459 ns 9625 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8959 ns 8458 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 542270 ns 578635 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 339298.5 ns 344473 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 126979.5 ns 126875 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 100291 ns 97229 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 97208 ns 97333.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 180729.5 ns 183291.5 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/CUDA 44342 ns 45455.5 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 101011 ns 101051 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 340250 ns 340292 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 192146 ns 182250 ns 1.05
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 167166 ns 191959 ns 0.87
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 573958.5 ns 612416.5 ns 0.94
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 199334 ns 191737 ns 1.04
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 515465 ns 516500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399208 ns 399042 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215250 ns 215417 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215625 ns 215333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756875 ns 756333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 40054 ns 43626 ns 0.92
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 80551 ns 81280 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1406459 ns 1398374.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 862312 ns 864000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 864000 ns 864270.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2359542 ns 2358708.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 234952 ns 253991.5 ns 0.93
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 353324 ns 350903.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 659917 ns 653500 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 658270.5 ns 655916 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 624271 ns 653041.5 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 677791.5 ns 622146 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196665.5 ns 201217.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 305543 ns 306973 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2481875 ns 2461125.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2467479.5 ns 2469625 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2476313 ns 2481375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2446833 ns 2480333 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 984615.5 ns 998464.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1399689 ns 1392463.5 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 34062.5 ns 32521 ns 1.05
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 34666.5 ns 34291 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 32791.5 ns 33084 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 958 ns 833 ns 1.15
batchedmm(2, Bsize=32)/forward/GPU/CUDA 14044 ns 15542.5 ns 0.90
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 84401 ns 78871 ns 1.07
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3166.5 ns 3000 ns 1.06
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3166 ns 3417 ns 0.93
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3500 ns 3500 ns 1
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3250 ns 3042 ns 1.07
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 121484 ns 141700 ns 0.86
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 362074 ns 337663 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 406584 ns 408916 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 402458 ns 403770.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 403000 ns 404375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 420645.5 ns 423959 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36583 ns 43511.5 ns 0.84
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238852 ns 237932 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3879583 ns 3878166.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3983541.5 ns 3999042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3998250 ns 4003416 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3674250 ns 3792395.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 237279.5 ns 245738 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1428125 ns 1432279 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 32312 ns 34288 ns 0.94
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 38101 ns 37921 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15459 ns 15459 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15459 ns 15666 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15666 ns 15666 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15458 ns 15458 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 242437.5 ns 258924 ns 0.94
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 167902 ns 173651.5 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404458 ns 404583 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 221625 ns 220833 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 221375 ns 221125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760125 ns 760833 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 117928 ns 113269 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 87841 ns 87641 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1429417 ns 1424020.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 887583 ns 888041.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 887396 ns 888875 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2378208 ns 2382770.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 192870.5 ns 245573 ns 0.79
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 353053 ns 354303 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 458 ns 583 ns 0.79
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 583 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 19335 ns 25789 ns 0.75
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 205012 ns 204972 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7459 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7667 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8166 ns 7958 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8167 ns 7250 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 165885 ns 217010.5 ns 0.76
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 683986 ns 692821.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 832729.5 ns 832771 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 467000 ns 467416 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 469250 ns 470562.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1575625 ns 1544541 ns 1.02
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129567 ns 129883 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 227872 ns 229272 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2691958.5 ns 2692000 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1537333 ns 1540000 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1540083.5 ns 1542312.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4938125 ns 4931479 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 274489 ns 248014 ns 1.11
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 806443 ns 809797.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 334 ns 291 ns 1.15
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 25740 ns 32644 ns 0.79
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 47540 ns 47000 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6625 ns 6208 ns 1.07
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6125 ns 6562.5 ns 0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6791 ns 6916 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6667 ns 6333 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 180518.5 ns 226410 ns 0.80
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 359293 ns 357804 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2375375 ns 2407917 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2422500 ns 2401417 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2407959 ns 2386750 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2370375 ns 2392333 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 178233.5 ns 200791 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 374734 ns 374543.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4668709 ns 4663875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4652084 ns 4666063 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4665083.5 ns 4675291 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4600917 ns 4670208 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 872920 ns 902618 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1382244 ns 1376633 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 9437.5 ns 6875 ns 1.37
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7833 ns 7542 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7292 ns 7250 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6834 ns 6917 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 16361 ns 23477 ns 0.70
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 39440 ns 39221 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 74520.5 ns 32313 ns 2.31
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 49250 ns 49125 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 51729 ns 49583 ns 1.04
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 49083.5 ns 52291.5 ns 0.94
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 212837 ns 219072.5 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 266233 ns 262272 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 22250 ns 21666.5 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25000 ns 24541.5 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 21854.5 ns 22416.5 ns 0.97
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5375 ns 5166 ns 1.04
batchedmm(2, Bsize=512)/forward/GPU/CUDA 15953 ns 18191 ns 0.88
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 83861 ns 82841 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11834 ns 11979 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 9187.5 ns 9645.5 ns 0.95
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 9520.5 ns 9541.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 18354.5 ns 18062.5 ns 1.02
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 203711.5 ns 231197.5 ns 0.88
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 388864 ns 365714 ns 1.06
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406375 ns 406041 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 223500 ns 223459 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 223792 ns 223375 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762958 ns 762584 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 43379 ns 46689.5 ns 0.93
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89781 ns 87501 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1427542 ns 1427542 ns 1
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 892959 ns 894125 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 892958 ns 896417 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2385625 ns 2384229 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 239711 ns 287677.5 ns 0.83
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 376923.5 ns 377703 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 434375 ns 434334 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 430000 ns 430229.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 430417 ns 430333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448375 ns 447583 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 46179 ns 55000 ns 0.84
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 235662 ns 233247 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3912500 ns 3915625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4004000 ns 4018146 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4025375.5 ns 4025959 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3768792 ns 3782667 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 251012 ns 265792.5 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1368994 ns 1207206.5 ns 1.13
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8750 ns 8750 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 6875 ns 6875 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 6917 ns 6875 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12458 ns 12416 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 20602 ns 24680 ns 0.83
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 209952 ns 210232 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 44958 ns 44583 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45083 ns 44959 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45250 ns 44875 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 44750 ns 44667 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 314279 ns 349913 ns 0.90
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 653907 ns 651936 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 115896 ns 119750.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 125812.5 ns 123750 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 126604.5 ns 89667 ns 1.41
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89000 ns 81771 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 186375.5 ns 189502 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 219802 ns 218452 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2026583 ns 2022125 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2025000 ns 2026083 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2024729.5 ns 2027729 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2026520.5 ns 2023895.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 566645 ns 540867 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1084851 ns 1089800 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.