Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: zero out shadows
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 15, 2024
1 parent 5eda749 commit 7ba127a
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/impl/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ function EnzymeRules.augmented_primal(cfg, ::EnzymeCore.Const{typeof(matmuladd!)
!(C isa EnzymeCore.Const) ? copy(B.val) : nothing

if !(C isa EnzymeCore.DuplicatedNoNeed || C isa EnzymeCore.BatchDuplicatedNoNeed)
matmuladd!(C.val, A.val, B.val, bias.val)
matmuladd!(C.val, opmode.val, A.val, B.val, bias.val)
end

return EnzymeRules.AugmentedReturn(nothing, nothing, (A_cache, B_cache))
Expand Down
3 changes: 3 additions & 0 deletions test/common_ops/dense_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ end

(act === identity && hasbias) || continue

dweight .= 0
dx .= 0
db .= 0
Enzyme.autodiff(Reverse, matmuladd!, Duplicated(y, copy(dy)),
Duplicated(weight, dweight), Duplicated(x, dx), b_enz)

Expand Down

3 comments on commit 7ba127a

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/115248

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here, just re-invoke the command and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.2.2 -m "<description of version>" 7ba127aaa8d5eed172dca63133232e097d1d5f21
git push origin v1.2.2

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 7ba127a Previous: 987fce9 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4667 ns 6083 ns 0.77
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6666.5 ns 6250 ns 1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7500 ns 8104 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 5333 ns 1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 117321 ns 127763 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2723919 ns 2680722 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 3008750 ns 817500 ns 3.68
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 404195 ns 410844 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9896 ns 9771 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9833 ns 9958 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9979 ns 9834 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9958.5 ns 9958 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 533872 ns 539870 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18512917 ns 18273784 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2324292 ns 2523292 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 674968 ns 669947 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1437.5 ns 2812.5 ns 0.51
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 2875 ns 1416 ns 2.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 2083 ns 1584 ns 1.32
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1437.5 ns 1333 ns 1.08
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21479 ns 21455 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1282166 ns 1323661 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 190209 ns 216625 ns 0.88
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 29540 ns 28950 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4250 ns 4458 ns 0.95
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4167 ns 3375 ns 1.23
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4145.5 ns 4167 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4375 ns 4000 ns 1.09
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 144438.5 ns 142970.5 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 9108147.5 ns 10240879 ns 0.89
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1604875 ns 1524333 ns 1.05
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 145092 ns 149491.5 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 55875 ns 57833 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39209 ns 40417 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46625 ns 46375 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84167 ns 83000 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36824 ns 36725 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 542002 ns 558408 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1333104 ns 1040458 ns 1.28
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 81391 ns 81776 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2024917 ns 2036667 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2079125 ns 2086500 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2081625 ns 2090375 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1993125 ns 1993667 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 226688 ns 226490 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7623752 ns 7533597 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7427958 ns 8034167 ns 0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1252074 ns 986919 ns 1.27
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 174750 ns 146666 ns 1.19
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 164541.5 ns 151000 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 148812.5 ns 151062.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144375 ns 194750 ns 0.74
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165480 ns 166182 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7680925 ns 7689190 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1457521 ns 1596770.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 204852 ns 209312 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1117250 ns 1113896 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1109375.5 ns 1120062.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1113334 ns 1119104 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1112187.5 ns 1106542 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 694582 ns 695636.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33705507.5 ns 34400023 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6238375 ns 7210396 ns 0.87
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1026961 ns 1024730 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4417 ns 5291 ns 0.83
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5041 ns 4916 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5208 ns 6125 ns 0.85
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4583 ns 4375 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 93299.5 ns 91792.5 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5368327 ns 5267805 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 634041.5 ns 474000 ns 1.34
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 69460 ns 67381 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8375 ns 8750 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8917 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8833 ns 8792 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8833 ns 8687.5 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 604485 ns 600359 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 36365543 ns 36489972 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5669937.5 ns 5930125 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388374 ns 390114 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17000 ns 17562.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17709 ns 17979 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18021 ns 20812.5 ns 0.87
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16895.5 ns 17750 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 66654.5 ns 66076.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 2923981.5 ns 3263389.5 ns 0.90
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 477833 ns 1274334 ns 0.37
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 78451 ns 76030 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216834 ns 212792 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219896 ns 213000 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 225583.5 ns 218292 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217625 ns 254395.5 ns 0.86
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 356473 ns 351925 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 14201022 ns 15484392 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5644395.5 ns 5673084 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 465005 ns 468334.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 625 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 750 ns 708 ns 1.06
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 812.5 ns 770.5 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 666 ns 0.94
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20462 ns 20050 ns 1.02
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1162134.5 ns 1150135.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 302625 ns 295625 ns 1.02
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32870 ns 32420 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1417 ns 1459 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1458 ns 1520.5 ns 0.96
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1417 ns 1459 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1416 ns 1500 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 125127 ns 122512.5 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8831211 ns 8913698.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1526500 ns 1644687.5 ns 0.93
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 136521 ns 135591 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7334 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5416 ns 5417 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6042 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10666 ns 10250 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23625 ns 23888.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1207481 ns 1207370.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 356458 ns 446750 ns 0.80
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48881 ns 47420 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 226166 ns 236834 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 265333 ns 241875 ns 1.10
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 234854 ns 269875 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219500 ns 257687.5 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 192027 ns 191906.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 31211143.5 ns 32212683 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9046313 ns 8558250.5 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 649247 ns 645121 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4083 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4083 ns 4083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4084 ns 4042 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23477 ns 23307 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2001417 ns 2000762.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 214833 ns 223875 ns 0.96
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 47261 ns 48080 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 17083 ns 16792 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17000 ns 16625 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16833 ns 16792 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 17334 ns 16917 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 195303 ns 191629 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 14536946 ns 10282963 ns 1.41
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 918208 ns 937125 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 174652 ns 176282 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 508750 ns 509292 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 330583 ns 332354.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 404666 ns 404834 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 864791 ns 865333 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113620 ns 113483 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 401393 ns 392476 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 490979 ns 487333 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 242133 ns 240773 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2313834 ns 2308770.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1747479 ns 1756875 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2035208 ns 2033625 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3272708.5 ns 3270500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 241207 ns 237569 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 10021457.5 ns 11006777.5 ns 0.91
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 2011770.5 ns 2028666.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 743443 ns 739942 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4708.5 ns 6062.5 ns 0.78
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7625 ns 6584 ns 1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7708 ns 8208.5 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5479.5 ns 6875 ns 0.80
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 92351.5 ns 91839.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5442998 ns 5704966 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 783479 ns 776250 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 65411 ns 65360 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10333.5 ns 11041.5 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11875 ns 11875 ns 1
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11750 ns 11125 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12062.5 ns 12187.5 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 634956 ns 637048 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 40400531.5 ns 37465688 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5457291.5 ns 5651896.5 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 409979.5 ns 408644 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23181 ns 22899 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2216579 ns 1980954 ns 1.12
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 332584 ns 214375 ns 1.55
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 47221 ns 49101 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2166 ns 2084 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2083 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2084 ns 2208 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 215755 ns 228216 ns 0.95
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11357397.5 ns 11133138.5 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1978417 ns 2019750 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 172626.5 ns 180086.5 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8937.5 ns 9083 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9729.5 ns 8500 ns 1.14
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9459 ns 10833.5 ns 0.87
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8958 ns 8542 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 96639 ns 108383.5 ns 0.89
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3207607 ns 3207332 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 876000 ns 816208 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 71941 ns 74171 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18521 ns 16875 ns 1.10
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 19104.5 ns 18792 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17625 ns 18250 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18812.5 ns 17812.5 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 554001 ns 615805 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 16517942.5 ns 16767446 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5180916.5 ns 5170312.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 378539 ns 383838.5 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 666 ns 666 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 35213 ns 35553 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1186873 ns 1192710.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 466396 ns 293146 ns 1.59
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 46270 ns 46141 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9312.5 ns 8541.5 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9916.5 ns 8541 ns 1.16
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9167 ns 9958.5 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9458.5 ns 9458.5 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 267136 ns 264293 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18948901 ns 18241947 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4572250 ns 5274687.5 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 367694 ns 366223 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 395333 ns 396958 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 214416 ns 215500 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288292 ns 287792 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756291 ns 755333 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111882 ns 110939.5 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 329474.5 ns 326929 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 300208.5 ns 365521 ns 0.82
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 77331 ns 74351 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1453791.5 ns 1446854 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 852583 ns 859125 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1132645.5 ns 1132854 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440625 ns 2436292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 207032 ns 204467 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10204120 ns 8967194.5 ns 1.14
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1668041.5 ns 1574375 ns 1.06
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 324428.5 ns 321063 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7041.5 ns 7187.5 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7750 ns 7270.5 ns 1.07
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9396 ns 8541.5 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7791.5 ns 6979.5 ns 1.12
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 144806.5 ns 145872 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5813106.5 ns 5766375 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 437250 ns 448125 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 66071 ns 65611 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13083 ns 14770.5 ns 0.89
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14479 ns 16916.5 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15709 ns 15687.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15354.5 ns 15562.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 956377 ns 956937.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42729213 ns 42931711 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5700250 ns 6186333 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 428955 ns 421904 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24000 ns 25292 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24875 ns 25292 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 29292 ns 28583.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 27667 ns 30125 ns 0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 199144 ns 198270.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7744284 ns 7924119 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 999584 ns 654625 ns 1.53
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 116931 ns 113131 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103583 ns 157000 ns 0.66
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 152687 ns 118479 ns 1.29
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 153583 ns 118792 ns 1.29
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 151000 ns 145083.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1075746 ns 1072793 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43042130 ns 41512479 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5733792 ns 5879750 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 590946.5 ns 587055 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75000 ns 76417 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 77084 ns 74917 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 86333.5 ns 80458 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74875 ns 82834 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 205585 ns 204563.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8027595.5 ns 7289524 ns 1.10
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 519187.5 ns 532021 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 127562 ns 126591 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 293542 ns 263209 ns 1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 308750 ns 316562 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 315187.5 ns 248479.5 ns 1.27
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 304208 ns 210125 ns 1.45
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1108118 ns 1111658.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 40422383 ns 39831914 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6276458 ns 6266000 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 695017 ns 691997 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 15875 ns 16771 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17521 ns 16791.5 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 18500 ns 17542 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16958 ns 16750 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 146489 ns 144759.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5586208 ns 5606829 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 723083.5 ns 474208 ns 1.52
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 232683 ns 232022 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26667 ns 26895.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26687.5 ns 25167 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28208.5 ns 27333 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27708.5 ns 24167 ns 1.15
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 982068.5 ns 972458 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 40344043 ns 41939896 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5743229 ns 6295958 ns 0.91
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 686807.5 ns 695306.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11083 ns 11209 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12042 ns 11333.5 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12334 ns 12416.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10791 ns 11042 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 124134 ns 122668.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3473152 ns 3386989.5 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 880000 ns 858500 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 234213 ns 233942 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21958 ns 21584 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22729.5 ns 22563 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21895.5 ns 22583 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22000 ns 21291 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 701831.5 ns 697229 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 21157140 ns 21507216 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5204750 ns 5485375 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 674667 ns 669687 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 63437.5 ns 63104 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 65521 ns 66479 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66750 ns 66584 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 63042 ns 64208.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 106345.5 ns 105012.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3373870 ns 3348443 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 480667 ns 1297624.5 ns 0.37
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 233433 ns 232172 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 437896 ns 440625 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 456000 ns 448937.5 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 450542 ns 440917 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 444000 ns 438250 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 515188 ns 511759 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 21597008 ns 20624860 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6095791.5 ns 5921625 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 717017.5 ns 713498 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6792 ns 7521 ns 0.90
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8000 ns 8084 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8583.5 ns 8667 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6917 ns 7750 ns 0.89
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 146052.5 ns 143457 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5510181.5 ns 5597779 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 726500 ns 446771 ns 1.63
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 65301 ns 64960 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14292 ns 14875 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15292 ns 15709 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14084 ns 16542 ns 0.85
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16209 ns 15541.5 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 947670 ns 938762 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 39845105 ns 39706040 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5499875 ns 5775541 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399764 ns 398045 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6131500 ns 6154854 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 3224875 ns 3224917 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6379229.5 ns 6376292 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11911084 ns 11902583 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 349856 ns 347379 ns 1.01
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 303248 ns 297978.5 ns 1.02
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19059708.5 ns 19104063 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 11090437.5 ns 11143020.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 20005646 ns 19964417 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36446770.5 ns 36518125 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1081781.5 ns 1020967.5 ns 1.06
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1153782 ns 1158972 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1000 ns 1000 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 958 ns 1000 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 917 ns 958 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23071 ns 22897 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2085318 ns 2091957.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 332541.5 ns 232500 ns 1.43
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 207622 ns 206842 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3667 ns 3708 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3750 ns 3709 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3708 ns 3792 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3667 ns 3667 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 281551.5 ns 277378 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 12095727 ns 11186074 ns 1.08
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2129583 ns 2130584 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 626307 ns 626357 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8042 ns 7750 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8145.5 ns 7937.5 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9042 ns 9771 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7937.5 ns 7437.5 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 121104 ns 119515 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3679976 ns 3487658 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 802541.5 ns 816562.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 65471 ns 65701 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 13125 ns 11208 ns 1.17
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12875 ns 13416.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11417 ns 12834 ns 0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12708 ns 11584 ns 1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 638151 ns 631148 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22685670 ns 21438278 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 4390333 ns 5005375 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 355644 ns 354774 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22337 ns 22106 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2195388.5 ns 2144977 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 207833 ns 226937 ns 0.92
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 47401 ns 46510 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3042 ns 2875 ns 1.06
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3375 ns 3000 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2916 ns 2917 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3333 ns 2958 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 204047 ns 199810.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 14763707.5 ns 9182273 ns 1.61
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1611395.5 ns 1664167 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 157641.5 ns 161676.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10250 ns 11625 ns 0.88
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12167 ns 11979 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12187.5 ns 13333 ns 0.91
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10604 ns 11604.5 ns 0.91
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 121713.5 ns 120755 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3281210 ns 3560641 ns 0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 904791.5 ns 1031500 ns 0.88
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 233512.5 ns 233163 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21104.5 ns 20687.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 22583 ns 20583 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21083 ns 23000 ns 0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21708 ns 20541.5 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 595173 ns 590597 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20531194.5 ns 20721086 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4095583 ns 4786083 ns 0.86
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 638246.5 ns 646557 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4417 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4417 ns 4417 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24193.5 ns 23934 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2211530 ns 2235095.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 215041 ns 221479.5 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 47690 ns 47181 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16292 ns 16667 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16291 ns 16541 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16667 ns 16709 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16416 ns 16708 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 330020.5 ns 326329 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12280627 ns 12543391.5 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1639709 ns 1672458 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 206457.5 ns 204152 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1917 ns 2084 ns 0.92
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2167 ns 2125 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2084 ns 2083 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2084 ns 1958 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35891 ns 35852 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1213015 ns 1224950 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 474917 ns 293583 ns 1.62
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 204052 ns 203142 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 19687.5 ns 18208 ns 1.08
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 17187.5 ns 17187.5 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17750 ns 18041.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16667 ns 17021 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 293976.5 ns 291174 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21212198 ns 21237766 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4767354.5 ns 5676396 ns 0.84
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 686777 ns 684357.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 55771 ns 60208.5 ns 0.93
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 62792 ns 62042 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 65604.5 ns 65750 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51333 ns 51250 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66418 ns 66352.5 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 114241 ns 112971 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 202896 ns 188541.5 ns 1.08
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 135104 ns 140250.5 ns 0.96
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 130083 ns 124249.5 ns 1.05
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 245666 ns 220125 ns 1.12
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 215296 ns 213978 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 607861 ns 616297 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 79709 ns 84479 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 107104 ns 83666.5 ns 1.28
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85167 ns 86167 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 124166.5 ns 125666 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192861 ns 193270.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5531381 ns 5699293.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1816084 ns 1963979.5 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203512 ns 204042 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1869895.5 ns 1887292 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1901084 ns 1916521 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1917666.5 ns 1912333 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1889333 ns 1806250 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 531825 ns 528167 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32650285 ns 24408984.5 ns 1.34
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8859584 ns 9102667 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 925670 ns 1064601.5 ns 0.87
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 291 ns 250 ns 1.16
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21389 ns 21230 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2065883 ns 2190815.5 ns 0.94
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 336229.5 ns 367541.5 ns 0.91
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 42770.5 ns 41291 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1834 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 253832 ns 249025 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 10417238 ns 10051558 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1009479 ns 1526271 ns 0.66
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 184376.5 ns 182202 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8000 ns 8583 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10042 ns 9542 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10375 ns 10604 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8167 ns 8125 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119090.5 ns 117788.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3309191 ns 3476276 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 876708 ns 921312.5 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 232622 ns 232182 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9083 ns 9000 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10625 ns 8958 ns 1.19
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9542 ns 11292 ns 0.85
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10125 ns 9145.5 ns 1.11
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 527209 ns 518629.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22247571 ns 19406043 ns 1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 3949187.5 ns 4477584 ns 0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 624237 ns 626986 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 56166 ns 57458 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 38916 ns 39875 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46125 ns 46750 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83958 ns 82583 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40233 ns 39259 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1343252 ns 1309251 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1123667 ns 1121542 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 76266 ns 74341 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1923750 ns 1867542 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1952750.5 ns 1978791 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1982854 ns 1977229 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1850708.5 ns 1853979.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 221906.5 ns 219172 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33376877 ns 32964288 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11408021 ns 11253292 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1191052 ns 1160142 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 416333 ns 419229.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 421645.5 ns 435958 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 421208.5 ns 420208 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 417667 ns 417291.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 208798 ns 208124 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7659621 ns 8033766 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 518208 ns 539333.5 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 282883 ns 280723 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 747916.5 ns 718729.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671583 ns 670917 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 673562.5 ns 681646 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 748021 ns 671125 ns 1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1048327.5 ns 1045689 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45569778.5 ns 44612818 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6335208.5 ns 6579583 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 914290 ns 909619.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3428937.5 ns 3431646 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3384709 ns 3418041.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3435000 ns 3459666 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3417875 ns 3424604 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 175238.5 ns 172982 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8069034 ns 8225049 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1424083 ns 1418875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 426124 ns 438875 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6191270.5 ns 6211958.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6170041 ns 6239125 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6167416.5 ns 6228166.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6190792 ns 6164812.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 994959 ns 989377 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50094330 ns 49957898 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7413750 ns 7609083 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1549811 ns 1545101 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 470666 ns 470459 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 252458 ns 254333 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 342417 ns 342000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 901125 ns 901833 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46139 ns 45850.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 884569 ns 874511 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 368208 ns 485291 ns 0.76
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 243602 ns 241413 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2334750 ns 2331458 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1752562 ns 1762250 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2041187.5 ns 2040791.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3280124.5 ns 3281083 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 255952 ns 263882 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12850913 ns 13135947 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2244770.5 ns 2243500 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 770018 ns 765467.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 55708 ns 57083 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39041 ns 38854.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46020.5 ns 46125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84125 ns 82875 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28321 ns 28162 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1407008 ns 1368315 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1106875 ns 1138958 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 76505.5 ns 74570.5 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2029708 ns 2033792 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2082292 ns 2094125 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090958 ns 2089041.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1949604 ns 2003042 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 232547 ns 231932 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35887652 ns 35712411 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11649979 ns 11300791.5 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1052311 ns 1044461 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 55833 ns 57500 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39083.5 ns 39917 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46375 ns 46500 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84042 ns 82625 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 49287 ns 48905 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 790006.5 ns 744836.5 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1049084 ns 1117520.5 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 69820 ns 64946 ns 1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919458 ns 1922750 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1955416.5 ns 1974334 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1946334 ns 1956833.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1890750 ns 1889708 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 239685 ns 239067 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17609091 ns 16476478 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9788042 ns 9755374.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 918859 ns 916609 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34717 ns 35081.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1181143 ns 1290014 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 263500 ns 287438 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 46211 ns 45840 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6333 ns 6541 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7500 ns 6687.5 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6583 ns 7000 ns 0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7000 ns 6500 ns 1.08
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 208392.5 ns 205115.5 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20162243 ns 20319441.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4479667 ns 5303083 ns 0.84
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 365124 ns 367174 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32562 ns 31894 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1251080 ns 1192240 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 258000 ns 254292 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 37000 ns 36310 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2750 ns 3334 ns 0.82
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3625 ns 2958 ns 1.23
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2709 ns 3167 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2917 ns 2958 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 189309.5 ns 185317.5 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7798739 ns 7518628 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 905666.5 ns 1115709 ns 0.81
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 151136.5 ns 149472 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 467667 ns 422083 ns 1.11
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 444750 ns 423833 ns 1.05
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 425999.5 ns 427834 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 421833.5 ns 424937.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 137895 ns 137292 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5774821 ns 5779699.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2386500 ns 2076458 ns 1.15
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367024 ns 366143.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3802521 ns 3813229.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3765917 ns 3824249.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3811417 ns 3788084 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3799541.5 ns 3812042 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 709425 ns 705310 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33554230 ns 31262641 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10457896 ns 10824937.5 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1471404 ns 1464005 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49735229.5 ns 49892959 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 25984959 ns 26011834 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35560875 ns 35523145.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96902041.5 ns 97645833 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1616773 ns 1616287 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1045271 ns 1048102 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 153907333 ns 154680021 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 89247291.5 ns 88850291.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112379750 ns 112398500 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294166500 ns 298306271 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6515848 ns 6498761 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5562255.5 ns 5545318 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 14521 ns 19937.5 ns 0.73
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 14958 ns 15167 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 16833 ns 17041.5 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 14854.5 ns 14792 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 20539.5 ns 20017 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1114507 ns 1149888 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 206959 ns 229541 ns 0.90
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 26060 ns 27001 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10625 ns 10417 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 7771 ns 7250 ns 1.07
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9208 ns 9104 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17437.5 ns 17375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 260548 ns 257217 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 9528073.5 ns 9674368 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1587125 ns 1641396 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 149326.5 ns 147861 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7958 ns 8063 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9292 ns 9125 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9500 ns 10667 ns 0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7958.5 ns 8917 ns 0.89
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 116273.5 ns 114750.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3476228 ns 3651219 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 810375 ns 861125 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 233683 ns 233283 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9208.5 ns 9792 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10645.5 ns 10750 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10208 ns 10917 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10375 ns 10271 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 619508.5 ns 614307 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22906068.5 ns 28192305 ns 0.81
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 4432792 ns 5310750 ns 0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 654786 ns 649747 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8291.5 ns 9708 ns 0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10459 ns 10000 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10042 ns 11541 ns 0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9250 ns 9584 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 120531 ns 119206 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3436472 ns 3481764 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 901792 ns 937459 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71071 ns 72050 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13250 ns 17479.5 ns 0.76
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 16042 ns 14375 ns 1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17208 ns 15125 ns 1.14
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 15167 ns 14667 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 592138 ns 586931 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 18951458.5 ns 19607421 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4027062.5 ns 4735125 ns 0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 345753 ns 343533 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 500 ns 584 ns 0.86
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 541 ns 459 ns 1.18
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34521 ns 34228 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1191899 ns 1215476 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 371562.5 ns 314188 ns 1.18
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 206352 ns 203452 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7062.5 ns 9334 ns 0.76
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8333.5 ns 8604.5 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 9041 ns 0.95
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8000 ns 8250 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 233771 ns 230655 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 23357164 ns 22072831 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4885833 ns 5460541 ns 0.89
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 662116 ns 654892 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 12292 ns 17375 ns 0.71
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 13229 ns 14792 ns 0.89
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 15125 ns 16000 ns 0.95
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10167 ns 10458 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 22042 ns 21718 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1119591.5 ns 1102903 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 189125 ns 208666 ns 0.91
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 189132 ns 184622 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31875 ns 31542 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32333.5 ns 32000 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32291.5 ns 32208 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 32000 ns 32354.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 276327 ns 271707 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12201192 ns 10769694 ns 1.13
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1697542 ns 1820875 ns 0.93
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 595015.5 ns 588176 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 480875 ns 452584 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 441083 ns 441979.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 450250 ns 467167 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 490979 ns 438521 ns 1.12
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194024 ns 194827 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5766516 ns 5920885 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2629708 ns 1997667 ns 1.32
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 368063.5 ns 368184 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3822958 ns 3829250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3807354 ns 3838292 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3827834 ns 3802021 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3826167 ns 3830584 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 544349 ns 544632 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 29050298 ns 28778535 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9196542 ns 9720812.5 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1359983 ns 1358284 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 838219667 ns 831986833 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 415052604.5 ns 416264500 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 543102500 ns 543217708 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1525021500 ns 1509789750 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22764607.5 ns 22539644.5 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14772276 ns 14678121 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 3570164958 ns 3779013833 ns 0.94
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1502049709 ns 1885743917 ns 0.80
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 2269221042 ns 1788587042 ns 1.27
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4773617583 ns 4810183875 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 369302709 ns 364565745 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87924411 ns 88375525 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 79646 ns 75520.5 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 78895.5 ns 76416.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78667 ns 79958.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77583 ns 78625 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 207237 ns 207155.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7871351 ns 7714255 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 520375 ns 534709 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 107601 ns 106301.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 250834 ns 235667 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 294583.5 ns 283229.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 285708.5 ns 247208 ns 1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222333.5 ns 210874.5 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1049109.5 ns 1048818 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43337417.5 ns 44375934 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6122958 ns 6248084 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 640576 ns 631246 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199656458.5 ns 199488333 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 103769666.5 ns 103922541.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139342042 ns 139224666 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388182208 ns 393811292 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5838796 ns 5835255 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3577840.5 ns 3578582 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 616451521 ns 620321291.5 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 351188291.5 ns 354710917 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 439680896 ns 440219958 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1178137125 ns 1185414250 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26651952 ns 26495134 ns 1.01
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22092888 ns 22065145 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7417 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 5417 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6084 ns 6292 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10167 ns 10145.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27714.5 ns 27466 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1202781 ns 1213453.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 351458 ns 432833 ns 0.81
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48481 ns 47620 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218291.5 ns 213000 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222250 ns 223041 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221209 ns 220917 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213708.5 ns 206896 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 222292 ns 223324 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 31765824 ns 31525343 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9125125 ns 9133958 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 529665 ns 524095 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7271 ns 8854.5 ns 0.82
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9541.5 ns 9312.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9791 ns 10583 ns 0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8187.5 ns 9625 ns 0.85
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 117715.5 ns 116401 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3188633 ns 3333892 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 885458 ns 911750 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 69700 ns 69370 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7479 ns 7437.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10479.5 ns 8854 ns 1.18
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10875 ns 7959 ns 1.37
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8875 ns 9145.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 519786.5 ns 515224 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18597573.5 ns 18606821 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 3961208 ns 4708917 ns 0.84
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 316073 ns 318334 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 750 ns 709 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26338 ns 25690 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1200694 ns 1183861 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 488604.5 ns 493792 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 46820 ns 46791 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9291 ns 9000 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10416 ns 10791.5 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9208.5 ns 9854.5 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 11583 ns 10042 ns 1.15
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 253612 ns 251338.5 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 25803867.5 ns 23713128.5 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5171833.5 ns 6062250 ns 0.85
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 388624 ns 386044 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 104834 ns 107354.5 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 84834 ns 84667 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 99500 ns 100375 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146333 ns 146729.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24613 ns 24618 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1194962 ns 1206806.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 246062.5 ns 266292 ns 0.92
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 192062 ns 190862 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 526854 ns 478500 ns 1.10
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 478875 ns 492271 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 500416.5 ns 481000 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 478958.5 ns 479145.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 232619 ns 230580 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11733131 ns 11914566 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 1709625 ns 2188458.5 ns 0.78
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 610896 ns 605276 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5125 ns 6042 ns 0.85
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 7167 ns 7000 ns 1.02
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6791 ns 7583 ns 0.90
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4042 ns 6000 ns 0.67
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16580 ns 16947 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 79701 ns 79345.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11708 ns 12062.5 ns 0.97
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11584 ns 10542 ns 1.10
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 10792 ns 10917 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17687.5 ns 18208 ns 0.97
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 214143.5 ns 212062.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 366964 ns 367674 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 35792 ns 39750 ns 0.90
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 50791 ns 50708 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51833.5 ns 52625 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13542 ns 13750 ns 0.98
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21568 ns 19888.5 ns 1.08
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 87241 ns 87991 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 38979.5 ns 36500 ns 1.07
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 30708 ns 28959 ns 1.06
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 30416 ns 31500 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 58458 ns 58583 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 192010 ns 190552 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 395119 ns 413955 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1729.5 ns 1750 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1937.5 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2146 ns 2125 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1709 ns 1792 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 20594 ns 20369 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1163029.5 ns 1137759 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 326833 ns 312000 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 33120 ns 32711 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2250 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2333 ns 2396 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2250 ns 2333 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2042 ns 2250 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 204587 ns 201543.5 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9292587 ns 9195441 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1518500 ns 1575208 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 136826.5 ns 136711 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4417 ns 4562.5 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5250 ns 4708.5 ns 1.12
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6375.5 ns 6834 ns 0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4041.5 ns 5125 ns 0.79
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 145077 ns 144149.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5424296 ns 5753580 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 725208 ns 707854 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 69471 ns 69031 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8041 ns 8167 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8958 ns 9250 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8416 ns 8667 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9208 ns 9209 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 875812.5 ns 867994 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 40742928.5 ns 37396018.5 ns 1.09
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5580917 ns 5747500 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389804 ns 386354 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56792 ns 56917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56875 ns 56875 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57584 ns 57833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58375 ns 58125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37054 ns 37109 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1234596.5 ns 1131214.5 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 336000 ns 421167 ns 0.80
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 203242 ns 203222.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 485813 ns 451020.5 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 499958.5 ns 475979 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 468208 ns 465354 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 438854.5 ns 487041.5 ns 0.90
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 268055 ns 264507 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27322975 ns 28501147 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8122166.5 ns 7943604 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 832729 ns 830424 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3291250 ns 3311000 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1764708 ns 1770250 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2339021 ns 2337729.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6260292 ns 6302417 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204625 ns 204131.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 209992 ns 211992 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11332208 ns 11485250 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 6550833 ns 6571812.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8325250 ns 8309250 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 20937125 ns 21151875.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 734916 ns 735481 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1048155.5 ns 1057071 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4291 ns 5125 ns 0.84
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5875 ns 5375 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6583 ns 7125 ns 0.92
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4896 ns 6208.5 ns 0.79
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 137991.5 ns 137212.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5581467 ns 5624260 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 785625 ns 793500 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56390 ns 56010 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7042 ns 7000 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10562.5 ns 7500 ns 1.41
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7104.5 ns 7458 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7833 ns 9083 ns 0.86
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 754679 ns 754137 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 34960226 ns 34576213 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5245042 ns 5244167 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 371414 ns 366813 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 127625 ns 103250 ns 1.24
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 95624.5 ns 103875 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 100000 ns 125291 ns 0.80
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 95708 ns 101042 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 152137 ns 151348 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5871279.5 ns 6050689.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2635166.5 ns 2052375 ns 1.28
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203242 ns 203192 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017959 ns 2018375 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2027771 ns 2029000 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2021167 ns 2023521 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1987167 ns 1991417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 703925.5 ns 703391 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31965494 ns 31442085 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11055292 ns 11046312.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1255893 ns 1250762 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 29375 ns 34667 ns 0.85
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 34500 ns 34750 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 35250 ns 35041.5 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 583 ns 646 ns 0.90
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15622 ns 15242 ns 1.02
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 80130 ns 79571 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2542 ns 2729.5 ns 0.93
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3125 ns 2917 ns 1.07
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2834 ns 3000 ns 0.94
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 3000 ns 2208 ns 1.36
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 141408 ns 139866 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 343344 ns 342158.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7167 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5375 ns 5417 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6084 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10209 ns 10042 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36671 ns 36552 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1208337 ns 1221281.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 331459 ns 674708 ns 0.49
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48221 ns 48261 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217479 ns 213624.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229625 ns 221166.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 225000 ns 220812.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212875 ns 205833 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 244929 ns 243393.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26091309.5 ns 25870086.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7984187.5 ns 7741583 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 574266 ns 575566 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3959 ns 3958 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21419 ns 21563 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2118188.5 ns 2027782.5 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 234583 ns 250542 ns 0.94
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42620 ns 43640 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14791 ns 14917 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14750 ns 14791 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14875 ns 14958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14833 ns 14917 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 311492 ns 306375 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 10906139 ns 11210297 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 982000 ns 1037625 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 192231.5 ns 194327 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 140834 ns 105583 ns 1.33
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 127417 ns 106167 ns 1.20
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 105167 ns 124875 ns 0.84
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 141000 ns 102583 ns 1.37
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 152595 ns 139877 ns 1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6050834 ns 5810927 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2057334 ns 2048416 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 213297 ns 208802 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1917833 ns 1878500 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1898875 ns 1927583.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1922083 ns 1867521 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1898854 ns 1917937.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 692137 ns 684487.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31139112 ns 30087516 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10436541 ns 10640458 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1217872 ns 1063341 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18250 ns 17583 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18625 ns 19500 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20750 ns 20708 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17749.5 ns 18791 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 110137 ns 109550 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3282416 ns 3331480 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 480541.5 ns 1318708 ns 0.36
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79421 ns 80701 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 252041.5 ns 216271 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 217541.5 ns 222292 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219687.5 ns 217916 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222729.5 ns 216167 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 519298 ns 516519 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20051825.5 ns 19724665.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6194812.5 ns 6017791.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 478425 ns 477585 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 23291.5 ns 26583 ns 0.88
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 28583 ns 28770.5 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 28792 ns 29104 ns 0.99
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1229.5 ns 1334 ns 0.92
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16210 ns 15984 ns 1.01
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 82241 ns 81921 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4292 ns 4833.5 ns 0.89
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4729 ns 4833 ns 0.98
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5042 ns 5208.5 ns 0.97
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 5771 ns 4333 ns 1.33
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 207444.5 ns 206128 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 378084 ns 379654 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 305417 ns 305792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 306250 ns 306042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 308084 ns 306833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 305750 ns 307083 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 228609 ns 227988.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7545946 ns 7778230 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 604584 ns 1241125 ns 0.49
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 273963 ns 272793 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 532917 ns 535708 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 538167 ns 533084 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 539125 ns 538208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 572709 ns 530917 ns 1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1074383 ns 1080430 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44755027.5 ns 42644591.5 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6115208.5 ns 6182083 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 858603.5 ns 851073.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19291 ns 19125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20708 ns 20624.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22375.5 ns 21458 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19875 ns 20000 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 114907 ns 112864 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3614583 ns 3473281 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 593916 ns 1444854 ns 0.41
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79421 ns 80611 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215708 ns 220167 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220584 ns 222791.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213625 ns 214771 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215875 ns 212625 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 762395 ns 737028 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 25444001 ns 25214419 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7232562.5 ns 7109375 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 542290.5 ns 531685 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6125 ns 5916 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7083 ns 7083 ns 1
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7917 ns 8604.5 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6208 ns 6500 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 140165.5 ns 140088 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5168559 ns 5562789 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 799291 ns 803937.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 65270 ns 64661 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9542 ns 10000 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10333.5 ns 10937.5 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10375 ns 10750 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11145.5 ns 10041 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 826456 ns 822803 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 37337383 ns 36817844 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5311708 ns 5484583 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 387474 ns 382033 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4875 ns 4334 ns 1.12
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6917 ns 5291 ns 1.31
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7250 ns 7333 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4812.5 ns 5584 ns 0.86
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 144262 ns 142901.5 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5426091.5 ns 5758977.5 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 808375 ns 800458 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 66621 ns 66271 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7458 ns 7208 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8083 ns 7646 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7541.5 ns 7750 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7833 ns 7583 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 783702 ns 782456.5 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 37497088 ns 39501262 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5566229 ns 6034250 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 395004 ns 392794 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14350584 ns 14539375 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 7693688 ns 7723291.5 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10127042 ns 10145625 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27615959 ns 27763416 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 548306 ns 554910 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 393134 ns 393434 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 45943208 ns 46429208.5 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 26437417 ns 26609416 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33454833 ns 33517458 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 84782667 ns 85405667 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2657066 ns 2664805 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3290613 ns 3291838.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 66375 ns 66292 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 68584 ns 67875 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69333.5 ns 68250 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 65979 ns 65917 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 121920.5 ns 119249 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3593431.5 ns 3647654 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 508166 ns 1440312.5 ns 0.35
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 229397.5 ns 232702 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 446833 ns 441250 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 452437.5 ns 441625 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 446375 ns 447167 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 445834 ns 441478.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 728139 ns 727144.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26912797 ns 26208342 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7552104 ns 7477375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 790108 ns 793922.5 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 666 ns 584 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 667 ns 583 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32311 ns 31836 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1198752.5 ns 1180672 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 473500 ns 286667 ns 1.65
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47340 ns 47841 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8666 ns 9458 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9208 ns 9271 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 8458 ns 9750 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17104 ns 9416 ns 1.82
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 286358 ns 283587 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20778583 ns 22547365 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4681395.5 ns 5502666.5 ns 0.85
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 375004 ns 374188.5 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9875 ns 9792 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9875 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9792 ns 9875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9833 ns 9875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23012 ns 22851 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2014844 ns 2120178 ns 0.95
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 215645.5 ns 221333 ns 0.97
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 205762 ns 207772 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45958 ns 46167 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 46042 ns 46083 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46041 ns 46417 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 46250 ns 46062.5 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 290878 ns 287950 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 9152947 ns 12273456 ns 0.75
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 942542 ns 1033833.5 ns 0.91
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 607695 ns 600566 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56250 ns 56167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56458 ns 56875 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57083 ns 57166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57709 ns 57875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28552 ns 28495 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1253508.5 ns 1157087.5 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 663666.5 ns 660125 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 203541.5 ns 202572 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 448583 ns 448229 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 465562 ns 464979 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465458.5 ns 472292 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 454041.5 ns 474437.5 ns 0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 245887 ns 244496.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 33424426 ns 33157318.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9545520.5 ns 9248750 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 887779 ns 888349 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 645812.5 ns 614125 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 575959 ns 648750 ns 0.89
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 640542 ns 652521 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 646271 ns 642542 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 208584 ns 208606.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8406939 ns 7841403 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1406395.5 ns 1401250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 315503 ns 305493 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2214979 ns 2245937.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2211999.5 ns 2247291 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2220812.5 ns 2238062.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2227958 ns 2241541 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 978439 ns 971988 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47363900 ns 48958299 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10481646 ns 7597458.5 ns 1.38
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1213952 ns 1213901.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18625 ns 19333 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20729 ns 21646 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21583 ns 21833 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18875 ns 24291 ns 0.78
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 113850.5 ns 111706.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3565557.5 ns 3500994.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 497958 ns 1437895.5 ns 0.35
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79731 ns 79141 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227375 ns 219459 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 259417 ns 219791.5 ns 1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 225541 ns 222104.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227084 ns 219875 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 729838 ns 728212.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26163617 ns 26675294 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7560500 ns 7278312 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 554315 ns 555140 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 584 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 541 ns 667 ns 0.81
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23274 ns 22972 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1191789 ns 1186538 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 484250 ns 461542 ns 1.05
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 48040 ns 49541 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9083 ns 9750 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10437.5 ns 9333.5 ns 1.12
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9541 ns 9896 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9500 ns 10000 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 268183 ns 265448 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24685731.5 ns 24827341.5 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5000875 ns 6076333 ns 0.82
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 398234 ns 415154 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7250 ns 7917 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9187.5 ns 10208 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9645.5 ns 10542 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8041 ns 9292 ns 0.87
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 118921.5 ns 118520 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3382327 ns 3378687 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 886791.5 ns 891583 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 71801 ns 75371 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7604 ns 7291.5 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8125 ns 7875 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7500 ns 7833.5 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7562.5 ns 7708 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 507494 ns 503824 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17189656.5 ns 17507211 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 3782375 ns 4534375 ns 0.83
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 320313 ns 318933 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1500 ns 1437.5 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1708.5 ns 1667 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1791 ns 1917 ns 0.93
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1375 ns 1417 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 21598 ns 21272 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1189888 ns 1191094 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 313375 ns 307229 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 190932 ns 189132 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3541 ns 3292 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3583 ns 3333 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3458 ns 3500 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3292 ns 3500 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 218452 ns 216668.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9603283 ns 10523301.5 ns 0.91
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1797375 ns 1655750 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 583116 ns 579466 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148104.5 ns 148229.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 106833 ns 106166.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 128562.5 ns 129250 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225000 ns 225167 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 23975 ns 23640 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1165725 ns 1169047 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 254292 ns 281229 ns 0.90
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 41470 ns 40580 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 157645.5 ns 143125 ns 1.10
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 87625 ns 87375 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 112000 ns 112875.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 250708.5 ns 250792 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 218220.5 ns 214898 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10460438 ns 10468792 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 1096666 ns 2056708 ns 0.53
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 269773 ns 266232 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7208 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5333 ns 5375 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6083 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10458 ns 10000 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32755 ns 33010 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1178842 ns 1218913 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 330458 ns 357271 ns 0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50720 ns 50911 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 253104 ns 227938 ns 1.11
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229041.5 ns 228354.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 234187.5 ns 235708 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227938 ns 249729 ns 0.91
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 263186.5 ns 263220 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27448206 ns 28851277 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8237750 ns 8089625 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 594190.5 ns 591956 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 13792 ns 15375 ns 0.90
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15166 ns 14917 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16499.5 ns 16834 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14667 ns 15583 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 139540 ns 138290 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5436668.5 ns 5390404 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 786729 ns 805167 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 232963 ns 231372.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23000 ns 23333 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23937.5 ns 23438 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23875 ns 24459 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23979.5 ns 23666 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 870094.5 ns 863635.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 40010466.5 ns 39146915 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5595708 ns 5702250 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 679366 ns 683727 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8750 ns 8875 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10312.5 ns 10041.5 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11271 ns 11750 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9584 ns 9917 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 123388.5 ns 122685 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3563169 ns 3570923 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 858292 ns 917271 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 74460 ns 75270 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13375 ns 14166 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14458.5 ns 14458.5 ns 1
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13958 ns 14979.5 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13625 ns 13542 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 667308 ns 660959 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21257602 ns 21424061 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 4997708 ns 5279979 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 365743 ns 365744 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8583 ns 8417 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10333 ns 10146 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10312.5 ns 12125 ns 0.85
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9166 ns 9792 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 121770.5 ns 121433.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3365145.5 ns 3352559.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 906625 ns 952146 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 75170 ns 72460 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12292 ns 13166 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13437.5 ns 12938 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12916 ns 13125 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12458 ns 12916 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 553718.5 ns 548948 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 18868109 ns 18645332 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 3865125.5 ns 4735063 ns 0.82
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 341293 ns 340583 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 26354.5 ns 31125.5 ns 0.85
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 30645.5 ns 31520.5 ns 0.97
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 31541 ns 32333.5 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1833 ns 1834 ns 1.00
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16183 ns 16210 ns 1.00
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 81001 ns 80860 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5209 ns 5229.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5021 ns 4959 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5417 ns 5250 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6604 ns 6334 ns 1.04
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 140577.5 ns 138594 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 370423.5 ns 388224 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 291 ns 0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 291 ns 334 ns 0.87
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 25697 ns 25350 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1197018 ns 1199368 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 465667 ns 478250.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 47180 ns 49490 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6125 ns 6292 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6729 ns 6750 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6333 ns 6792 ns 0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6312.5 ns 6584 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 187721.5 ns 186417 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 23736279.5 ns 23013025 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4952833.5 ns 5920458 ns 0.84
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 386429 ns 393209 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 1959 ns 1958 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2042 ns 2042 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2000 ns 2083 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 1959 ns 2000 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 26463 ns 25999.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1170027.5 ns 1183440.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 479625 ns 314229 ns 1.53
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 206252 ns 206522 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16250 ns 16583.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16666 ns 15958 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16208.5 ns 16854 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16417 ns 16791.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 276067 ns 272947 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24921263 ns 25132475.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5326083 ns 6200500 ns 0.86
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 700836 ns 699897 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 173875 ns 158000 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 148750 ns 152895.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 155708 ns 179875 ns 0.87
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 147458 ns 175625 ns 0.84
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 203847 ns 205507.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8347024.5 ns 8109426 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1561917 ns 1459854.5 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 232482 ns 213437 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1328917 ns 1279667 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1311771 ns 1336958 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1320791 ns 1276333 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1322500 ns 1332729.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 909940.5 ns 907688 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 44667022 ns 46524861.5 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7124333 ns 6921834 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 995559.5 ns 1109576 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22958 ns 25937.5 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 26833 ns 25750 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27625 ns 27437.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24667 ns 24042 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 234608.5 ns 236630 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7924652 ns 7924614 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 576541 ns 1195645.5 ns 0.48
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 116011 ns 112891.5 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 118166.5 ns 117812.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 122375 ns 125958 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 158041.5 ns 130667 ns 1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 123833.5 ns 132625 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1073695 ns 1078111.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44153968 ns 48454865.5 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6127166 ns 6291354 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 612925 ns 604836 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 250 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 250 ns 334 ns 0.75
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23160 ns 22703 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1212472 ns 1228350.5 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 478542 ns 303875 ns 1.57
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 47471 ns 47155.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6291 ns 6333 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6833.5 ns 6937.5 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6458 ns 6750 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6584 ns 6687.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 204382.5 ns 201918.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 24496787 ns 24022047 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5334937.5 ns 6154291 ns 0.87
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388703 ns 390799 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5208 ns 5584 ns 0.93
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7021 ns 6729 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7458 ns 7834 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5667 ns 6333 ns 0.89
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 145933.5 ns 144556.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5745568 ns 5802837 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 753959 ns 465083.5 ns 1.62
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 234802 ns 231623 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9583 ns 9875 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10375 ns 10500 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10250 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10042 ns 10084 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 903827 ns 898422 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 42297357 ns 41540865 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5826479 ns 5925625 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 668457 ns 667721.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 709 ns 625 ns 1.13
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 625 ns 625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 667 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22371 ns 22281 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2015786 ns 2048848.5 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 208416 ns 228500 ns 0.91
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 207552 ns 205022 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4584 ns 4625 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4833 ns 4625 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4666 ns 4791 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4584 ns 4584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 228749 ns 224113.5 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10461831 ns 11648202 ns 0.90
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1654416.5 ns 1667208 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 580735 ns 578966 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7750 ns 8604.5 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9166.5 ns 9500 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8834 ns 10125 ns 0.87
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8291 ns 8125 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 121959 ns 121216 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3411255 ns 3493631.5 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 827916 ns 797562.5 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 74011 ns 73391 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 8166.5 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9041.5 ns 9020.5 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583.5 ns 9292 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8834 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 591884.5 ns 585686 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20708574.5 ns 21659888 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4264875 ns 5138604.5 ns 0.83
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 342784 ns 345673 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 122750 ns 128166 ns 0.96
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 96459 ns 95895.5 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130187.5 ns 130416 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 180875 ns 193500 ns 0.93
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45830 ns 45829 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 101721 ns 100941 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 328000 ns 335583 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 166666 ns 167167 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 347541.5 ns 354375 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 608646 ns 609249.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 192063 ns 190876 ns 1.01
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 505519.5 ns 517555 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 395916 ns 397541 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 214250 ns 215333 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288167 ns 288458 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756500 ns 756458 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43676.5 ns 43687 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1411321 ns 1356444.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 429792 ns 420167 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 82131 ns 80321 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1458834 ns 1457000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 857583 ns 862125 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1134333 ns 1134520.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2441958.5 ns 2444500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 249859 ns 251807.5 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 10370982 ns 10565821 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1909646 ns 1852750 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 352903 ns 350374 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 616500 ns 683334 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 598250 ns 650583 ns 0.92
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 648916.5 ns 641791.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 642667 ns 653250 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 200586.5 ns 202465 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7794534 ns 8364163.5 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1363291 ns 1384458 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 313733 ns 302773 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2445375 ns 2447209 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2426917 ns 2468625 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2441500 ns 2446166.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2440750 ns 2452188 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 994961 ns 992979 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50766350 ns 51629265.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9661291 ns 9882875 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1307388 ns 1311863 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 28521 ns 34667 ns 0.82
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 34625 ns 34291.5 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 33916.5 ns 35521 ns 0.95
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 875 ns 875 ns 1
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15425.5 ns 15660 ns 0.99
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 79381 ns 78941 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3062.5 ns 3125 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3416 ns 3458.5 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3208 ns 3312.5 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3209 ns 3084 ns 1.04
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 139741 ns 137070.5 ns 1.02
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 338953 ns 338254 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 404500 ns 406166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 402125 ns 404458 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 408334 ns 408458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 422458 ns 420458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 43145 ns 42995 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1417291 ns 1466063 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1128750.5 ns 1144125 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 239562 ns 238192 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3863292 ns 3877875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3971625 ns 3990896 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3996791 ns 3992562.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3757979.5 ns 3778146 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 242826 ns 240990 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38623864 ns 36589646 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11673750 ns 11933709 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1433229 ns 1433854 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3959 ns 3916 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3916 ns 3917 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33968 ns 33931 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1232483 ns 1232713.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 167334 ns 183709 ns 0.91
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 38620 ns 38031 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15666 ns 15708 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15750 ns 15750 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15625 ns 15958 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15625 ns 15750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 255128 ns 252887 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8717525 ns 9179273 ns 0.95
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 843520.5 ns 893625 ns 0.94
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 169816.5 ns 172862 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 402625 ns 404417 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 220209 ns 221125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295959 ns 296500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760791.5 ns 761125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113239 ns 112867 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1047524 ns 1050270.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 348895.5 ns 406792 ns 0.86
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89300.5 ns 87471 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1474958.5 ns 1471292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 881146 ns 884000 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1159083.5 ns 1160146 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2461917 ns 2466083.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 241292 ns 238614 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 9318727.5 ns 9255273 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1946459 ns 1932833 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 354883 ns 350549 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 25844 ns 25487 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1200537.5 ns 1217335.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 496709 ns 387333 ns 1.28
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 209382 ns 206202 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7375 ns 7375 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8104.5 ns 8020.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7500 ns 7916 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7375 ns 7542 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 217033.5 ns 209854.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25754399 ns 25469136 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5254333.5 ns 6294375 ns 0.83
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 685977 ns 684857 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 825125.5 ns 833124.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 468584 ns 467292 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 621500 ns 621750 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1536542 ns 1543666 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130845.5 ns 130036 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 229862 ns 230222 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2661979 ns 2684437.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1535250.5 ns 1538583 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2000792 ns 2002583 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4906416 ns 4933354 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 242304 ns 243369 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 841449 ns 836303.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 291 ns 334 ns 0.87
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32216 ns 31581 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1218492 ns 1181114.5 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 464375 ns 425666.5 ns 1.09
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 47630 ns 49050 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6125 ns 6291 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6708.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6500 ns 6667 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 224154.5 ns 222549 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21407773 ns 20723673 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4615291 ns 5408500 ns 0.85
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 357793.5 ns 364253.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2392708 ns 2412916 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2371959 ns 2399708 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2404416 ns 2391250 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2370084 ns 2406375 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 200035.5 ns 201130.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7868335 ns 8039466.5 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1597041.5 ns 1500813 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 373933 ns 371169 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4648292 ns 4645417 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4644250 ns 4666145.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4636708 ns 4648375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4642750 ns 4646334 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 891890 ns 899895.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46027858 ns 47712828 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6938541.5 ns 6893375 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1391633 ns 1384804 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7187.5 ns 7083 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7542 ns 7000 ns 1.08
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7125 ns 7750 ns 0.92
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6875 ns 6792 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 23289 ns 23107 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1167669 ns 1160499 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 243458.5 ns 282458 ns 0.86
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 39800 ns 40431 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 46396.5 ns 48667 ns 0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 32917 ns 57125 ns 0.58
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 45875.5 ns 51042 ns 0.90
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 67312 ns 33354.5 ns 2.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 214725 ns 215404 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10485830 ns 10709204 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 1121562 ns 2066833 ns 0.54
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 269102.5 ns 264313 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 19604.5 ns 22854 ns 0.86
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 24021 ns 24375.5 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 23750 ns 24917 ns 0.95
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5084 ns 5209 ns 0.98
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17227 ns 16790 ns 1.03
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 83741 ns 89191 ns 0.94
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11916 ns 12250 ns 0.97
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 9354.5 ns 9375 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10417 ns 10604.5 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17958 ns 18083 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 225890 ns 225960 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 371753 ns 387419 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404000 ns 406584 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 222584 ns 223292 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296875 ns 297000 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762667 ns 762667 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46288 ns 45879 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1401617.5 ns 1417981 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 358375 ns 424354.5 ns 0.84
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89491 ns 89741 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1480896 ns 1486000.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 888250 ns 892208.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1164959 ns 1169500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2465417 ns 2471625 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 288016 ns 279157 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 12678894 ns 13109750 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2117375 ns 2047333 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 381744 ns 376633 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 432125 ns 433500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 430333 ns 430292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 436917 ns 436292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448604.5 ns 446958 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54122.5 ns 54004 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1002212 ns 1003277 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1059021 ns 1090562.5 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 234952 ns 236733 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3895042 ns 3866292 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4004458 ns 4019812.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4030291.5 ns 4022583.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3789979 ns 3812208.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 260055 ns 261348.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 30675954 ns 32496173.5 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10349458.5 ns 10504750 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1223712 ns 1365148 ns 0.90
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8750 ns 8708 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 6917 ns 6958 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7583 ns 7667 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12416 ns 12417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 23553.5 ns 23411 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2134096 ns 2120051 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 214667 ns 229334 ns 0.94
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 211142 ns 208012 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 44958 ns 45583 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45083 ns 45291 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45000 ns 45416 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 44958 ns 45042 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 344550 ns 345424.5 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 14001329.5 ns 13588599 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1862458 ns 1751750 ns 1.06
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 659011.5 ns 653876 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 122729 ns 113812.5 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83521 ns 90020.5 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 87354.5 ns 88625 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 105375 ns 81000 ns 1.30
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190055 ns 190227.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5969481 ns 6167893 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1972791.5 ns 2705500 ns 0.73
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 214447 ns 221462 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2012458.5 ns 1871229 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1980000 ns 2028479 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2023917 ns 2015645.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2011645.5 ns 2020395.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 529776 ns 534895 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 29142428 ns 28188330 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9305500.5 ns 9724208 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1088680 ns 1078565.5 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.