Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: enzyme reverse bias needs a check on Const
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 16, 2024
1 parent 7ba127a commit 0df09fa
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.2.2"
version = "1.2.3"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
10 changes: 5 additions & 5 deletions src/impl/activation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,17 +196,17 @@ for (f, dfdx) in [
(:tanh_fast, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω)))))
#! format: on
]
@eval CRC.@scalar_rule($f(x), $dfdx)
@eval CRC.@scalar_rule($f(x), $(dfdx))

∇f = Symbol(:∇broadcasted_, f)
@eval function CRC.rrule(::typeof(Broadcast.broadcasted), ::typeof($f),
x::Union{Numeric, Broadcast.Broadcasted})
Ω = $f.(x)
function $∇f(dΩ)
∂x = CRC.InplaceableThunk(dx -> @.(dx+=dΩ * $dfdx), CRC.@thunk @.(dΩ*$dfdx))
Ω = $(f).(x)
function $(∇f)(dΩ)
∂x = CRC.InplaceableThunk(dx -> @.(dx+=dΩ * $(dfdx)), CRC.@thunk @.(dΩ*$(dfdx)))
return CRC.NoTangent(), CRC.NoTangent(), ∂x
end
return Ω, $∇f
return Ω, $(∇f)
end
end

Expand Down
4 changes: 2 additions & 2 deletions src/impl/batched_mul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ for func in (NNlib.batched_mul!, batched_matmul_loopvec_impl!)
end

dCs = C.dval
dAs = (typeof(A) <: EnzymeCore.Const) ? dCs : A.dval
dBs = (typeof(B) <: EnzymeCore.Const) ? dCs : B.dval
dAs = A isa EnzymeCore.Const ? dCs : A.dval
dBs = B isa EnzymeCore.Const ? dCs : B.dval

if EnzymeRules.width(cfg) == 1
dCs = (dCs,)
Expand Down
6 changes: 3 additions & 3 deletions src/impl/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,9 @@ function EnzymeRules.reverse(cfg, ::EnzymeCore.Const{typeof(matmuladd!)},
end

∂Cs = C.dval
∂As = (typeof(A) <: EnzymeCore.Const) ? ∂Cs : A.dval
∂Bs = (typeof(B) <: EnzymeCore.Const) ? ∂Cs : B.dval
∂bs = bias.dval
∂As = A isa EnzymeCore.Const ? ∂Cs : A.dval
∂Bs = B isa EnzymeCore.Const ? ∂Cs : B.dval
∂bs = bias isa EnzymeCore.Const ? ∂Cs : bias.dval

if EnzymeRules.width(cfg) == 1
∂Cs = (∂Cs,)
Expand Down

3 comments on commit 0df09fa

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/115299

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.2.3 -m "<description of version>" 0df09fa8137f05c8958f96352c4672f577d9f346
git push origin v1.2.3

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 0df09fa Previous: 7ba127a Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6938 ns 4667 ns 1.49
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7438 ns 6666.5 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7541 ns 7500 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 5750 ns 1
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 133931 ns 117321 ns 1.14
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2868757 ns 2723919 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 741167 ns 3008750 ns 0.25
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 407074 ns 404195 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9916.5 ns 9896 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9625 ns 9833 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9937.5 ns 9979 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9916.5 ns 9958.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 536526 ns 533872 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 17845684 ns 18512917 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2422500 ns 2324292 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 678976 ns 674968 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1583 ns 1437.5 ns 1.10
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3145.5 ns 2875 ns 1.09
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 2812.5 ns 2083 ns 1.35
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1541.5 ns 1437.5 ns 1.07
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21370 ns 21479 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1416739 ns 1282166 ns 1.10
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 237500 ns 190209 ns 1.25
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 29161 ns 29540 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4166 ns 4250 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4291 ns 4167 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4417 ns 4145.5 ns 1.07
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4104 ns 4375 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 143094 ns 144438.5 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 9766798.5 ns 9108147.5 ns 1.07
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1569250 ns 1604875 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 144301 ns 145092 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 55875 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46834 ns 39209 ns 1.19
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46584 ns 46625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82333 ns 84167 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36625 ns 36824 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 686115 ns 542002 ns 1.27
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1069291 ns 1333104 ns 0.80
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78821 ns 81391 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031375 ns 2024917 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084708 ns 2079125 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090291 ns 2081625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1985542 ns 1993125 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 225038 ns 226688 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 8235886 ns 7623752 ns 1.08
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5106125 ns 7427958 ns 0.69
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 987279 ns 1252074 ns 0.79
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 174500 ns 174750 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 162104.5 ns 164541.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 165229 ns 148812.5 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145875 ns 144375 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165145 ns 165480 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8411274 ns 7680925 ns 1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1520666 ns 1457521 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 209957 ns 204852 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1119979 ns 1117250 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1112166.5 ns 1109375.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1117709 ns 1113334 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1107125 ns 1112187.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 687949 ns 694582 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35372606 ns 33705507.5 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6112291 ns 6238375 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1024164.5 ns 1026961 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4625.5 ns 4417 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5104 ns 5041 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5583 ns 5208 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5042 ns 4583 ns 1.10
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 92273 ns 93299.5 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5823843 ns 5368327 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 499583.5 ns 634041.5 ns 0.79
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67701 ns 69460 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 8375 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8542 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9187.5 ns 8833 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8417 ns 8833 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 600949 ns 604485 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 36561430 ns 36365543 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5960250 ns 5669937.5 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389274 ns 388374 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19625 ns 17000 ns 1.15
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17791 ns 17709 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20291 ns 18021 ns 1.13
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16645.5 ns 16895.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 65239 ns 66654.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3323140 ns 2923981.5 ns 1.14
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1293104 ns 477833 ns 2.71
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 73656 ns 78451 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220959 ns 216834 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212333 ns 219896 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212541 ns 225583.5 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212000 ns 217625 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 347340 ns 356473 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 13974103 ns 14201022 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5755333 ns 5644395.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 462604 ns 465005 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 666 ns 667 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 833.5 ns 750 ns 1.11
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 875 ns 812.5 ns 1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 584 ns 625 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20357 ns 20462 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1288251 ns 1162134.5 ns 1.11
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 292667 ns 302625 ns 0.97
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 31491 ns 32870 ns 0.96
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1416.5 ns 1417 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1416 ns 1458 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1625 ns 1417 ns 1.15
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1416 ns 1416 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 123399.5 ns 125127 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 9450809 ns 8831211 ns 1.07
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1493229 ns 1526500 ns 0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 135231 ns 136521 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7208 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 5416 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6125 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10666 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23818 ns 23625 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1331154.5 ns 1207481 ns 1.10
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 628937.5 ns 356458 ns 1.76
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46911 ns 48881 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219750 ns 226166 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 265167 ns 265333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 264416 ns 234854 ns 1.13
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 249854 ns 219500 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 189311.5 ns 192027 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 33158982 ns 31211143.5 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9299979.5 ns 9046313 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 643876 ns 649247 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4083 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4083 ns 4084 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23427 ns 23477 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2124740.5 ns 2001417 ns 1.06
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 222770.5 ns 214833 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 46290 ns 47261 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16833 ns 17083 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16792 ns 17000 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16750 ns 16833 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16792 ns 17334 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 191493 ns 195303 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 11757211 ns 14536946 ns 0.81
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 955313 ns 918208 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 171341.5 ns 174652 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 511167 ns 508750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 405458 ns 330583 ns 1.23
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 405000 ns 404666 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 858250 ns 864791 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113156 ns 113620 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 448835 ns 401393 ns 1.12
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 471209 ns 490979 ns 0.96
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 240532 ns 242133 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2268250 ns 2313834 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2031416 ns 1747479 ns 1.16
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2030917 ns 2035208 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3275750 ns 3272708.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 236871 ns 241207 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 10359638.5 ns 10021457.5 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 1993250 ns 2011770.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 739142 ns 743443 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6583 ns 4708.5 ns 1.40
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6875 ns 7625 ns 0.90
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7709 ns 7708 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6292 ns 5479.5 ns 1.15
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 90224.5 ns 92351.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5882879 ns 5442998 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 771000 ns 783479 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 65250 ns 65411 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12333.5 ns 10333.5 ns 1.19
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11375 ns 11875 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11312.5 ns 11750 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11833.5 ns 12062.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 622443 ns 634956 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 41746922 ns 40400531.5 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5637750 ns 5457291.5 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 407854 ns 409979.5 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 541 ns 541 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 22944 ns 23181 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2423476.5 ns 2216579 ns 1.09
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 326750 ns 332584 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 48960 ns 47221 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2166 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 217144 ns 215755 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 12060454 ns 11357397.5 ns 1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1960083 ns 1978417 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 180236.5 ns 172626.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8625 ns 8937.5 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9646 ns 9729.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11229 ns 9459 ns 1.19
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8792 ns 8958 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 103267 ns 96639 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3427494 ns 3207607 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 875083 ns 876000 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 73431 ns 71941 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17834 ns 18521 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17916 ns 19104.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17333 ns 17625 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18000 ns 18812.5 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 586862 ns 554001 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 17435012.5 ns 16517942.5 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5223458 ns 5180916.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 377954 ns 378539 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 458 ns 1.27
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 666 ns 0.81
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34849.5 ns 35213 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1279718 ns 1186873 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 435291 ns 466396 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 45841 ns 46270 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8979.5 ns 9312.5 ns 0.96
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9250 ns 9916.5 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 8917 ns 9167 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8146 ns 9458.5 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 260579 ns 267136 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19733483 ns 18948901 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4985875 ns 4572250 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 366004 ns 367694 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 398667 ns 395333 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287958 ns 214416 ns 1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287750 ns 288292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756458 ns 756291 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111261.5 ns 111882 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 376549 ns 329474.5 ns 1.14
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 367583.5 ns 300208.5 ns 1.22
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 74430 ns 77331 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1400375 ns 1453791.5 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1135375 ns 852583 ns 1.33
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1132354 ns 1132645.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440958 ns 2440625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 203910 ns 207032 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 9225527 ns 10204120 ns 0.90
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1662875 ns 1668041.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 321818 ns 324428.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7604.5 ns 7041.5 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8083 ns 7750 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8729 ns 9396 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7437.5 ns 7791.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 142785 ns 144806.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 6299176.5 ns 5813106.5 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 521292 ns 437250 ns 1.19
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 65420 ns 66071 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12583 ns 13083 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 12437.5 ns 14479 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14521 ns 15709 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14979.5 ns 15354.5 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 943733.5 ns 956377 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 47612069 ns 42729213 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5885062.5 ns 5700250 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 417444 ns 428955 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 30395.5 ns 24000 ns 1.27
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 29604 ns 24875 ns 1.19
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27709 ns 29292 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25083.5 ns 27667 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 195905 ns 199144 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8216412 ns 7744284 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 990125 ns 999584 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 116401 ns 116931 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 154583.5 ns 103583 ns 1.49
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 155500 ns 152687 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 114042 ns 153583 ns 0.74
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 113187.5 ns 151000 ns 0.75
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1061855 ns 1075746 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 46328998 ns 43042130 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5883041 ns 5733792 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 586901 ns 590946.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74459 ns 75000 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75833 ns 77084 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78208 ns 86333.5 ns 0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75958 ns 74875 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 203068 ns 205585 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7813436 ns 8027595.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 533437.5 ns 519187.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 127391 ns 127562 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 298166 ns 293542 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 303208 ns 308750 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 306041.5 ns 315187.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 295666 ns 304208 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1104226 ns 1108118 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44772773.5 ns 40422383 ns 1.11
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6766000 ns 6276458 ns 1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 694176 ns 695017 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 17000 ns 15875 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17292 ns 17521 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 18375 ns 18500 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16792 ns 16958 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 145201.5 ns 146489 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 6348029 ns 5586208 ns 1.14
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 448000.5 ns 723083.5 ns 0.62
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 231113 ns 232683 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27208 ns 26667 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 28625 ns 26687.5 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27187.5 ns 28208.5 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26145.5 ns 27708.5 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 972527 ns 982068.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 44334727.5 ns 40344043 ns 1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5935916 ns 5743229 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 684627 ns 686807.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11375 ns 11083 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11625 ns 12042 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14042 ns 12334 ns 1.14
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10416 ns 10791 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 123261.5 ns 124134 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3725175 ns 3473152 ns 1.07
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 904958 ns 880000 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 233272 ns 234213 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22000 ns 21958 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21666 ns 22729.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21542 ns 21895.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21916 ns 22000 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 697545 ns 701831.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 22814286 ns 21157140 ns 1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5479812.5 ns 5204750 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 668531 ns 674667 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 67459 ns 63437.5 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 63625 ns 65521 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 65084 ns 66750 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 62667 ns 63042 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 105558.5 ns 106345.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3699497 ns 3373870 ns 1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1336625 ns 480667 ns 2.78
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 231652 ns 233433 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 450250 ns 437896 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 451792 ns 456000 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 446041.5 ns 450542 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 484250 ns 444000 ns 1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 508079 ns 515188 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 22280153.5 ns 21597008 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6164479 ns 6095791.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 712097 ns 717017.5 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7667 ns 6792 ns 1.13
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8458 ns 8000 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8041.5 ns 8583.5 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7083.5 ns 6917 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 142974 ns 146052.5 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5983895.5 ns 5510181.5 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 687104.5 ns 726500 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 68961 ns 65301 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14333 ns 14292 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14312 ns 15292 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15021 ns 14084 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15250 ns 16209 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 941966 ns 947670 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 40659493.5 ns 39845105 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5744375 ns 5499875 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 395784 ns 399764 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6161520.5 ns 6131500 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6378125.5 ns 3224875 ns 1.98
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6377708.5 ns 6379229.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11920959 ns 11911084 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 347985 ns 349856 ns 0.99
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 320268 ns 303248 ns 1.06
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19132416 ns 19059708.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 20009458 ns 11090437.5 ns 1.80
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19937708 ns 20005646 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36464229.5 ns 36446770.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1013485 ns 1081781.5 ns 0.94
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1165921 ns 1153782 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 917 ns 958 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1000 ns 1000 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 917 ns 958 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 958 ns 917 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23221 ns 23071 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2197390 ns 2085318 ns 1.05
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 332458.5 ns 332541.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 205762 ns 207622 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3667 ns 3667 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3709 ns 3750 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3667 ns 3708 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3667 ns 3667 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 277792 ns 281551.5 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 12494000 ns 12095727 ns 1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2076312.5 ns 2129583 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 624236 ns 626307 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8792 ns 8042 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8875.5 ns 8145.5 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9875 ns 9042 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7625 ns 7937.5 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 119047.5 ns 121104 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3910252.5 ns 3679976 ns 1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 795416.5 ns 802541.5 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 65320 ns 65471 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11374.5 ns 13125 ns 0.87
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12208 ns 12875 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11792 ns 11417 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11979.5 ns 12708 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 629697.5 ns 638151 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23515262 ns 22685670 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5019875 ns 4390333 ns 1.14
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 352263 ns 355644 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22203 ns 22337 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2289294 ns 2195388.5 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 228916 ns 207833 ns 1.10
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 46161 ns 47401 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3084 ns 3042 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2959 ns 3375 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2917 ns 2916 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2875 ns 3333 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 200155 ns 204047 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9757264 ns 14763707.5 ns 0.66
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1632083 ns 1611395.5 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 153411.5 ns 157641.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11563 ns 10250 ns 1.13
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11334 ns 12167 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12292 ns 12187.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10854 ns 10604 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 120519 ns 121713.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3640370.5 ns 3281210 ns 1.11
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 897667 ns 904791.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 232282 ns 233512.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20750 ns 21104.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21083 ns 22583 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21959 ns 21083 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21458.5 ns 21708 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 590202 ns 595173 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 22574638 ns 20531194.5 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4746958.5 ns 4095583 ns 1.16
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 639216 ns 638246.5 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 23877 ns 24193.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2442376 ns 2211530 ns 1.10
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 225708 ns 215041 ns 1.05
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 46800 ns 47690 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16291 ns 16292 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16625 ns 16291 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16459 ns 16667 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16500 ns 16416 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 326023.5 ns 330020.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 13171553 ns 12280627 ns 1.07
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1188229 ns 1639709 ns 0.72
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 205042 ns 206457.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2042 ns 1917 ns 1.07
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2083 ns 2167 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2083 ns 2084 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2084 ns 2084 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35572 ns 35891 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1338351 ns 1213015 ns 1.10
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 435459 ns 474917 ns 0.92
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 202812 ns 204052 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16520.5 ns 19687.5 ns 0.84
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 17104.5 ns 17187.5 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 18375 ns 17750 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 18770.5 ns 16667 ns 1.13
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 291395 ns 293976.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 23003699 ns 21212198 ns 1.08
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5678333 ns 4767354.5 ns 1.19
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 682086 ns 686777 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 58979 ns 55771 ns 1.06
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 67125 ns 62792 ns 1.07
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66917 ns 65604.5 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51625 ns 51333 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66452 ns 66418 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 114721 ns 114241 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 162292 ns 202896 ns 0.80
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 147229 ns 135104 ns 1.09
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 130229 ns 130083 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 296770.5 ns 245666 ns 1.21
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 213701 ns 215296 ns 0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 607926 ns 607861 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 84250 ns 79709 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 124729 ns 107104 ns 1.16
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85875 ns 85167 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 123833 ns 124166.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193440 ns 192861 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7291287 ns 5531381 ns 1.32
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1831167 ns 1816084 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203522 ns 203512 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1928271 ns 1869895.5 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1891125 ns 1901084 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1902250 ns 1917666.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1914749.5 ns 1889333 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 525346 ns 531825 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26967967.5 ns 32650285 ns 0.83
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9298209 ns 8859584 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 927389 ns 925670 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21417 ns 21389 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2392141 ns 2065883 ns 1.16
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 342188 ns 336229.5 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 42200 ns 42770.5 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1791 ns 1792 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 249016 ns 253832 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 10390055 ns 10417238 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1093187.5 ns 1009479 ns 1.08
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 179602 ns 184376.5 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9667 ns 8000 ns 1.21
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10125 ns 10042 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10249.5 ns 10375 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9375 ns 8167 ns 1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 118409 ns 119090.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3710566 ns 3309191 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 886083.5 ns 876708 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 231452 ns 232622 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9209 ns 9083 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10000 ns 10625 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9770.5 ns 9542 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 10125 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 517575.5 ns 527209 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21956361 ns 22247571 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4314937.5 ns 3949187.5 ns 1.09
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 624606 ns 624237 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58209 ns 56166 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46542 ns 38916 ns 1.20
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46750 ns 46125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83000 ns 83958 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39682 ns 40233 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1450337.5 ns 1343252 ns 1.08
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1115958 ns 1123667 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 74661 ns 76266 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1939500 ns 1923750 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1983125 ns 1952750.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1951312.5 ns 1982854 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1897667 ns 1850708.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 216819.5 ns 221906.5 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 37812796.5 ns 33376877 ns 1.13
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10968478.5 ns 11408021 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1185212 ns 1191052 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 417625 ns 416333 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 419834 ns 421645.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 420958 ns 421208.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 417208 ns 417667 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 204963.5 ns 208798 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8983027 ns 7659621 ns 1.17
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 546875 ns 518208 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 280603 ns 282883 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 669791.5 ns 747916.5 ns 0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 780667 ns 671583 ns 1.16
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 689645.5 ns 673562.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 725292 ns 748021 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1038703 ns 1048327.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 49679972 ns 45569778.5 ns 1.09
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6487209 ns 6335208.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 909389 ns 914290 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3413542 ns 3428937.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3417875 ns 3384709 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3420479 ns 3435000 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3414187 ns 3417875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168543 ns 175238.5 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8597060 ns 8069034 ns 1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1366458.5 ns 1424083 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 434404 ns 426124 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6191104 ns 6191270.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6232645.5 ns 6170041 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6213854 ns 6167416.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6216250 ns 6190792 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 979877 ns 994959 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 50928344 ns 50094330 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7557875 ns 7413750 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1538944 ns 1549811 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 471584 ns 470666 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 341687.5 ns 252458 ns 1.35
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 340375 ns 342417 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 902500 ns 901125 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46568 ns 46139 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 450349 ns 884569 ns 0.51
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 504562.5 ns 368208 ns 1.37
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 241952 ns 243602 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2276916 ns 2334750 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2038666 ns 1752562 ns 1.16
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2034583 ns 2041187.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3280958 ns 3280124.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 253153 ns 255952 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 14086050 ns 12850913 ns 1.10
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2208291.5 ns 2244770.5 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 765407 ns 770018 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57959 ns 55708 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46250 ns 39041 ns 1.18
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46250 ns 46020.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82792 ns 84125 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28134 ns 28321 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1575508 ns 1407008 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1135958 ns 1106875 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73405.5 ns 76505.5 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1962520.5 ns 2029708 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2093312.5 ns 2082292 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2086834 ns 2090958 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2000458.5 ns 1949604 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 229351 ns 232547 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38934321 ns 35887652 ns 1.08
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11662250 ns 11649979 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1196771 ns 1052311 ns 1.14
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58208 ns 55833 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46812.5 ns 39083.5 ns 1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46708 ns 46375 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82375 ns 84042 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 49491 ns 49287 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 947062 ns 790006.5 ns 1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1068833 ns 1049084 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77751 ns 69820 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1937792 ns 1919458 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1974209 ns 1955416.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1960000 ns 1946334 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1899959 ns 1890750 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 235535 ns 239685 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 22349832.5 ns 17609091 ns 1.27
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9994166 ns 9788042 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 915999 ns 918859 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 417 ns 0.80
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34420 ns 34717 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1328125 ns 1181143 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 278292 ns 263500 ns 1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 45880 ns 46211 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6541 ns 6333 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6917 ns 7500 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6584 ns 6583 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6458 ns 7000 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 209753 ns 208392.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 22541718 ns 20162243 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4971437.5 ns 4479667 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 368183 ns 365124 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31457 ns 32562 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1340759 ns 1251080 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 258291 ns 258000 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 36451 ns 37000 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3458 ns 2750 ns 1.26
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3292 ns 3625 ns 0.91
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2917 ns 2709 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2917 ns 2917 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 185714.5 ns 189309.5 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 8803725 ns 7798739 ns 1.13
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 950374.5 ns 905666.5 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 150601 ns 151136.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 424603.5 ns 467667 ns 0.91
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 425000 ns 444750 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 430459 ns 425999.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 443562.5 ns 421833.5 ns 1.05
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 136540 ns 137895 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6325011.5 ns 5774821 ns 1.10
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2056896 ns 2386500 ns 0.86
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 365713 ns 367024 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3790417 ns 3802521 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3803834 ns 3765917 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3804250 ns 3811417 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3813000 ns 3799541.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 699295 ns 709425 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34149887.5 ns 33554230 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11037916.5 ns 10457896 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1464794 ns 1471404 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49877979 ns 49735229.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35522250 ns 25984959 ns 1.37
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35535229 ns 35560875 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96934583 ns 96902041.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1591242 ns 1616773 ns 0.98
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1047550 ns 1045271 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154708541.5 ns 153907333 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112454083.5 ns 89247291.5 ns 1.26
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112480333 ns 112379750 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 296379229 ns 294166500 ns 1.01
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6494323.5 ns 6515848 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5551012.5 ns 5562255.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19062.5 ns 14521 ns 1.31
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 17833.5 ns 14958 ns 1.19
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 17041 ns 16833 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15875 ns 14854.5 ns 1.07
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 21028 ns 20539.5 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1230713 ns 1114507 ns 1.10
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 219604.5 ns 206959 ns 1.06
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 25950 ns 26060 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10958 ns 10625 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9041 ns 7771 ns 1.16
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9041.5 ns 9208 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17375 ns 17437.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 257331 ns 260548 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10803925 ns 9528073.5 ns 1.13
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1552917 ns 1587125 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 147801 ns 149326.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 9354.5 ns 7958 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 10000 ns 9292 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10458 ns 9500 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7458 ns 7958.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 114779 ns 116273.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3881100 ns 3476228 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 797833 ns 810375 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 233502 ns 233683 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9916 ns 9208.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9708 ns 10645.5 ns 0.91
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9334 ns 10208 ns 0.91
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9709 ns 10375 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 616669 ns 619508.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25342914 ns 22906068.5 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 4989750 ns 4432792 ns 1.13
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 651926 ns 654786 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10583 ns 8291.5 ns 1.28
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9146 ns 10459 ns 0.87
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10584 ns 10042 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9875 ns 9250 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 120200.5 ns 120531 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3758991 ns 3436472 ns 1.09
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 905750 ns 901792 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71611 ns 71071 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13541 ns 13250 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 15500 ns 16042 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 15458.5 ns 17208 ns 0.90
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 18125 ns 15167 ns 1.20
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 585824 ns 592138 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21389400 ns 18951458.5 ns 1.13
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4649750 ns 4027062.5 ns 1.15
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 343933 ns 345753 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34550 ns 34521 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1371228 ns 1191899 ns 1.15
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 447645.5 ns 371562.5 ns 1.20
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 203956.5 ns 206352 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8270.5 ns 7062.5 ns 1.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8708 ns 8333.5 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9167 ns 8583 ns 1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10625 ns 8000 ns 1.33
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 231015.5 ns 233771 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 24528595.5 ns 23357164 ns 1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5171458.5 ns 4885833 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 654796 ns 662116 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16167 ns 12292 ns 1.32
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 15895.5 ns 13229 ns 1.20
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 15979 ns 15125 ns 1.06
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 11875 ns 10167 ns 1.17
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 21988 ns 22042 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1304948 ns 1119591.5 ns 1.17
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 257646 ns 189125 ns 1.36
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 184412 ns 189132 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 32084 ns 31875 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 31875 ns 32333.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32250 ns 32291.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 31708 ns 32000 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 271511.5 ns 276327 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12350146 ns 12201192 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1659167 ns 1697542 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 587425 ns 595015.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 504958 ns 480875 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 481520.5 ns 441083 ns 1.09
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 443208 ns 450250 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 488374.5 ns 490979 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195092 ns 194024 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6520561 ns 5766516 ns 1.13
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1945520.5 ns 2629708 ns 0.74
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367668 ns 368063.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3839417 ns 3822958 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3824437.5 ns 3807354 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3828250 ns 3827834 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3827604.5 ns 3826167 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 535436 ns 544349 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32985580 ns 29050298 ns 1.14
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9639667 ns 9196542 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1204966.5 ns 1359983 ns 0.89
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 781980875 ns 838219667 ns 0.93
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 543423875 ns 415052604.5 ns 1.31
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 542625875 ns 543102500 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1559677978.5 ns 1525021500 ns 1.02
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22745322 ns 22764607.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14786409 ns 14772276 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2528971583 ns 3570164958 ns 0.71
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 2254450917 ns 1502049709 ns 1.50
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 2476668541 ns 2269221042 ns 1.09
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 6300456542 ns 4773617583 ns 1.32
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 366701385 ns 369302709 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88751089 ns 87924411 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75666 ns 79646 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 79041.5 ns 78895.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79458.5 ns 78667 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76208 ns 77583 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 203948 ns 207237 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 9083475 ns 7871351 ns 1.15
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 526062.5 ns 520375 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 106536 ns 107601 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 270270.5 ns 250834 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 292875 ns 294583.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 198312 ns 285708.5 ns 0.69
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 194667 ns 222333.5 ns 0.88
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1034833 ns 1049109.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 46783284 ns 43337417.5 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6115521 ns 6122958 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 633286 ns 640576 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199771000 ns 199656458.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 138674666 ns 103769666.5 ns 1.34
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 138669167 ns 139342042 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388512334 ns 388182208 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5812826 ns 5838796 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3596784 ns 3577840.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 621035604.5 ns 616451521 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439829542 ns 351188291.5 ns 1.25
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 440801667 ns 439680896 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1196350375 ns 1178137125 ns 1.02
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26769444 ns 26651952 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21887487 ns 22092888 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7333 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 5292 ns 1.14
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6084 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9875 ns 10167 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27497 ns 27714.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1432348 ns 1202781 ns 1.19
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 374083 ns 351458 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46690 ns 48481 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216042 ns 218291.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 224375 ns 222250 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220687.5 ns 221209 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 208062.5 ns 213708.5 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 218341 ns 222292 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 35298334.5 ns 31765824 ns 1.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9155708 ns 9125125 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 528325 ns 529665 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9354.5 ns 7271 ns 1.29
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9396 ns 9541.5 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9750 ns 9791 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7958.5 ns 8187.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 118295 ns 117715.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3790588 ns 3188633 ns 1.19
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 873834 ns 885458 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 69600 ns 69700 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8562.5 ns 7479 ns 1.14
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9834 ns 10479.5 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9500 ns 10875 ns 0.87
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 12312.5 ns 8875 ns 1.39
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 512184 ns 519786.5 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21450760 ns 18597573.5 ns 1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4433459 ns 3961208 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 315553 ns 316073 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 416 ns 1.30
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 709 ns 750 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26098 ns 26338 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1299422.5 ns 1200694 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 479708.5 ns 488604.5 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 46840 ns 46820 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9167 ns 9291 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 11416 ns 10416 ns 1.10
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11062.5 ns 9208.5 ns 1.20
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9416 ns 11583 ns 0.81
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 251250 ns 253612 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 25886430.5 ns 25803867.5 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5832146.5 ns 5171833.5 ns 1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 387883 ns 388624 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 107916 ns 104834 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 99250 ns 84834 ns 1.17
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 100645.5 ns 99500 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146583 ns 146333 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24989 ns 24613 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1282751 ns 1194962 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 267229.5 ns 246062.5 ns 1.09
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 189842 ns 192062 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 514208 ns 526854 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 478541.5 ns 478875 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 478375 ns 500416.5 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 482875 ns 478958.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 229903 ns 232619 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12990087 ns 11733131 ns 1.11
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2133042 ns 1709625 ns 1.25
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 608146 ns 610896 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5666 ns 5125 ns 1.11
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 7250 ns 7167 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6291 ns 6791 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 6625 ns 4042 ns 1.64
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16240.5 ns 16580 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 79631 ns 79701 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12417 ns 11708 ns 1.06
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 11167 ns 11584 ns 0.96
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 12041.5 ns 10792 ns 1.12
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 16416.5 ns 17687.5 ns 0.93
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 211157 ns 214143.5 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 375234 ns 366964 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 39750 ns 35792 ns 1.11
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 52000 ns 50791 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 53021 ns 51833.5 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 16042 ns 13542 ns 1.18
batchedmm(16, Bsize=128)/forward/GPU/CUDA 19539 ns 21568 ns 0.91
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 90780.5 ns 87241 ns 1.04
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 42917 ns 38979.5 ns 1.10
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 32167 ns 30708 ns 1.05
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 32875 ns 30416 ns 1.08
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57042 ns 58458 ns 0.98
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 190769.5 ns 192010 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 392564 ns 395119 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1833.5 ns 1729.5 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 1875 ns 1
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2083 ns 2146 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1792 ns 1709 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 20462 ns 20594 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1239481 ns 1163029.5 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 307042 ns 326833 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 31870 ns 33120 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2125 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2208 ns 2333 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2291 ns 2250 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2042 ns 1.08
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 201344.5 ns 204587 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 10165131 ns 9292587 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1570917 ns 1518500 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 136316.5 ns 136826.5 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6520.5 ns 4417 ns 1.48
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5000 ns 5250 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5625 ns 6375.5 ns 0.88
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5500 ns 4041.5 ns 1.36
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 143896 ns 145077 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 6277095.5 ns 5424296 ns 1.16
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 750374.5 ns 725208 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 69261 ns 69471 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8645.5 ns 8041 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8583.5 ns 8958 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9291.5 ns 8416 ns 1.10
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8020.5 ns 9208 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 867420 ns 875812.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 42275328 ns 40742928.5 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5663374.5 ns 5580917 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 387123 ns 389804 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56875 ns 56792 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57833 ns 56875 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57750 ns 57584 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58375 ns 58375 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36655 ns 37054 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1241845 ns 1234596.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 541750 ns 336000 ns 1.61
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 202922 ns 203242 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 468937.5 ns 485813 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 477229.5 ns 499958.5 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 464541 ns 468208 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 433625 ns 438854.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 263574 ns 268055 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28829027 ns 27322975 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8162250 ns 8122166.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 827187.5 ns 832729 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3317521 ns 3291250 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2329500 ns 1764708 ns 1.32
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2336167 ns 2339021 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6302416.5 ns 6260292 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204892 ns 204625 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 208562 ns 209992 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11517062.5 ns 11332208 ns 1.02
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8328812.5 ns 6550833 ns 1.27
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8342500 ns 8325250 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21059354.5 ns 20937125 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 734814.5 ns 734916 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1048679.5 ns 1048155.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5604.5 ns 4291 ns 1.31
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5875 ns 5875 ns 1
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6395.5 ns 6583 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4750 ns 4896 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 136624.5 ns 137991.5 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 6038921 ns 5581467 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 813000 ns 785625 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56330 ns 56390 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9834 ns 7042 ns 1.40
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11375 ns 10562.5 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10792 ns 7104.5 ns 1.52
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 7833 ns 0.90
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 751768 ns 754679 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 37322808 ns 34960226 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5368750 ns 5245042 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 366754 ns 371414 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 126417 ns 127625 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 101833 ns 95624.5 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 97167 ns 100000 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 135458.5 ns 95708 ns 1.42
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 149617 ns 152137 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6377317.5 ns 5871279.5 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2013729 ns 2635166.5 ns 0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203027 ns 203242 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1956250 ns 2017959 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2025708 ns 2027771 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2023583 ns 2021167 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2023875 ns 1987167 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 699728 ns 703925.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32486459.5 ns 31965494 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11144687.5 ns 11055292 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1109856 ns 1255893 ns 0.88
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33708 ns 29375 ns 1.15
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36250 ns 34500 ns 1.05
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 35292 ns 35250 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 667 ns 583 ns 1.14
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15147 ns 15622 ns 0.97
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 78750 ns 80130 ns 0.98
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3166 ns 2542 ns 1.25
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3292 ns 3125 ns 1.05
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3916 ns 2834 ns 1.38
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2125 ns 3000 ns 0.71
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 138043.5 ns 141408 ns 0.98
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 341483.5 ns 343344 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7125 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5375 ns 1.14
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5959 ns 6000 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 10209 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36390.5 ns 36671 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1443013 ns 1208337 ns 1.19
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 577687.5 ns 331459 ns 1.74
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48291 ns 48221 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217083 ns 217479 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 233729 ns 229625 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220875 ns 225000 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206583 ns 212875 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 241954 ns 244929 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28863209.5 ns 26091309.5 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8063584 ns 7984187.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 578495 ns 574266 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 3959 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3917 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21377 ns 21419 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2296242.5 ns 2118188.5 ns 1.08
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 246729.5 ns 234583 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42010 ns 42620 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14750 ns 14791 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15000 ns 14750 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14834 ns 14875 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14937.5 ns 14833 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 306378 ns 311492 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 12904688 ns 10906139 ns 1.18
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1048854 ns 982000 ns 1.07
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 192742 ns 192231.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 128750 ns 140834 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 128042 ns 127417 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 102500 ns 105167 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 128458 ns 141000 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133598 ns 152595 ns 0.88
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6098969 ns 6050834 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1992062.5 ns 2057334 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203872 ns 213297 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1913375 ns 1917833 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1918875.5 ns 1898875 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1920354.5 ns 1922083 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1922729.5 ns 1898854 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 684636 ns 692137 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31678268 ns 31139112 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10983583.5 ns 10436541 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1217291 ns 1217872 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19708 ns 18250 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18000 ns 18625 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21250 ns 20750 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17541 ns 17749.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 107089 ns 110137 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3668703 ns 3282416 ns 1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1366125 ns 480541.5 ns 2.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79431 ns 79421 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222250 ns 252041.5 ns 0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227291.5 ns 217541.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221667 ns 219687.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217604.5 ns 222729.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 512942 ns 519298 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20906293 ns 20051825.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6227916.5 ns 6194812.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 478915 ns 478425 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24625 ns 23291.5 ns 1.06
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 32084 ns 28583 ns 1.12
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 29583.5 ns 28792 ns 1.03
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1354 ns 1229.5 ns 1.10
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15775 ns 16210 ns 0.97
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 87130 ns 82241 ns 1.06
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 5208 ns 4292 ns 1.21
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4937.5 ns 4729 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 6250 ns 5042 ns 1.24
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4208 ns 5771 ns 0.73
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 205104 ns 207444.5 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 375704 ns 378084 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 305500 ns 305417 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305958 ns 306250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 308125 ns 308084 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 304792 ns 305750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 224810.5 ns 228609 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8473173 ns 7545946 ns 1.12
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1064042 ns 604584 ns 1.76
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 272523 ns 273963 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 588083 ns 532917 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 540979 ns 538167 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 532187.5 ns 539125 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 530000 ns 572709 ns 0.93
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1066787 ns 1074383 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 49056863 ns 44755027.5 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6401167 ns 6115208.5 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 857918.5 ns 858603.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20292 ns 19291 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21021 ns 20708 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21584 ns 22375.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19459 ns 19875 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111914.5 ns 114907 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3915484 ns 3614583 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1445124.5 ns 593916 ns 2.43
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79161 ns 79421 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 259584 ns 215708 ns 1.20
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218709 ns 220584 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213833 ns 213625 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221709 ns 215875 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 729277 ns 762395 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28351086 ns 25444001 ns 1.11
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7519125 ns 7232562.5 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 535735 ns 542290.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7542 ns 6125 ns 1.23
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6750 ns 7083 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7854 ns 7917 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6416 ns 6208 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 139596.5 ns 140165.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 6386332.5 ns 5168559 ns 1.24
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 812791.5 ns 799291 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 64971 ns 65270 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12937 ns 9542 ns 1.36
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9604 ns 10333.5 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10479 ns 10375 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9042 ns 11145.5 ns 0.81
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 821389 ns 826456 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 41212440 ns 37337383 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5394125 ns 5311708 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 376673 ns 387474 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5542 ns 4875 ns 1.14
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6041 ns 6917 ns 0.87
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5979 ns 7250 ns 0.82
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4125 ns 4812.5 ns 0.86
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 143159 ns 144262 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 6135330 ns 5426091.5 ns 1.13
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 841208 ns 808375 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 66410 ns 66621 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8125 ns 7458 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7625 ns 8083 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7333 ns 7541.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7458 ns 7833 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 779803.5 ns 783702 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 42489606.5 ns 37497088 ns 1.13
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5806041.5 ns 5566229 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 385114 ns 395004 ns 0.97
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14517875 ns 14350584 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10107833 ns 7693688 ns 1.31
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10123375 ns 10127042 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27737959 ns 27615959 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 529900 ns 548306 ns 0.97
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 392854 ns 393134 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46502041.5 ns 45943208 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33504375 ns 26437417 ns 1.27
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33527167 ns 33454833 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85258875 ns 84782667 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2630210 ns 2657066 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3305402 ns 3290613 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 68083 ns 66375 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66021 ns 68584 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69042 ns 69333.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66875 ns 65979 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 120187.5 ns 121920.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3913619.5 ns 3593431.5 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1439458.5 ns 508166 ns 2.83
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 224532 ns 229397.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 502375 ns 446833 ns 1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 452542 ns 452437.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 441146 ns 446375 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 444833 ns 445834 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 732944 ns 728139 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 29462542.5 ns 26912797 ns 1.09
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7794083 ns 7552104 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 779447 ns 790108 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 666 ns 0.88
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 667 ns 0.75
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 33084 ns 32311 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1348590.5 ns 1198752.5 ns 1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 458416.5 ns 473500 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47291 ns 47340 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9209 ns 8666 ns 1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9500 ns 9208 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 8666 ns 8458 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9167 ns 17104 ns 0.54
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 289186 ns 286358 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 24166950 ns 20778583 ns 1.16
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5210708.5 ns 4681395.5 ns 1.11
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 381324 ns 375004 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9792 ns 9875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9834 ns 9875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9792 ns 9792 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9792 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23519 ns 23012 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2258808.5 ns 2014844 ns 1.12
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 221041.5 ns 215645.5 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 207272 ns 205762 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45959 ns 45958 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45959 ns 46042 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46041 ns 46041 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 46375 ns 46250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 292709.5 ns 290878 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 13279604 ns 9152947 ns 1.45
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 963562.5 ns 942542 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 601736 ns 607695 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56834 ns 56250 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57208 ns 56458 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57000 ns 57083 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57791 ns 57709 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28797 ns 28552 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1296667 ns 1253508.5 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 599375 ns 663666.5 ns 0.90
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 214467.5 ns 203541.5 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 488583 ns 448583 ns 1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 506875 ns 465562 ns 1.09
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 467854 ns 465458.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 444854 ns 454041.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 247966 ns 245887 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 35422277.5 ns 33424426 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9625250 ns 9545520.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 889783 ns 887779 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 662791 ns 645812.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645583 ns 575959 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 641458 ns 640542 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 654708 ns 646271 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 204631.5 ns 208584 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 9404311.5 ns 8406939 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1366041 ns 1406395.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 307612.5 ns 315503 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2256146 ns 2214979 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2230917 ns 2211999.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2237292 ns 2220812.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2235916 ns 2227958 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 983378 ns 978439 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 51532010 ns 47363900 ns 1.09
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7223667 ns 10481646 ns 0.69
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1360743 ns 1213952 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21208 ns 18625 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21895.5 ns 20729 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24000 ns 21583 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18708 ns 18875 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 113606 ns 113850.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 4029922 ns 3565557.5 ns 1.13
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1470375 ns 497958 ns 2.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81911 ns 79731 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 263833 ns 227375 ns 1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 230917 ns 259417 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221375 ns 225541 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 261833.5 ns 227084 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 732293.5 ns 729838 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 28666996 ns 26163617 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7932292 ns 7560500 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 557920 ns 554315 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 584 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 583 ns 541 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23564 ns 23274 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1402930.5 ns 1191789 ns 1.18
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 479854.5 ns 484250 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 49551 ns 48040 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10042 ns 9083 ns 1.11
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9833 ns 10437.5 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9208 ns 9541 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 8625 ns 9500 ns 0.91
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 271175.5 ns 268183 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 27354439 ns 24685731.5 ns 1.11
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5706584 ns 5000875 ns 1.14
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399053 ns 398234 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9709 ns 7250 ns 1.34
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9104 ns 9187.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9437.5 ns 9645.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8375 ns 8041 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 122324.5 ns 118921.5 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3848922 ns 3382327 ns 1.14
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 890083 ns 886791.5 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 69951 ns 71801 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7417 ns 7604 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7500 ns 8125 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7500 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7333 ns 7562.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 514534 ns 507494 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 19594222 ns 17189656.5 ns 1.14
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4165479 ns 3782375 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 320028 ns 320313 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1562.5 ns 1500 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1708.5 ns 1708.5 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833.5 ns 1791 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1333 ns 1375 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 21964 ns 21598 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1238732.5 ns 1189888 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 302542 ns 313375 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 188582 ns 190932 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3333 ns 3541 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3458 ns 3583 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3334 ns 3458 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3250 ns 3292 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 224397.5 ns 218452 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10897600 ns 9603283 ns 1.13
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1688875 ns 1797375 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 578505.5 ns 583116 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148875 ns 148104.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 132708 ns 106833 ns 1.24
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 130750 ns 128562.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225250 ns 225000 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 24103 ns 23975 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1297180 ns 1165725 ns 1.11
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 269833 ns 254292 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 40231 ns 41470 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 162604 ns 157645.5 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 127166 ns 87625 ns 1.45
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 112750 ns 112000 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 265229 ns 250708.5 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 219287 ns 218220.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 11195277.5 ns 10460438 ns 1.07
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 1990375 ns 1096666 ns 1.81
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 267987.5 ns 269773 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7167 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5959 ns 5333 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10209 ns 10458 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33200 ns 32755 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1323539 ns 1178842 ns 1.12
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 615604 ns 330458 ns 1.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50040 ns 50720 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 260750 ns 253104 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 234833 ns 229041.5 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 265125 ns 234187.5 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221333 ns 227938 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 264591 ns 263186.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 29454390 ns 27448206 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8466083 ns 8237750 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 592630 ns 594190.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 15750 ns 13792 ns 1.14
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15667 ns 15166 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16167 ns 16499.5 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14541 ns 14667 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 140225 ns 139540 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 6115964 ns 5436668.5 ns 1.12
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 798333 ns 786729 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 232492 ns 232963 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23708 ns 23000 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23479 ns 23937.5 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23562.5 ns 23875 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22667 ns 23979.5 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 872247 ns 870094.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 42738683 ns 40010466.5 ns 1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5646770.5 ns 5595708 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 676987 ns 679366 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 10041 ns 8750 ns 1.15
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10187.5 ns 10312.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11666 ns 11271 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8792 ns 9584 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 125357.5 ns 123388.5 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3857738.5 ns 3563169 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 898625 ns 858292 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 75221 ns 74460 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14000 ns 13375 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13812.5 ns 14458.5 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14062.5 ns 13958 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14292 ns 13625 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 675390 ns 667308 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23526980.5 ns 21257602 ns 1.11
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5359958.5 ns 4997708 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 365113 ns 365743 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10292 ns 8583 ns 1.20
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9646 ns 10333 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10958 ns 10312.5 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 8542 ns 9166 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 124246 ns 121770.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3650341 ns 3365145.5 ns 1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 890042 ns 906625 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72050 ns 75170 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13084 ns 12292 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12896 ns 13437.5 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12542 ns 12916 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12667 ns 12458 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 557269 ns 553718.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 20940364 ns 18868109 ns 1.11
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4415208 ns 3865125.5 ns 1.14
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 341913.5 ns 341293 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30438 ns 26354.5 ns 1.15
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 32771 ns 30645.5 ns 1.07
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 32145.5 ns 31541 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1875 ns 1833 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16382 ns 16183 ns 1.01
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 80651 ns 81001 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5375 ns 5209 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 4937 ns 5021 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5208 ns 5417 ns 0.96
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6292 ns 6604 ns 0.95
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 141456.5 ns 140577.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 382544 ns 370423.5 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 250 ns 291 ns 0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 26188 ns 25697 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1349689 ns 1197018 ns 1.13
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 455771 ns 465667 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 48850 ns 47180 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6583 ns 6125 ns 1.07
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 6729 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6250 ns 6333 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6250 ns 6312.5 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 190177 ns 187721.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 25715880 ns 23736279.5 ns 1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5628084 ns 4952833.5 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388664 ns 386429 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2042 ns 1959 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2042 ns 2042 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2125 ns 2000 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 1958 ns 1959 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 26944 ns 26463 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1363088 ns 1170027.5 ns 1.17
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 471437.5 ns 479625 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 205032 ns 206252 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16958 ns 16250 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16250 ns 16666 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16749.5 ns 16208.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16250 ns 16417 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 278717.5 ns 276067 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 26543319 ns 24921263 ns 1.07
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6143666 ns 5326083 ns 1.15
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 701356 ns 700836 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 193791 ns 173875 ns 1.11
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 174166.5 ns 148750 ns 1.17
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151875 ns 155708 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 161458 ns 147458 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 200117.5 ns 203847 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8677326 ns 8347024.5 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1431250 ns 1561917 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 224822 ns 232482 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1332708 ns 1328917 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1313042 ns 1311771 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1321250 ns 1320791 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1320542 ns 1322500 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 914262.5 ns 909940.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52072722 ns 44667022 ns 1.17
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6865145.5 ns 7124333 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1099471 ns 995559.5 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25270.5 ns 22958 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25750 ns 26833 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28167 ns 27625 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24645.5 ns 24667 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 236681 ns 234608.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8520645.5 ns 7924652 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 960167 ns 576541 ns 1.67
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 114711 ns 116011 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 128833.5 ns 118166.5 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 184437.5 ns 122375 ns 1.51
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 126541.5 ns 158041.5 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 117313 ns 123833.5 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1084581 ns 1073695 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 48584064.5 ns 44153968 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6244708 ns 6127166 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 609766 ns 612925 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 334 ns 250 ns 1.34
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 250 ns 250 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23179.5 ns 23160 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1352649.5 ns 1212472 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 470375 ns 478542 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 47251 ns 47471 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 6291 ns 1.09
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6667 ns 6833.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6250 ns 6458 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6604 ns 6584 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 206812 ns 204382.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 26430531.5 ns 24496787 ns 1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5939666 ns 5334937.5 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 393154 ns 388703 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6750 ns 5208 ns 1.30
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6416.5 ns 7021 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7042 ns 7458 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6750 ns 5667 ns 1.19
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 147041 ns 145933.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 6204224 ns 5745568 ns 1.08
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 711062.5 ns 753959 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 232702 ns 234802 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10250 ns 9583 ns 1.07
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9875 ns 10375 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 10125 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9792 ns 10042 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 908474 ns 903827 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 42229280 ns 42297357 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6135833 ns 5826479 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 665637 ns 668457 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 709 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 625 ns 1.07
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22806 ns 22371 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2183221 ns 2015786 ns 1.08
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 228667 ns 208416 ns 1.10
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 206602 ns 207552 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4625 ns 4584 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4666 ns 4833 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4625 ns 4666 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4584 ns 4584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 229835 ns 228749 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10794904 ns 10461831 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1685770.5 ns 1654416.5 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 577495 ns 580735 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 9042 ns 7750 ns 1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9083.5 ns 9166.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9354 ns 8834 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7834 ns 8291 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 124219 ns 121959 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3899985 ns 3411255 ns 1.14
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 810375 ns 827916 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 74040.5 ns 74011 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 8625 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8291 ns 9041.5 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 8583.5 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8375 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 596441 ns 591884.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 23179785 ns 20708574.5 ns 1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4819896 ns 4264875 ns 1.13
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 338953 ns 342784 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 127000 ns 122750 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 131000 ns 96459 ns 1.36
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 129584 ns 130187.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 180958.5 ns 180875 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46329 ns 45830 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 104561 ns 101721 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 341167 ns 328000 ns 1.04
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 333583 ns 166666 ns 2.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 325333 ns 347541.5 ns 0.94
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 588354 ns 608646 ns 0.97
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 194256.5 ns 192063 ns 1.01
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 512055 ns 505519.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399208 ns 395916 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288166.5 ns 214250 ns 1.35
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287875 ns 288167 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755750 ns 756500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43515 ns 43676.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1420150 ns 1411321 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 420292 ns 429792 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 81701 ns 82131 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1396437 ns 1458834 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1134500 ns 857583 ns 1.32
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1133416.5 ns 1134333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2443791.5 ns 2441958.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 250930 ns 249859 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 12447603 ns 10370982 ns 1.20
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1797500 ns 1909646 ns 0.94
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 352383.5 ns 352903 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 658917 ns 616500 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 647083.5 ns 598250 ns 1.08
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 625729 ns 648916.5 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 629562.5 ns 642667 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 202467 ns 200586.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 9193261 ns 7794534 ns 1.18
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1344749.5 ns 1363291 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 311273 ns 313733 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2486625 ns 2445375 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2447229 ns 2426917 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2446229 ns 2441500 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2455167 ns 2440750 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 999287 ns 994961 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 61254580 ns 50766350 ns 1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10164208 ns 9661291 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1302412 ns 1307388 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33437.5 ns 28521 ns 1.17
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35145.5 ns 34625 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 33896 ns 33916.5 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 875 ns 875 ns 1
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15909 ns 15425.5 ns 1.03
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 84991 ns 79381 ns 1.07
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3250 ns 3062.5 ns 1.06
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3083.5 ns 3416 ns 0.90
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3333 ns 3208 ns 1.04
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3041 ns 3209 ns 0.95
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 139820.5 ns 139741 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 335653 ns 338953 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 409291 ns 404500 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 408167 ns 402125 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 408916 ns 408334 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 420042 ns 422458 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 43861 ns 43145 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1610692 ns 1417291 ns 1.14
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1146937.5 ns 1128750.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 241802 ns 239562 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3890500 ns 3863292 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3991792 ns 3971625 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3995938 ns 3996791 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3777541.5 ns 3757979.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 245384 ns 242826 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 40053105 ns 38623864 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11890208 ns 11673750 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1427303 ns 1433229 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3916 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33956 ns 33968 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1415999 ns 1232483 ns 1.15
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 180646 ns 167334 ns 1.08
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 39530 ns 38620 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15583 ns 15666 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15708 ns 15750 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15708 ns 15625 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15625 ns 15625 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 256980 ns 255128 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 9741901 ns 8717525 ns 1.12
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 867771 ns 843520.5 ns 1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 177356.5 ns 169816.5 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 403959 ns 402625 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295875 ns 220209 ns 1.34
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295292 ns 295959 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760750 ns 760791.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113403.5 ns 113239 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1056307 ns 1047524 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 458041 ns 348895.5 ns 1.31
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89041 ns 89300.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1445458 ns 1474958.5 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1158000 ns 881146 ns 1.31
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1156604 ns 1159083.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2464729.5 ns 2461917 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 241604 ns 241292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 12919628 ns 9318727.5 ns 1.39
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1936541.5 ns 1946459 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 353843 ns 354883 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 584 ns 542 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 26174 ns 25844 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1343237.5 ns 1200537.5 ns 1.12
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 430334 ns 496709 ns 0.87
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 209062 ns 209382 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7875 ns 7375 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7708 ns 8104.5 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7500 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7250 ns 7375 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 214822.5 ns 217033.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 28436000 ns 25754399 ns 1.10
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5825750 ns 5254333.5 ns 1.11
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 684816 ns 685977 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 836604 ns 825125.5 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 618875 ns 468584 ns 1.32
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 620167 ns 621500 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1552792 ns 1536542 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130046 ns 130845.5 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 229912 ns 229862 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2694187.5 ns 2661979 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2000104.5 ns 1535250.5 ns 1.30
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1999042 ns 2000792 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4936792 ns 4906416 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 251857 ns 242304 ns 1.04
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 837543 ns 841449 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 291 ns 250 ns 1.16
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 291 ns 291 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32688 ns 32216 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1331487 ns 1218492 ns 1.09
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 447625 ns 464375 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 46711 ns 47630 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6666 ns 6125 ns 1.09
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6458 ns 6708 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6208 ns 6500 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6417 ns 6375 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 232857 ns 224154.5 ns 1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 24854567 ns 21407773 ns 1.16
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5311167 ns 4615291 ns 1.15
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 359813.5 ns 357793.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2405750 ns 2392708 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2416666 ns 2371959 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2377375 ns 2404416 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2392666 ns 2370084 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 201638 ns 200035.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8402298 ns 7868335 ns 1.07
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1416500 ns 1597041.5 ns 0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 372683.5 ns 373933 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4654167 ns 4648292 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4665479 ns 4644250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4644229.5 ns 4636708 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4648583 ns 4642750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 902404.5 ns 891890 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52065462 ns 46027858 ns 1.13
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6861875 ns 6938541.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1391004 ns 1391633 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6708.5 ns 7187.5 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7208 ns 7542 ns 0.96
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7645.5 ns 7125 ns 1.07
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 13396 ns 6875 ns 1.95
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 23661 ns 23289 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1330674.5 ns 1167669 ns 1.14
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 266208 ns 243458.5 ns 1.09
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 39961 ns 39800 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 51604 ns 46396.5 ns 1.11
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 49000 ns 32917 ns 1.49
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 45750 ns 45875.5 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 45375 ns 67312 ns 0.67
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 218958 ns 214725 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 11575244 ns 10485830 ns 1.10
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2067250 ns 1121562 ns 1.84
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 264843 ns 269102.5 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21396 ns 19604.5 ns 1.09
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25667 ns 24021 ns 1.07
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 24249.5 ns 23750 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 7375 ns 5084 ns 1.45
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17124 ns 17227 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 84151 ns 83741 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12229 ns 11916 ns 1.03
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10687 ns 9354.5 ns 1.14
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10229 ns 10417 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17792 ns 17958 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 229557 ns 225890 ns 1.02
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 371578.5 ns 371753 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406750 ns 404000 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297125 ns 222584 ns 1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296834 ns 296875 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762417 ns 762667 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46955 ns 46288 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1453711 ns 1401617.5 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 484187.5 ns 358375 ns 1.35
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 88881 ns 89491 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1431645.5 ns 1480896 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1166209 ns 888250 ns 1.31
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1164750 ns 1164959 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2471229 ns 2465417 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 294082.5 ns 288016 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 12353848 ns 12678894 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2093020.5 ns 2117375 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 380814 ns 381744 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 434500 ns 432125 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 437125 ns 430333 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 437250 ns 436917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 447542 ns 448604.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54894 ns 54122.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1083139 ns 1002212 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1087416 ns 1059021 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 233642 ns 234952 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3902292 ns 3895042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4012625 ns 4004458 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4016541 ns 4030291.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3808250 ns 3789979 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 266487.5 ns 260055 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35233900 ns 30675954 ns 1.15
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10616978.5 ns 10349458.5 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1364063 ns 1223712 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8750 ns 8750 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7667 ns 6917 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7667 ns 7583 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12375 ns 12416 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24395 ns 23553.5 ns 1.04
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2388137.5 ns 2134096 ns 1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 229041 ns 214667 ns 1.07
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 209122 ns 211142 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 44875 ns 44958 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45000 ns 45083 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45000 ns 45000 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45292 ns 44958 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 350021 ns 344550 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 14581645 ns 14001329.5 ns 1.04
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1777208 ns 1862458 ns 0.95
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 655627 ns 659011.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 124000 ns 122729 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 96270.5 ns 83521 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86562.5 ns 87354.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 86958.5 ns 105375 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 189446 ns 190055 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6078785 ns 5969481 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1983729 ns 1972791.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 221122 ns 214447 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2025375 ns 2012458.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2011792 ns 1980000 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2010229 ns 2023917 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2013666.5 ns 2011645.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 536819 ns 529776 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 29198754 ns 29142428 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9376375 ns 9305500.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 967839 ns 1088680 ns 0.89

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.