Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: relax cublaslt types (#173)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Oct 10, 2024
1 parent 2d7533c commit 301b59c
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.2"
version = "1.3.3"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
3 changes: 1 addition & 2 deletions ext/LuxLibCUDAExt/LuxLibCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module LuxLibCUDAExt

# This file only wraps functionality that is part of CUDA, such as CUBLAS
using CUDA: CUDA, CUBLAS, StridedCuMatrix, StridedCuVector, CuPtr, AnyCuMatrix, AnyCuVector
using CUDA: CUDA, CUBLAS, StridedCuMatrix, StridedCuVector, CuPtr
using LinearAlgebra: LinearAlgebra, Transpose, Adjoint
using LuxLib: LuxLib, Optional
using LuxLib.Utils: ofeltype_array
Expand Down
12 changes: 6 additions & 6 deletions ext/LuxLibCUDAExt/cublaslt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,16 @@ end
# `len` mirrors `length`, except that `nothing` is passed through as `nothing`.
len(::Nothing) = nothing
function len(x)
    return length(x)
end

function LuxLib.Impl.cublasLt_fused_dense(act::F, weight::AnyCuMatrix, x::AnyCuMatrix,
b::Optional{<:AnyCuVector}, ::False) where {F}
# Out-of-place variant selected when the last positional argument is a `False`:
# allocates the output buffer, fills it in place through `cublasLt_fused_dense!`,
# and returns it together with `nothing` as the second element.
function LuxLib.Impl.cublasLt_fused_dense(act::F, weight::AbstractMatrix,
        x::AbstractMatrix, b::Optional{<:AbstractVector}, ::False) where {F}
    # Element type of the result is decided by LuxLib from the activation and operands.
    out_eltype = LuxLib.concrete_fba_output_eltype(act, weight, x, b)
    # One row per row of `weight`, one column per column of `x`; allocated like `x`.
    z = similar(x, out_eltype, size(weight, 1), size(x, 2))
    LuxLib.cublasLt_fused_dense!(z, act, weight, x, b)
    return z, nothing
end

function LuxLib.Impl.cublasLt_fused_dense(act::F, weight::AnyCuMatrix, x::AnyCuMatrix,
b::Optional{<:AnyCuVector}, ::True) where {F}
function LuxLib.Impl.cublasLt_fused_dense(act::F, weight::AbstractMatrix,
x::AbstractMatrix, b::Optional{<:AbstractVector}, ::True) where {F}
z = similar(x, LuxLib.concrete_fba_output_eltype(act, weight, x, b),
size(weight, 1), size(x, 2))
y = similar(z)
Expand All @@ -188,8 +188,8 @@ function LuxLib.Impl.cublasLt_fused_dense(act::F, weight::AnyCuMatrix, x::AnyCuM
end

function LuxLib.Impl.cublasLt_fused_dense!(
z::AbstractMatrix, act::F, weight::AnyCuMatrix, x::AnyCuMatrix,
b::Optional{<:AnyCuVector}, y::Optional{<:AbstractMatrix}=nothing) where {F}
z::AbstractMatrix, act::F, weight::AbstractMatrix, x::AbstractMatrix,
b::Optional{<:AbstractVector}, y::Optional{<:AbstractMatrix}=nothing) where {F}
if hasmethod(cublaslt_matmul_fused!,
(typeof(z), typeof(act), typeof(weight), typeof(x), typeof(b), typeof(y)))
retcode = cublaslt_matmul_fused!(z, act, weight, x, b, y)
Expand Down

3 comments on commit 301b59c

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/117022

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or it can be done manually through the GitHub interface, or via:

git tag -a v1.3.3 -m "<description of version>" 301b59c2e7c7bc79d8db585c77778772aa52b284
git push origin v1.3.3

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 301b59c Previous: 2d7533c Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5125 ns 5209 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6937.5 ns 5208 ns 1.33
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7417 ns 7291 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6083 ns 6208 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 104885 ns 115729 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2678307 ns 2692776 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 401685 ns 408504 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9917 ns 10083 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10042 ns 10208 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10750 ns 10375 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9729 ns 9833 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 495998 ns 496762 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18744208 ns 17703724 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 680377 ns 10961843 ns 0.06206775630703706
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1458 ns 1312 ns 1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1541.5 ns 1500 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1750 ns 1875 ns 0.93
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3187.5 ns 1479.5 ns 2.15
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 20316 ns 20353.5 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1305124 ns 1346068.5 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31190.5 ns 31961 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4334 ns 4000 ns 1.08
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4041 ns 4416 ns 0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4083 ns 4500 ns 0.91
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4354 ns 4333 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 134077 ns 133606 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 8979794 ns 9495102 ns 0.95
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 148416.5 ns 147546.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57500 ns 57500 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46667 ns 46333 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39917 ns 39750 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83500 ns 82562.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37564 ns 36967.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 567840.5 ns 548600 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 80616 ns 80581 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2038666 ns 2024000 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2081166 ns 2088104 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2084042 ns 2081875 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1991875 ns 1983520.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 223666 ns 218972 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7677352 ns 7891968 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1187113 ns 973560 ns 1.22
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 146541 ns 145834 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 148041.5 ns 172583 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151625 ns 151875.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 176750 ns 176250 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166355.5 ns 167986 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7478548 ns 7801350.5 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 190117 ns 197777 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1106833.5 ns 1108729.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1109708 ns 1105292 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1125750 ns 1119062.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1112687.5 ns 1108749.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 654461 ns 642887 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33783553 ns 33405409 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1021271 ns 1027070 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5333 ns 6083 ns 0.88
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5125 ns 4937.5 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5750 ns 5896 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5084 ns 5750 ns 0.88
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 83746 ns 83848 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5563998.5 ns 5356951.5 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 61491 ns 69841 ns 0.88
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8792 ns 9000 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8625 ns 9042 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9250 ns 9042 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8417 ns 8542 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 559136 ns 556012 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 34995936.5 ns 37949872 ns 0.92
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 392504 ns 395964 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17083 ns 18791 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18000 ns 16875 ns 1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18791.5 ns 20917 ns 0.90
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17708.5 ns 22791.5 ns 0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 63135.5 ns 61826 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3027434.5 ns 3296125 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 74881 ns 76391 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218791 ns 211083 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212063 ns 218583.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213375 ns 221999.5 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218250 ns 211500 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 334874 ns 328054 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 15538427 ns 14617604.5 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 465885 ns 468680 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 750 ns 0.83
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 708 ns 666.5 ns 1.06
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 792 ns 917 ns 0.86
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 19376 ns 19270 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1181689 ns 1164614.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 30801 ns 31200 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1375 ns 1459 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1417 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1500 ns 1500 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1375 ns 1459 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 115818 ns 115345.5 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8578264 ns 8786881.5 ns 0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 125221.5 ns 136362 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7333 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5958 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5458 ns 5458 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10375 ns 10167 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24404 ns 23777 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1185331.5 ns 1195053 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 47150 ns 49421 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 259875 ns 228791 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 239750 ns 262833 ns 0.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 238375 ns 244208 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212937.5 ns 227438 ns 0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 194467.5 ns 188310 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30488731 ns 30683195 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 603521 ns 646667 ns 0.93
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 4125 ns 0.96
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 3916 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4084 ns 4083 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23361 ns 23548.5 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 1914869 ns 2046712.5 ns 0.94
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 47581 ns 49050 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16958 ns 16750 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16875 ns 16833 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16667 ns 16833 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16750 ns 17000 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 186194.5 ns 184716.5 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 9861733 ns 10810606 ns 0.91
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 172361.5 ns 178062 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 490917 ns 491291 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 385541 ns 385708 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 313292 ns 313250 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 846958.5 ns 846667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113486.5 ns 113504.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 398692.5 ns 400320 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 245177.5 ns 243402 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2139937 ns 2157041.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1863583 ns 1860000 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1584583.5 ns 1596917 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3114083 ns 3118291.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 229713.5 ns 228877.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11955773.5 ns 9523997.5 ns 1.26
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 745073 ns 743298 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7104 ns 6541.5 ns 1.09
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6792 ns 6167 ns 1.10
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7083 ns 7145.5 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6229.5 ns 6416 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 83179 ns 82766.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 6726845 ns 5786455 ns 1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 59261 ns 67260 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10250 ns 11708.5 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11458 ns 10333 ns 1.11
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11895.5 ns 12417 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11166.5 ns 10375 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 592614 ns 599572.5 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 37936205.5 ns 36065836.5 ns 1.05
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 410389 ns 415124 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23257 ns 23681.5 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2214765 ns 2157030 ns 1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 48421 ns 49180 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2167 ns 2125 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2208 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 230148 ns 230420 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11848732 ns 10946869 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 178962 ns 182202 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8917 ns 9208 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8917 ns 8666.5 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10083 ns 9917 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9208 ns 8792 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 99883.5 ns 100396.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3281834 ns 3318002.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 73811 ns 75271 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17438 ns 17229.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17125 ns 18479.5 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 19125 ns 18625 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17375 ns 18000 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 574862.5 ns 575393.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 17368412 ns 16729549.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 382279 ns 385864 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34631 ns 34044 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1211808 ns 1236371 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 48701 ns 48691 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9146 ns 9625.5 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9521 ns 9541.5 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10229.5 ns 9709 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8604 ns 8833.5 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 260130.5 ns 254859 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19439989.5 ns 19246352.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 367164 ns 375034 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396854.5 ns 397208 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288229.5 ns 287667 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215042 ns 215291 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755958 ns 755625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112250 ns 112458 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 328996 ns 340204 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 75451 ns 76851 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1462500 ns 1468271 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136041 ns 1130458 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 860334 ns 858125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2439875 ns 2440187.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 199853.5 ns 199457 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 9985334 ns 9886202 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 324698 ns 322043 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7000 ns 8021.5 ns 0.87
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7687.5 ns 7875 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8375 ns 8750 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7499.5 ns 7125 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 138856 ns 134916.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 6055720 ns 5780710 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 60111 ns 70255.5 ns 0.86
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15874.5 ns 16917 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16271 ns 15042 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15792 ns 15979 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13125.5 ns 16000 ns 0.82
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 911828 ns 878404 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42608795.5 ns 41935612.5 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 429664 ns 433994 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24000 ns 28792 ns 0.83
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25958 ns 25792 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26833.5 ns 28833.5 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24937.5 ns 30354.5 ns 0.82
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 189463 ns 183000.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7536335 ns 7959277.5 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 112782 ns 115401 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 146084 ns 112375 ns 1.30
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 152541.5 ns 144438 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 105833 ns 105854.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 153500 ns 150875 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1027043 ns 977911 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41813684 ns 41813067 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 587426 ns 589736 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74042 ns 74166 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 84500 ns 74604 ns 1.13
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74917 ns 77333 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74333 ns 76334 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 195104 ns 189045 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7388961 ns 7503392 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 121551 ns 128881 ns 0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 281250 ns 295667 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 290833 ns 307166 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 244667 ns 300000 ns 0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 297125 ns 276875.5 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1044893 ns 986480 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 40287331 ns 40933470 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 693978 ns 697017.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12583.5 ns 13166.5 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13333.5 ns 13229 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14000 ns 14833.5 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13125 ns 13667 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 137568.5 ns 133538.5 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5655781 ns 5773755.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 235892 ns 236113 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27458 ns 27000 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 28437 ns 27500 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27583 ns 27187.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25396 ns 27438 ns 0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 925629.5 ns 917467.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42183215 ns 39999839 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 696807 ns 698258 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 10583.5 ns 11209 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11729 ns 11292 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14020.5 ns 13375 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11166 ns 11083 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 119207 ns 119722.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3459447 ns 3349179 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 241797.5 ns 240142 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22228.5 ns 23333 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22979 ns 23084 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 24041 ns 24000 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 22958 ns 21958 ns 1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 679984 ns 678230.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 21093495.5 ns 22343314.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 675492 ns 678857 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 65145.5 ns 65021 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 69062 ns 62875 ns 1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 67375 ns 68667 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 63250 ns 66417 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102654.5 ns 101393 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3365331 ns 3400903 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 244962 ns 236963 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 512250 ns 477895.5 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 511875 ns 476959 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 467958.5 ns 468750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 464791 ns 495833 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 497974 ns 488817 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19959026 ns 20464230 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 716037 ns 715823 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7458 ns 7146 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7479.5 ns 8375 ns 0.89
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8791.5 ns 8500 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7000 ns 7021 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 136611.5 ns 136539.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5668588 ns 5535345 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 59181 ns 69291 ns 0.85
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16084 ns 11458 ns 1.40
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16104 ns 14500 ns 1.11
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15145.5 ns 16125 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15292 ns 13416 ns 1.14
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 892529 ns 886518 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 37494483 ns 37792827 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 399300 ns 407319.5 ns 0.98
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6148250 ns 6154209 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6373958.5 ns 6370021 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 3229667 ns 3225542 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11910625 ns 11912875 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 348836 ns 345647 ns 1.01
batchedmm(512, Bsize=4)/forward/GPU/oneAPI 48313142 ns 49342806 ns 0.98
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 303493 ns 305758 ns 0.99
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19111312.5 ns 19108188 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19956500 ns 19939624.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 11118833 ns 11149250 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36495125 ns 36445875 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1010983 ns 1059965 ns 0.95
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI 77165819 ns 79558988 ns 0.97
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1185177 ns 1166672 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1000 ns 1000 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1042 ns 1000 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 959 ns 959 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23306 ns 23689 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2102392 ns 2151476.5 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 210582 ns 209622 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3958 ns 3917 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3959 ns 4041 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4041 ns 4000 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3958 ns 3916 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 274898 ns 274634 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10835037 ns 10742838 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 633051.5 ns 625596 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7667 ns 7292 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8271 ns 9000 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10167 ns 10250 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7999.5 ns 9062.5 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 116562 ns 116615 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3281805 ns 3546009 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 68781 ns 69341 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11833 ns 12000 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12271 ns 12667 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 13500 ns 12437.5 ns 1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12209 ns 12417 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 610392 ns 605595 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 20835527 ns 22519876 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 356904 ns 363803 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22489 ns 22597.5 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2031329 ns 2178291 ns 0.93
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 49170 ns 48315.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3042 ns 2834 ns 1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2958 ns 2916 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3209 ns 3167 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2834 ns 3083 ns 0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 196092.5 ns 194557 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9721843.5 ns 9614403 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 166151.5 ns 170192 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11542 ns 11333 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12584 ns 11459 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13333.5 ns 13708 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10959 ns 12333 ns 0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 115616.5 ns 115903.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3435294.5 ns 3311083 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 238972 ns 239372.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 22250 ns 20792 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 22583.5 ns 23500 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22875 ns 22395.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23104.5 ns 21458.5 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 561561 ns 558538 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19972002 ns 19541146 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 652206 ns 657037 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4167 ns 4375 ns 0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4167 ns 1.05
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4458 ns 4417 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24400 ns 24750 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2160362 ns 2038545 ns 1.06
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 49090 ns 49870 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16167 ns 16708 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16625 ns 16167 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16291 ns 16500 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16584 ns 16667 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 320232 ns 317514 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12103289.5 ns 12292699 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 205902 ns 212047.5 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2084 ns 2083 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2209 ns 2125 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2167 ns 2125 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2125 ns 2083 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35395 ns 35083 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1121264.5 ns 1184726 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 218222 ns 206953 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17896 ns 17250 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 17916 ns 18667 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 19125 ns 19584 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 18146 ns 20125 ns 0.90
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 286121 ns 284678 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20551833.5 ns 20274746 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 685457 ns 691617 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 60208.5 ns 60292 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 65458 ns 66792 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 60938 ns 62000 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 53875 ns 51125 ns 1.05
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66633 ns 66448 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/oneAPI 86298273 ns 87696389 ns 0.98
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 102431 ns 117412 ns 0.87
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 197791.5 ns 198916 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 162042 ns 167229 ns 0.97
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 137250 ns 141417 ns 0.97
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 295208 ns 300125 ns 0.98
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 211289 ns 209004 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI 152039178 ns 147263909.5 ns 1.03
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 510905 ns 620696.5 ns 0.82
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 123834 ns 82583 ns 1.50
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 123125 ns 140250 ns 0.88
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84312.5 ns 86417 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 90875 ns 116583 ns 0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193182.5 ns 191982.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5322780 ns 5863118 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 192502 ns 203942 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921875 ns 1921771 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1909416 ns 1908917 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1888250 ns 1919708 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1881750 ns 1924521 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510619 ns 504208.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26283882 ns 26294676.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 911709 ns 1070976 ns 0.85
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21603 ns 21855 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2089663 ns 2006228 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 42021 ns 41700 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 246530.5 ns 242053 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 9718939 ns 10350039 ns 0.94
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 183711 ns 183192 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8083 ns 9833 ns 0.82
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9791 ns 9833 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 12125 ns 11709 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8458 ns 10583 ns 0.80
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 115667.5 ns 116639.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3479265.5 ns 3403003.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 238712 ns 238567.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10334 ns 8875 ns 1.16
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10375 ns 10875 ns 0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10709 ns 10125 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10750 ns 9500 ns 1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 493762.5 ns 488952 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19419012 ns 20132943 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 631376 ns 630866 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58459 ns 57875 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46541 ns 46958 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39791 ns 39625 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82958 ns 82250 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39195 ns 38551 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1326636 ns 1316937 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77861 ns 79411 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1927333.5 ns 1922646 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1977312 ns 1979292 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1955167 ns 1942292 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1892417 ns 1900917 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 217765.5 ns 210456 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33483865 ns 33978774 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1004015.5 ns 1015680 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267875 ns 267333 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 277417 ns 269625 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 270958 ns 270729.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 278250 ns 269645.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 198525 ns 192987.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7684906 ns 7844239 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 283563 ns 285143 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 614937.5 ns 698604 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 658104 ns 671916.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 590146 ns 667416 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 646750.5 ns 626771 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1004951 ns 985897 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44721716 ns 45574369 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 899859 ns 913670 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2206250 ns 2218667 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2176625 ns 2215687 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2107416 ns 2220312.5 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2210708 ns 2213250 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 158799 ns 157769 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8305150 ns 8237698 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 412934 ns 425304 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5495166.5 ns 5486562 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5498084 ns 5529917 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5497292 ns 5524333.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5479145.5 ns 5488625 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 942447 ns 927722 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52379643 ns 53249072 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1717957 ns 1555466 ns 1.10
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 476375 ns 478042 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 344833 ns 346167 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 255667 ns 257167 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 909083 ns 909250 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46257.5 ns 46497 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 876632 ns 825183 ns 1.06
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 245143 ns 245473 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2148125 ns 2167292 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1855417 ns 1862208 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1588042 ns 1591771 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3122292 ns 3122542 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 253305 ns 255431 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 13286897 ns 12961347 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 772413 ns 773598 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57958.5 ns 57520.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 45791.5 ns 46708 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39417 ns 39292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82625 ns 82500 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28551 ns 28213 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1363872 ns 1370930 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 74231 ns 76011 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2040292 ns 2032125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2064375 ns 2090250 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2084167 ns 2068583 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1983271 ns 1997000 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 225739 ns 223132 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35716396.5 ns 35910018 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1031871 ns 1194083 ns 0.86
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58333 ns 57812.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46834 ns 46708 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39667 ns 39583 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83000 ns 82375 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48471 ns 48361 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 789293.5 ns 762273.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 71026 ns 80795.5 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1926917 ns 1928084 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1963709 ns 1964958 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1974354 ns 1966541.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1891625 ns 1886625 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 232200 ns 230366 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17717639 ns 16959659 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 916564 ns 920174 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 333 ns 0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33909 ns 33705 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1226571 ns 1253501.5 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 45910 ns 45940 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 5916 ns 6646 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7187.5 ns 7395.5 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7459 ns 7292 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6417 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 201066 ns 201838.5 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20694042 ns 21257580 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 365424 ns 371664 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32008 ns 32336 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1150720 ns 1213220 ns 0.95
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 37940 ns 37120 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2709 ns 3292 ns 0.82
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3041 ns 3000 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3708 ns 3125 ns 1.19
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3500 ns 2666 ns 1.31
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 181870 ns 182468 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7654347.5 ns 7479362 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 149631 ns 151261 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 491875 ns 502687.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 465938 ns 491916.5 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 469979 ns 465083.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 495375 ns 498417 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134587.5 ns 134412 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6261994 ns 5713043 ns 1.10
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 348083 ns 367259 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4056250 ns 4072041 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4071312.5 ns 4093021 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4083458.5 ns 4069979 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4067500 ns 4043667 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 675142 ns 669547 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34719295 ns 34596141 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1296728 ns 1474565 ns 0.88
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49815354 ns 49859062 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35531875 ns 35504667 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 25976083 ns 26029000 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96976979 ns 96942959 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1620332 ns 1621240 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/oneAPI 55439103 ns 55961032 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1059456 ns 1046111 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154432166.5 ns 154467896 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112364500.5 ns 112182625 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 88728958 ns 89208292 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 298587354.5 ns 294884062.5 ns 1.01
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6497993.5 ns 6486949 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI 126106582 ns 128111295 ns 0.98
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5589506 ns 5579662.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 18292 ns 19541 ns 0.94
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 17542 ns 18625 ns 0.94
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 13625 ns 13917 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 16583.5 ns 15458.5 ns 1.07
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19675 ns 20271 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1142269.5 ns 1104775.5 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 27480 ns 26071 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 11000 ns 10729.5 ns 1.03
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9020.5 ns 9000 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 7792 ns 8125 ns 0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17375 ns 17291 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 242665 ns 244379 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10148653 ns 10081500 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 144671.5 ns 148582 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7958.5 ns 8374.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9125 ns 8750 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10375 ns 10833 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7833.5 ns 9104.5 ns 0.86
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 117743.5 ns 120247 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3571636.5 ns 3746738 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 238312 ns 239122.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9083 ns 9437.5 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10188 ns 9708 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11500 ns 11792 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 9500 ns 1
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 580494.5 ns 585732.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 24076504 ns 22572008 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 649931.5 ns 659212 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9416 ns 9083.5 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9709 ns 9833.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10458 ns 10375 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9396 ns 9438 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 114984 ns 116564 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3341616 ns 3425324 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71321 ns 75361 ns 0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13916.5 ns 13958 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13541.5 ns 13291.5 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17208.5 ns 16625 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13187.5 ns 13750 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 552056 ns 556648.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20781499.5 ns 19935565.5 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 344233 ns 351184 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33628 ns 33504 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1186325 ns 1200134 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 207932 ns 207882 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7437 ns 7542 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8584 ns 7958 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9666 ns 9542 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7354.5 ns 7625 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 221757 ns 223084.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22841477 ns 21568038 ns 1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 657467 ns 665587 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16583 ns 17958 ns 0.92
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 16958 ns 17584 ns 0.96
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 12354 ns 13334 ns 0.93
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 11625 ns 10833.5 ns 1.07
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 19779 ns 20393 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1178666.5 ns 1168335 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 191642 ns 191442 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 35375 ns 35542 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 35479 ns 35583 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 35479.5 ns 36208 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 35584 ns 35500 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 258411 ns 258577 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11074698.5 ns 11381817 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 591756 ns 591656 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 449333 ns 511813 ns 0.88
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 450125 ns 447292 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 463875 ns 456792 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 486917 ns 517125 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194667 ns 194619 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5885088 ns 5685561 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 347133 ns 368453.5 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4054500 ns 4055479 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4060604.5 ns 4065479.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4063834 ns 4057292 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4052291.5 ns 4051125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510233 ns 506270 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28172431.5 ns 28041384.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1353408.5 ns 1368029 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 780318375 ns 786875042 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 543371375 ns 540385750 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 415007687 ns 417627729 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1572225062.5 ns 1558687604 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22558969 ns 22789985.5 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/oneAPI 174041531 ns 176484643 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14555295 ns 14667995.5 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2500858833 ns 2512454792 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1786181583 ns 1772086292 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1510021583 ns 1545039084 ns 0.98
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 6317458166 ns 6322382417 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 119503116 ns 118300758 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI 931368955.5 ns 918719991.5 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87832876 ns 87803948.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76375 ns 76458.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 77083 ns 76958 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 83334 ns 78437 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 75354 ns 76937.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 194473.5 ns 191503.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8155928 ns 8039760 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 106291 ns 106691 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 277375 ns 279042 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 193666.5 ns 208625 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 291542 ns 282125 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 203875 ns 196250 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 999103 ns 989645.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42482446 ns 44408111.5 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 628231.5 ns 636782 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199366166.5 ns 199893333 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139444084 ns 139025625 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 103950000 ns 104051042 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388306958 ns 388708625 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5837076.5 ns 5839621 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/oneAPI 78178829 ns 79074303 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3620336 ns 3603877.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 617703104.5 ns 619152625 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 438890042 ns 439143666 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 352507250 ns 353463000 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1183186458 ns 1177182375 ns 1.01
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26786910.5 ns 26537180.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI 274964991 ns 276530657.5 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21952578.5 ns 22057437 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7291 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 6167 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5417 ns 5375 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 9792 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26517 ns 26296 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1160586 ns 1196971 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46431 ns 46670 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224854 ns 212500 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 230541 ns 219917 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229812.5 ns 223521 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207958 ns 208917 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 215879.5 ns 213879 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20490896 ns 20926055 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 528825 ns 531735 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6458 ns 8104 ns 0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9000 ns 8709 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9750 ns 10791.5 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8770.5 ns 9229 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 109989.5 ns 112861.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3318372 ns 3389305 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72691 ns 73211 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7666.5 ns 7542 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8417 ns 7542 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 11750 ns 10229.5 ns 1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7562.5 ns 7834 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 485874.5 ns 490362 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19877956 ns 19246537 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 315043 ns 323133 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 458 ns 458 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 750 ns 500 ns 1.50
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 750 ns 708 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 459 ns 459 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 25151 ns 24659 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1214235 ns 1256249 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 48561 ns 48770 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8833 ns 9250 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9542 ns 8479.5 ns 1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11834 ns 12291 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 8250 ns 9083 ns 0.91
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 245667 ns 245415 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23350383.5 ns 24116959 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 388103 ns 395734 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 111708 ns 112500.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 101708 ns 103271 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 87542 ns 88333 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 154542 ns 154625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 22556 ns 23200 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 822944.5 ns 818562 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 200302 ns 193152 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 576604.5 ns 578000 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 577208 ns 534875 ns 1.08
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 579583 ns 548917 ns 1.06
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 535334 ns 535333 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 215893 ns 215198 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11598893 ns 11436046 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 606916 ns 610641.5 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5500 ns 5000 ns 1.10
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 6187.5 ns 5416.5 ns 1.14
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7583 ns 7604.5 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 5646 ns 6625 ns 0.85
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16999 ns 17413 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/oneAPI 71875004 ns 72455521 ns 0.99
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 71250 ns 80361 ns 0.89
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12166.5 ns 11792 ns 1.03
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10833.5 ns 10791.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11104 ns 11208 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 16667 ns 17000 ns 0.98
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 203355.5 ns 203659.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI 97881235 ns 98210292 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 362713 ns 381654 ns 0.95
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 40375 ns 39542 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51334 ns 51459 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51083 ns 51333 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13625 ns 13520.5 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21217 ns 19998 ns 1.06
batchedmm(16, Bsize=128)/forward/GPU/oneAPI 78292175 ns 76386107.5 ns 1.02
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 81245.5 ns 89551 ns 0.91
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 37437.5 ns 36229.5 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 31833.5 ns 31458 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 30145.5 ns 30250 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57333 ns 57167 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 180954 ns 180703 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI 111821475 ns 112491463 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 393694 ns 412909.5 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1667 ns 1791 ns 0.93
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1875 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2583 ns 2125 ns 1.22
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1813 ns 0.87
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19103 ns 19867 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1181507 ns 1142759 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 29580 ns 34540 ns 0.86
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2291 ns 2042 ns 1.12
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2167 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2541 ns 2500 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2062.5 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 192587.5 ns 193884 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9137253 ns 9110958 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 137661 ns 138796.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5041 ns 5791 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4792 ns 4916 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6708 ns 6312.5 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5292 ns 4937.5 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 139532 ns 140483 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5873388 ns 5688843 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 61421 ns 70765.5 ns 0.87
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8250 ns 8375 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8583 ns 8292 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9333 ns 9917 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8062.5 ns 8291 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 812160 ns 811929.5 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 39105619 ns 40105318 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 390114 ns 393874 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55000 ns 55000 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55875 ns 55833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54333 ns 54292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56208 ns 56167 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36258 ns 36588.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1233762 ns 1189517 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 217247.5 ns 206632.5 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 523187.5 ns 486646 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 495646 ns 497020.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 509125 ns 505500 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 508354 ns 504479.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 258312 ns 256235 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27334844 ns 27551860 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 802628 ns 837064 ns 0.96
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3307500 ns 3311209 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2332208.5 ns 2324917 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 1767750 ns 1764917 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6289687.5 ns 6305667 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 205336 ns 204534 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/oneAPI 78138642 ns 77630538 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 213372 ns 220612.5 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11443687 ns 11424750.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8355854.5 ns 8337875 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 6598583.5 ns 6554562.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21066479 ns 21046187.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 735491 ns 736592 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI 121355919 ns 121665223 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1063901 ns 1067736 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7208 ns 6375 ns 1.13
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6604 ns 5146 ns 1.28
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7708 ns 7333 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4708 ns 4917 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 130238.5 ns 130414 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5600093 ns 5600903.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 55701 ns 56000 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7604 ns 7500 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7562.5 ns 7104.5 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8083 ns 7833 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7292 ns 6917 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 714522 ns 716948.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 35658157 ns 34048818 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 368784 ns 377284 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 98292 ns 100375 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 103667 ns 98042 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 127291 ns 101229 ns 1.26
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 122417 ns 121958 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 149309 ns 148678 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5831672 ns 5976414.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 183632 ns 203162 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2028041 ns 2025979.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2022292 ns 2023750 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2031625 ns 2027979 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2019021 ns 2028208 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 669751 ns 667124 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34116389.5 ns 32503605.5 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1113696 ns 1113981 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 32999.5 ns 34896 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36208 ns 36541.5 ns 0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 33125 ns 33000 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 542 ns 667 ns 0.81
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15437 ns 15608 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/oneAPI 72358742 ns 72119754.5 ns 1.00
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 84900 ns 83761 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2667 ns 2542 ns 1.05
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3000 ns 2875 ns 1.04
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3208 ns 3042 ns 1.05
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2250 ns 2125 ns 1.06
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 136315 ns 136848 ns 1.00
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI 92893398 ns 92906510 ns 1.00
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 350423 ns 357139 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5416 ns 5417 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10167 ns 9875 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35436 ns 35691 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1228537 ns 1119535 ns 1.10
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49691 ns 49751 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 232749.5 ns 239895.5 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221125 ns 219708 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 227541.5 ns 222104 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 205750 ns 206166 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 240533 ns 239376 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26122810 ns 27974510.5 ns 0.93
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 509435 ns 574776 ns 0.89
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3958 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3750 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21412 ns 22068 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2114597 ns 2145282 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42980 ns 42250 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14542 ns 14958 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14917 ns 14541 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14792 ns 14750 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14917 ns 14875 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 297410.5 ns 298530 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 10838818 ns 11632418 ns 0.93
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 196172 ns 196947 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 97937 ns 145083 ns 0.68
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 102750 ns 103646 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 130333 ns 105729.5 ns 1.23
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 127709 ns 113042 ns 1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132466 ns 132784 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5909094 ns 6087845 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 182122 ns 204547 ns 0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1924333 ns 1918083 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1920667 ns 1923042 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1921792 ns 1921375 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1912771 ns 1925292 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 659652 ns 658916 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31062786 ns 30625432 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1217372 ns 1069806 ns 1.14
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17625 ns 20959 ns 0.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18666.5 ns 17979.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21834 ns 22125 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17125 ns 18125 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 103789.5 ns 104444.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3441121 ns 3374722 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 75841 ns 81701 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229375 ns 229875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 217917 ns 223646 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 226458.5 ns 218125.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215521 ns 225125 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 496186 ns 492479 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 18765642 ns 19457097 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 473665 ns 483554.5 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 24313 ns 27374.5 ns 0.89
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 29875 ns 31063 ns 0.96
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 27375 ns 26708 ns 1.02
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1250 ns 1458 ns 0.86
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15897 ns 15690 ns 1.01
batchedmm(16, Bsize=4)/forward/GPU/oneAPI 71655631.5 ns 73206765 ns 0.98
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 87071 ns 89171 ns 0.98
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 5375.5 ns 4875 ns 1.10
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5083.5 ns 4896 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5459 ns 5250 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4834 ns 4542 ns 1.06
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 200684.5 ns 200612 ns 1.00
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI 92849344 ns 94501114 ns 0.98
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 389014 ns 394774 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 222083 ns 221875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 223166 ns 223209 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 224916.5 ns 225917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 227000 ns 223750 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 219523 ns 216221 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7712821.5 ns 7634874 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 274002.5 ns 277862 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 495292 ns 535958 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 549771 ns 499104 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 507520.5 ns 510167 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 497583 ns 508166 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1034369 ns 1024022 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42519004 ns 45569833 ns 0.93
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 850318.5 ns 864044 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19708 ns 25166 ns 0.78
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21375 ns 20166.5 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22292 ns 21750 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24792 ns 19167 ns 1.29
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111603.5 ns 111455.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3581394.5 ns 3479193 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 77006 ns 78821 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218812 ns 245354 ns 0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213041.5 ns 223375 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221958.5 ns 225417 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 250667 ns 218541 ns 1.15
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 710892 ns 707911 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24867084.5 ns 25617389 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 532655 ns 538875 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5959 ns 7125 ns 0.84
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6917 ns 6250 ns 1.11
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8708 ns 8666 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 6458 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 131648 ns 132297.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5786966.5 ns 5594794 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 65661 ns 67671 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10584 ns 10583 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10729.5 ns 10250 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11541 ns 10958 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10541 ns 10875 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 772200 ns 778959.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 37330612 ns 37279902 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 385494 ns 393784 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4833 ns 5250 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6354.5 ns 6167 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6604.5 ns 7583 ns 0.87
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5041 ns 5208 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 133064 ns 134141.5 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5822443 ns 5548829 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 57140 ns 69361 ns 0.82
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7209 ns 7834 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7666 ns 7667 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8042 ns 8125 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 7458 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 738153 ns 742994 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 40138762.5 ns 37148580 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 395034 ns 400934 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14423167 ns 14518042 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10121834 ns 10053875 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 7695041.5 ns 7724104 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27731208 ns 27741083 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 530060 ns 554321.5 ns 0.96
batchedmm(128, Bsize=512)/forward/GPU/oneAPI 94502665 ns 94275820 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 400144 ns 399814.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46295271.5 ns 46185458.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33585729.5 ns 33419604 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 26523271 ns 26602708.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85105834 ns 85208959 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2636621 ns 2813842 ns 0.94
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI 190779173 ns 194819687 ns 0.98
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3293333 ns 3323814 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 67125 ns 69583 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 68791 ns 66979 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69875 ns 70292 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67541 ns 67625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 116341 ns 102627 ns 1.13
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3481863 ns 3515302.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 238303 ns 232062 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 467979.5 ns 520062.5 ns 0.90
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 468833 ns 473208 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 479729 ns 482063 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 467333.5 ns 474708 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 704065 ns 703393 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26310960 ns 26797269 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 795648 ns 793873 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 541 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32111 ns 31962 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1221683 ns 1180122 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47180 ns 47320 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8375 ns 8583 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9417 ns 9583.5 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9584 ns 9541 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8416 ns 9667 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 277435.5 ns 278738.5 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20099617 ns 21728099.5 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 375813.5 ns 381274 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9459 ns 9666 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9625 ns 9459 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9708 ns 9667 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9625 ns 9666 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 22950 ns 23100 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2089156.5 ns 2057483 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 212492 ns 212922 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 50167 ns 50458 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50292 ns 50875 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50541 ns 50375 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 50375 ns 50209 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 272026 ns 273986 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11125411 ns 11648854 ns 0.96
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 611216 ns 610646 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55250 ns 54917 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55917 ns 55708 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54375 ns 54292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56041 ns 55875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27749 ns 27572 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1229944.5 ns 1222185 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 214587 ns 206592 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 485479 ns 522166 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 496084 ns 504250 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 537000.5 ns 503500 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 461291.5 ns 472833.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 237315 ns 236683 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32908722.5 ns 32890414.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 839118 ns 889849 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 651166.5 ns 653833 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645917 ns 639812.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 662000 ns 654166.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 641417 ns 643729 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190601 ns 186765 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8668801 ns 8191594 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 229822 ns 303073 ns 0.76
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2241917 ns 2228375 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2232875 ns 2240916.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2250458.5 ns 2265312.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2234417 ns 2228084 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 914905 ns 907493 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49141404 ns 49570533.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1359913 ns 1227082.5 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21375 ns 22083 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20938 ns 21333 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22583 ns 21416.5 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19167 ns 20208 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 109650 ns 108981.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3622083 ns 3615898 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 75660 ns 81661 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218833.5 ns 232104.5 ns 0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221084 ns 222250 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 235688 ns 228583 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221125.5 ns 259708 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 709252 ns 701359 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 25088612.5 ns 27641264 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 553695 ns 557775.5 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 541 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23372.5 ns 22562 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1180770.5 ns 1174965 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 49900 ns 48641 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9874.5 ns 9896 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9708 ns 10166 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10229.5 ns 9979.5 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9334 ns 10646 ns 0.88
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 259739 ns 259541 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25804898 ns 25096956 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 401304 ns 406314 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9541 ns 10000 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9187.5 ns 8875 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10833 ns 10333 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8875 ns 9625 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 113457.5 ns 114946 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3378008 ns 3356422 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 69850 ns 75001 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7625 ns 7312.5 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7937.5 ns 7833 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8375 ns 7833 ns 1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7541.5 ns 7645.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 474853 ns 479855 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17576598 ns 17554055 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 322123 ns 327064 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1500 ns 1375 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1666.5 ns 1834 ns 0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2187.5 ns 2125 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1542 ns 1708 ns 0.90
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19317 ns 19733 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1172938.5 ns 1143637.5 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 192092 ns 192542 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3542 ns 3542 ns 1
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3625 ns 3584 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3833 ns 3875 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3500 ns 3500 ns 1
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 209093.5 ns 210034.5 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10006581.5 ns 10599117 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 581056 ns 584616 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148416 ns 148333.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 127541.5 ns 129000 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 107500 ns 107396 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225042 ns 233604.5 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 22459 ns 23312 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1201113 ns 1181923 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 37415.5 ns 41095.5 ns 0.91
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 143666.5 ns 161208.5 ns 0.89
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 110916 ns 140708 ns 0.79
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 100875 ns 104000 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 250834 ns 259375 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 206476 ns 208046 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10778609 ns 11091691.5 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 220822 ns 267983 ns 0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7270.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5959 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 5333 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 9959 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33038 ns 32872 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1161067.5 ns 1199319 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48271 ns 50331 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220021 ns 258729 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227708 ns 234500 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 243333 ns 238125 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212750 ns 253021 ns 0.84
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 256906 ns 256256.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27263274.5 ns 27890996 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 522055 ns 595296 ns 0.88
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12333 ns 13000 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 13020.5 ns 12396 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14333.5 ns 14500 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12917 ns 12500 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 131126.5 ns 131871 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5521631 ns 5626771 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 235402 ns 236102 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24520.5 ns 23854.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24187 ns 24500 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25354.5 ns 25187.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23625 ns 24750 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 816371.5 ns 821231 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 39369345 ns 40073814 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 684572 ns 689137 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9208 ns 9167 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10042 ns 9834 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11167 ns 11417 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9625 ns 8999.5 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 116949.5 ns 119274.5 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3478536 ns 3523753.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 70201 ns 76811 ns 0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14250 ns 14083 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13771 ns 14166.5 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15416 ns 15104 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13958 ns 14083 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 627909.5 ns 630553.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21438120 ns 21897908 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 377354 ns 373463 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8958 ns 9021 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10437.5 ns 9875 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11750 ns 11250 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9166 ns 9750 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 115964 ns 117966.5 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3401614.5 ns 3400750 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72371 ns 77501 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13208 ns 12854 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12854 ns 12937 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13958 ns 13187.5 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12416 ns 13166 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 516349.5 ns 522874 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19477250 ns 19612958 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 339683.5 ns 349524 ns 0.97
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30291.5 ns 30958.5 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34041.5 ns 34895.5 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 30042 ns 30208 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2083 ns 2042 ns 1.02
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16187 ns 16552 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/oneAPI 75928615 ns 76609794 ns 0.99
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 78561 ns 87451 ns 0.90
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5291.5 ns 5375 ns 0.98
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5499.5 ns 5229 ns 1.05
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5375 ns 5395.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6375 ns 6417 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 135964 ns 135958 ns 1.00
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI 110752109 ns 111332262.5 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 382864 ns 390584 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 291 ns 291 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 417 ns 375 ns 1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24855 ns 24266 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1239551 ns 1220615 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 48910 ns 49051 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6459 ns 6458 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6604 ns 6792 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7208.5 ns 6875 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6125 ns 6375 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 180794 ns 181716 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 24106911.5 ns 22738910 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 390139 ns 394694 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 2000 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2125 ns 2125 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2125 ns 2125 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2042 ns 1959 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25818 ns 25193 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1193002 ns 1233759.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 219547 ns 207422 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17500.5 ns 16937.5 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17833.5 ns 17583 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18437.5 ns 17666 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17500 ns 17167 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 264425 ns 266060 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 24505308 ns 25037224.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 705652 ns 702687 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 178208 ns 177959 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 165145.5 ns 151000 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 179042 ns 151250 ns 1.18
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 151292 ns 156666 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 187400 ns 185813 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7801096 ns 8186035 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 191502 ns 213762 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1317104 ns 1294417 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1320125 ns 1322667 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1331937 ns 1326979.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1318125.5 ns 1325125 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 859849 ns 850017 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 43918638 ns 46207436 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1005140 ns 1106552 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24084 ns 25687.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24708 ns 25000 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28063 ns 27125 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 26291.5 ns 27375 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 226248 ns 226385 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8086333 ns 7541451 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115141 ns 115741 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 160416.5 ns 180771 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 132958 ns 134583.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 127937.5 ns 175167 ns 0.73
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 124437.5 ns 164479 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 978646 ns 971603.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45755327 ns 45326263 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 587856 ns 614401.5 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22971 ns 22475 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1181802 ns 1258351.5 ns 0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 48630 ns 48960 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6333 ns 6458.5 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6729.5 ns 6875 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7291 ns 6875 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6541.5 ns 6458.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 197400 ns 197699 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 24703832 ns 25220935 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 392804 ns 395854 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5584 ns 5666 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6958 ns 6542 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8021 ns 6416 ns 1.25
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5875 ns 6167 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 135487.5 ns 136571.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5687030 ns 5759376 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 235072 ns 236832 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10083.5 ns 10167 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10458.5 ns 10250 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10500 ns 10708.5 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9833.5 ns 10021 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 841087.5 ns 843659.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 41023608 ns 42177959 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 675251.5 ns 680842 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 708 ns 708 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 708 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 750 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 708 ns 667 ns 1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22206 ns 22622 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2616381.5 ns 2092408 ns 1.25
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 209832 ns 211377.5 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4875 ns 4958 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4917 ns 5167 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5208 ns 5125 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4834 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 215367.5 ns 217676 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 11863776 ns 10379046 ns 1.14
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 591926 ns 586156 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7729.5 ns 7646 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7958 ns 8458 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9833 ns 10000.5 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7750.5 ns 8625 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 115622 ns 117310.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3536818 ns 3542404 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 71851 ns 77011 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8542 ns 8167 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8687.5 ns 8792 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9520.5 ns 9541 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8520.5 ns 8500 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 552863 ns 559897.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 20606879 ns 21100984 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 343673.5 ns 351894 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 127854 ns 129875 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 128834 ns 131334 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 96354 ns 98500 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183167 ns 183000 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45982 ns 45933 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/oneAPI 72286847 ns 73470628 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 95811 ns 104986 ns 0.91
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 330459 ns 320833 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 332334 ns 340500 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 197417 ns 196229 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 571042 ns 614646 ns 0.93
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 183822.5 ns 184661 ns 1.00
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI 93731117 ns 95503191 ns 0.98
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 473290 ns 520426 ns 0.91
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397125 ns 397833 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288229 ns 287792 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215375 ns 215167 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756375 ns 756459 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43348 ns 43884 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1384285 ns 1380208.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 79971 ns 82001 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1459375 ns 1449083 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1132396 ns 1131416 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 862770.5 ns 862375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2442500 ns 2444146 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 239777 ns 248740 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 13231788 ns 11082909 ns 1.19
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 351138.5 ns 350333 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 647458 ns 652083 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 649666 ns 652854 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 655021 ns 654417 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 641583.5 ns 661125 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 178508 ns 184615 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8381344 ns 8038741 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 240322 ns 311568 ns 0.77
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2454875 ns 2443958.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2450333 ns 2461416.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2461646 ns 2443812.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2458334 ns 2444771 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 938639.5 ns 932610 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52014786 ns 51927904 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1448719 ns 1324133 ns 1.09
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33146 ns 34083.5 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35708 ns 36437.5 ns 0.98
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 32000 ns 33771 ns 0.95
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 875 ns 834 ns 1.05
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15683 ns 15954 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/oneAPI 73122838 ns 74465713 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 71645.5 ns 84121 ns 0.85
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3187.5 ns 3042 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3458 ns 3208 ns 1.08
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3541 ns 3416 ns 1.04
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3083 ns 3084 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 134592 ns 134871 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI 97284653 ns 101832238 ns 0.96
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 337323.5 ns 355194 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 439375 ns 435000 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 440583 ns 441208 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 431375 ns 431291 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 450375 ns 449458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 42224 ns 42183 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1392161 ns 1418032 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 237893 ns 241737 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4138958 ns 4139000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4247291.5 ns 4281375 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4262792 ns 4272125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4028416.5 ns 4043500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 233746 ns 231383.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 36534446 ns 38875009 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1234322 ns 1238087.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3709 ns 3917 ns 0.95
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3750 ns 1.04
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3916 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34090 ns 34290 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1239089 ns 1242809 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 40520 ns 40730 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15291 ns 15750 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15958 ns 15500 ns 1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15750 ns 15708 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15667 ns 15667 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 251120.5 ns 253133 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8891050 ns 8969271 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 171192 ns 178362 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404125 ns 404000 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295250 ns 295666 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 220625 ns 221167 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760666 ns 760500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113428 ns 113399 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1051037 ns 1019290 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89110.5 ns 89320 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1479125 ns 1474312.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1156270.5 ns 1157021 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 886792 ns 884958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2464333 ns 2465875 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 227639.5 ns 244167 ns 0.93
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 12228324 ns 11671477 ns 1.05
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 352474 ns 354019 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 625 ns 666 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24868 ns 24808 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1263047 ns 1214092.5 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 214292 ns 210112 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7541 ns 7916 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7917 ns 8167 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8125 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7667 ns 7459 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 202491 ns 203590.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25565257 ns 24613685 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 687187 ns 690937 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 830417 ns 832166.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 617334 ns 619583 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 467125 ns 472250 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1539875 ns 1542500 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130469 ns 130624 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/oneAPI 74138060 ns 75509279 ns 0.98
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 167662 ns 236082 ns 0.71
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2680895.5 ns 2694208.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1979750 ns 1991375 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1532167 ns 1537625 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4935708 ns 4930000 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 233179 ns 233850 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI 101283369 ns 102808354 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 855698 ns 768638 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31956 ns 31761 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1162026.5 ns 1224489 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 49090 ns 47050 ns 1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6187 ns 6417 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6770.5 ns 6792 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7042 ns 6666 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 217529.5 ns 219075.5 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22613407 ns 23474742 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 355723.5 ns 362424 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1750042 ns 1776458 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1774250 ns 1755459 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1759417 ns 1754000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1775625 ns 1755666 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 177451 ns 183229.5 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8059544 ns 8315915 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 355403 ns 375104 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4352125 ns 4353771 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4360770.5 ns 4398479 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4377083.5 ns 4376083 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4357583 ns 4351333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 843625 ns 833369 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47645217 ns 47106002 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1390698 ns 1251643 ns 1.11
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 14562.5 ns 7083.5 ns 2.06
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 9667 ns 7104 ns 1.36
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 8292 ns 7375 ns 1.12
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6666.5 ns 6834 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 22207 ns 22695 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1231018 ns 1216626 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 37720 ns 37200 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 64458.5 ns 48479.5 ns 1.33
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 70792 ns 50874.5 ns 1.39
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 45708 ns 47979 ns 0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 49521 ns 47208 ns 1.05
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 204835 ns 207872 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10627124.5 ns 10801241 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 233202 ns 234813 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21292 ns 22854 ns 0.93
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 24770.5 ns 26375 ns 0.94
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 22334 ns 23146 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 7416 ns 5333 ns 1.39
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17630 ns 17805 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/oneAPI 87889435 ns 89168517 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 90301 ns 90691 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12187 ns 12083 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10625 ns 10208.5 ns 1.04
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 9750 ns 9583 ns 1.02
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 18041.5 ns 18104.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 216733.5 ns 217973 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI 151365483 ns 150119195 ns 1.01
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 384574 ns 389829 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 405417 ns 405958 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297333 ns 297166.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 223417 ns 223625 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762625 ns 762167 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46368 ns 46720 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1390104 ns 1360027 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 90091 ns 90521 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1487792 ns 1491042 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1159187.5 ns 1165750 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 892375 ns 892791.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2470895.5 ns 2470333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 267416 ns 279542.5 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 13880635 ns 11213824.5 ns 1.24
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 378473.5 ns 375414 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 436458 ns 436000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 438916 ns 440750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 431708 ns 432000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 450167 ns 449042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 53539 ns 54332 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1016245 ns 999725 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 235682 ns 237743 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4143041 ns 4137041.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4257999.5 ns 4271042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4266292 ns 4270646 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4032437.5 ns 4030959 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 253837 ns 253348 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31122046.5 ns 32411933.5 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1206682 ns 1223273 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9208 ns 9458 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 8167 ns 8000 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7208 ns 7209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 13416 ns 13458 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 23370 ns 24044 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2190811 ns 2135292 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 212852 ns 214732 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 49416 ns 49833 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50083 ns 49750 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 49541 ns 49458 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 49667 ns 49500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 331181 ns 335918.5 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12227793 ns 12693187 ns 0.96
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 657676 ns 656617 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 123458 ns 136583 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 85271 ns 82145.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 127292 ns 85583 ns 1.49
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 108541.5 ns 83104 ns 1.31
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191180.5 ns 191318.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6110005 ns 5843078 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 200667 ns 205972 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2014999.5 ns 2013959 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1877583 ns 2017792 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2016083 ns 2022958 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2015916 ns 2019333 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510301 ns 508706 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27606531 ns 28081381 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 943229 ns 1089431 ns 0.87

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.