This repository has been archived by the owner on Nov 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
8 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "LuxLib" | ||
uuid = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.3.2" | ||
version = "1.3.3" | ||
|
||
[deps] | ||
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
301b59c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
301b59c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/117022
Tip: Release Notes
Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:
301b59c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LuxLib Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5125
ns5209
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6937.5
ns5208
ns1.33
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7417
ns7291
ns1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6083
ns6208
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
104885
ns115729
ns0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
2678307
ns2692776
ns0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
401685
ns408504
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9917
ns10083
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10042
ns10208
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10750
ns10375
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9729
ns9833
ns0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
495998
ns496762
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
18744208
ns17703724
ns1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
680377
ns10961843
ns0.06206775630703706
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1458
ns1312
ns1.11
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1541.5
ns1500
ns1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1750
ns1875
ns0.93
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3187.5
ns1479.5
ns2.15
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
20316
ns20353.5
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI
1305124
ns1346068.5
ns0.97
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU
31190.5
ns31961
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4334
ns4000
ns1.08
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4041
ns4416
ns0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4083
ns4500
ns0.91
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
4354
ns4333
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
134077
ns133606
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI
8979794
ns9495102
ns0.95
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU
148416.5
ns147546.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57500
ns57500
ns1
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46667
ns46333
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
39917
ns39750
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83500
ns82562.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37564
ns36967.5
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
567840.5
ns548600
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
80616
ns80581
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2038666
ns2024000
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2081166
ns2088104
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2084042
ns2081875
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1991875
ns1983520.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
223666
ns218972
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
7677352
ns7891968
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1187113
ns973560
ns1.22
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
146541
ns145834
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
148041.5
ns172583
ns0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
151625
ns151875.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
176750
ns176250
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166355.5
ns167986
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7478548
ns7801350.5
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
190117
ns197777
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1106833.5
ns1108729.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1109708
ns1105292
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1125750
ns1119062.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1112687.5
ns1108749.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
654461
ns642887
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33783553
ns33405409
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1021271
ns1027070
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5333
ns6083
ns0.88
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5125
ns4937.5
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5750
ns5896
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5084
ns5750
ns0.88
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
83746
ns83848
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
5563998.5
ns5356951.5
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
61491
ns69841
ns0.88
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8792
ns9000
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8625
ns9042
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9250
ns9042
ns1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8417
ns8542
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
559136
ns556012
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
34995936.5
ns37949872
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
392504
ns395964
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17083
ns18791
ns0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18000
ns16875
ns1.07
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18791.5
ns20917
ns0.90
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17708.5
ns22791.5
ns0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
63135.5
ns61826
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3027434.5
ns3296125
ns0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
74881
ns76391
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218791
ns211083
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212063
ns218583.5
ns0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213375
ns221999.5
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
218250
ns211500
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
334874
ns328054
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
15538427
ns14617604.5
ns1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
465885
ns468680
ns0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
625
ns750
ns0.83
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
708
ns666.5
ns1.06
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
792
ns917
ns0.86
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
667
ns625
ns1.07
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
19376
ns19270
ns1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI
1181689
ns1164614.5
ns1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU
30801
ns31200
ns0.99
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1375
ns1459
ns0.94
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1417
ns1417
ns1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1500
ns1500
ns1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1375
ns1459
ns0.94
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
115818
ns115345.5
ns1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI
8578264
ns8786881.5
ns0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU
125221.5
ns136362
ns0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7291
ns7333
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6125
ns5958
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5458
ns5458
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10375
ns10167
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
24404
ns23777
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1185331.5
ns1195053
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
47150
ns49421
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
259875
ns228791
ns1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
239750
ns262833
ns0.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
238375
ns244208
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212937.5
ns227438
ns0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
194467.5
ns188310
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
30488731
ns30683195
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
603521
ns646667
ns0.93
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3958
ns4125
ns0.96
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4084
ns3916
ns1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4125
ns4125
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4084
ns4083
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23361
ns23548.5
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI
1914869
ns2046712.5
ns0.94
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU
47581
ns49050
ns0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16958
ns16750
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16875
ns16833
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16667
ns16833
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16750
ns17000
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
186194.5
ns184716.5
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI
9861733
ns10810606
ns0.91
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU
172361.5
ns178062
ns0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
490917
ns491291
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
385541
ns385708
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
313292
ns313250
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
846958.5
ns846667
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113486.5
ns113504.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI
398692.5
ns400320
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU
245177.5
ns243402
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2139937
ns2157041.5
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1863583
ns1860000
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1584583.5
ns1596917
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3114083
ns3118291.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
229713.5
ns228877.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI
11955773.5
ns9523997.5
ns1.26
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
745073
ns743298
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
7104
ns6541.5
ns1.09
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6792
ns6167
ns1.10
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7083
ns7145.5
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6229.5
ns6416
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
83179
ns82766.5
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
6726845
ns5786455
ns1.16
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
59261
ns67260
ns0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10250
ns11708.5
ns0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11458
ns10333
ns1.11
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11895.5
ns12417
ns0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11166.5
ns10375
ns1.08
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
592614
ns599572.5
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
37936205.5
ns36065836.5
ns1.05
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
410389
ns415124
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
500
ns541
ns0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
583
ns542
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23257
ns23681.5
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI
2214765
ns2157030
ns1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU
48421
ns49180
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2167
ns2125
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2125
ns2167
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2208
ns2208
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2125
ns2125
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
230148
ns230420
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI
11848732
ns10946869
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU
178962
ns182202
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8917
ns9208
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8917
ns8666.5
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
10083
ns9917
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9208
ns8792
ns1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
99883.5
ns100396.5
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
3281834
ns3318002.5
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
73811
ns75271
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17438
ns17229.5
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17125
ns18479.5
ns0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
19125
ns18625
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17375
ns18000
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
574862.5
ns575393.5
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
17368412
ns16729549.5
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
382279
ns385864
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
459
ns500
ns0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
583
ns583
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns583
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns583
ns0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
34631
ns34044
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
1211808
ns1236371
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
48701
ns48691
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9146
ns9625.5
ns0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9521
ns9541.5
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10229.5
ns9709
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8604
ns8833.5
ns0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
260130.5
ns254859
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
19439989.5
ns19246352.5
ns1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
367164
ns375034
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
396854.5
ns397208
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288229.5
ns287667
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
215042
ns215291
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
755958
ns755625
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
112250
ns112458
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI
328996
ns340204
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU
75451
ns76851
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1462500
ns1468271
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1136041
ns1130458
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
860334
ns858125
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2439875
ns2440187.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
199853.5
ns199457
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI
9985334
ns9886202
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU
324698
ns322043
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7000
ns8021.5
ns0.87
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7687.5
ns7875
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8375
ns8750
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7499.5
ns7125
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
138856
ns134916.5
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
6055720
ns5780710
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
60111
ns70255.5
ns0.86
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15874.5
ns16917
ns0.94
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16271
ns15042
ns1.08
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15792
ns15979
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13125.5
ns16000
ns0.82
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
911828
ns878404
ns1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
42608795.5
ns41935612.5
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
429664
ns433994
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24000
ns28792
ns0.83
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
25958
ns25792
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
26833.5
ns28833.5
ns0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24937.5
ns30354.5
ns0.82
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
189463
ns183000.5
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7536335
ns7959277.5
ns0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
112782
ns115401
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
146084
ns112375
ns1.30
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
152541.5
ns144438
ns1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
105833
ns105854.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
153500
ns150875
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1027043
ns977911
ns1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
41813684
ns41813067
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
587426
ns589736
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74042
ns74166
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
84500
ns74604
ns1.13
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
74917
ns77333
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74333
ns76334
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
195104
ns189045
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7388961
ns7503392
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
121551
ns128881
ns0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
281250
ns295667
ns0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
290833
ns307166
ns0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
244667
ns300000
ns0.82
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
297125
ns276875.5
ns1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1044893
ns986480
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
40287331
ns40933470
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
693978
ns697017.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12583.5
ns13166.5
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
13333.5
ns13229
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14000
ns14833.5
ns0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
13125
ns13667
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
137568.5
ns133538.5
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
5655781
ns5773755.5
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
235892
ns236113
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27458
ns27000
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
28437
ns27500
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27583
ns27187.5
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
25396
ns27438
ns0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
925629.5
ns917467.5
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
42183215
ns39999839
ns1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
696807
ns698258
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
10583.5
ns11209
ns0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
11729
ns11292
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14020.5
ns13375
ns1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
11166
ns11083
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
119207
ns119722.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
3459447
ns3349179
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
241797.5
ns240142
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
22228.5
ns23333
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
22979
ns23084
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
24041
ns24000
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
22958
ns21958
ns1.05
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
679984
ns678230.5
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
21093495.5
ns22343314.5
ns0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
675492
ns678857
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
65145.5
ns65021
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
69062
ns62875
ns1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
67375
ns68667
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
63250
ns66417
ns0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
102654.5
ns101393
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3365331
ns3400903
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
244962
ns236963
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
512250
ns477895.5
ns1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
511875
ns476959
ns1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
467958.5
ns468750
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
464791
ns495833
ns0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
497974
ns488817
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
19959026
ns20464230
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
716037
ns715823
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7458
ns7146
ns1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7479.5
ns8375
ns0.89
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8791.5
ns8500
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7000
ns7021
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
136611.5
ns136539.5
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
5668588
ns5535345
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
59181
ns69291
ns0.85
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
16084
ns11458
ns1.40
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16104
ns14500
ns1.11
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15145.5
ns16125
ns0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15292
ns13416
ns1.14
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
892529
ns886518
ns1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
37494483
ns37792827
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
399300
ns407319.5
ns0.98
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
6148250
ns6154209
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
6373958.5
ns6370021
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
3229667
ns3225542
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
11910625
ns11912875
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
348836
ns345647
ns1.01
batchedmm(512, Bsize=4)/forward/GPU/oneAPI
48313142
ns49342806
ns0.98
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU
303493
ns305758
ns0.99
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
19111312.5
ns19108188
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
19956500
ns19939624.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
11118833
ns11149250
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
36495125
ns36445875
ns1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1010983
ns1059965
ns0.95
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI
77165819
ns79558988
ns0.97
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU
1185177
ns1166672
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1000
ns1000
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1042
ns1000
ns1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1042
ns1083
ns0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
959
ns959
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23306
ns23689
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI
2102392
ns2151476.5
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU
210582
ns209622
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3958
ns3917
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
3959
ns4041
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4041
ns4000
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
3958
ns3916
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
274898
ns274634
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10835037
ns10742838
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
633051.5
ns625596
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7667
ns7292
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8271
ns9000
ns0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
10167
ns10250
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7999.5
ns9062.5
ns0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
116562
ns116615
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
3281805
ns3546009
ns0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
68781
ns69341
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11833
ns12000
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
12271
ns12667
ns0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
13500
ns12437.5
ns1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12209
ns12417
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
610392
ns605595
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
20835527
ns22519876
ns0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
356904
ns363803
ns0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
291
ns250
ns1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
291
ns250
ns1.16
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22489
ns22597.5
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI
2031329
ns2178291
ns0.93
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU
49170
ns48315.5
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
3042
ns2834
ns1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2958
ns2916
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3209
ns3167
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2834
ns3083
ns0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
196092.5
ns194557
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI
9721843.5
ns9614403
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU
166151.5
ns170192
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
11542
ns11333
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12584
ns11459
ns1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13333.5
ns13708
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10959
ns12333
ns0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
115616.5
ns115903.5
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
3435294.5
ns3311083
ns1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
238972
ns239372.5
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
22250
ns20792
ns1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
22583.5
ns23500
ns0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
22875
ns22395.5
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
23104.5
ns21458.5
ns1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
561561
ns558538
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
19972002
ns19541146
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
652206
ns657037
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4167
ns4375
ns0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4375
ns4167
ns1.05
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4458
ns4417
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4375
ns4375
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24400
ns24750
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI
2160362
ns2038545
ns1.06
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU
49090
ns49870
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16167
ns16708
ns0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16625
ns16167
ns1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16291
ns16500
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16584
ns16667
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
320232
ns317514
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI
12103289.5
ns12292699
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU
205902
ns212047.5
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
2084
ns2083
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
2209
ns2125
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
2167
ns2125
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
2125
ns2083
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
35395
ns35083
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
1121264.5
ns1184726
ns0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
218222
ns206953
ns1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
17896
ns17250
ns1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
17916
ns18667
ns0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
19125
ns19584
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
18146
ns20125
ns0.90
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
286121
ns284678
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20551833.5
ns20274746
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
685457
ns691617
ns0.99
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
60208.5
ns60292
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
65458
ns66792
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
60938
ns62000
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
53875
ns51125
ns1.05
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66633
ns66448
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/oneAPI
86298273
ns87696389
ns0.98
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU
102431
ns117412
ns0.87
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
197791.5
ns198916
ns0.99
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
162042
ns167229
ns0.97
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
137250
ns141417
ns0.97
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
295208
ns300125
ns0.98
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
211289
ns209004
ns1.01
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI
152039178
ns147263909.5
ns1.03
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU
510905
ns620696.5
ns0.82
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
123834
ns82583
ns1.50
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
123125
ns140250
ns0.88
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
84312.5
ns86417
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
90875
ns116583
ns0.78
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193182.5
ns191982.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5322780
ns5863118
ns0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
192502
ns203942
ns0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1921875
ns1921771
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1909416
ns1908917
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1888250
ns1919708
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1881750
ns1924521
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
510619
ns504208.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
26283882
ns26294676.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
911709
ns1070976
ns0.85
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns291
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21603
ns21855
ns0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI
2089663
ns2006228
ns1.04
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU
42021
ns41700
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1833
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1833
ns1833
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
246530.5
ns242053
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI
9718939
ns10350039
ns0.94
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU
183711
ns183192
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8083
ns9833
ns0.82
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9791
ns9833
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
12125
ns11709
ns1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8458
ns10583
ns0.80
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
115667.5
ns116639.5
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
3479265.5
ns3403003.5
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
238712
ns238567.5
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10334
ns8875
ns1.16
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10375
ns10875
ns0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10709
ns10125
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10750
ns9500
ns1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
493762.5
ns488952
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
19419012
ns20132943
ns0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
631376
ns630866
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58459
ns57875
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46541
ns46958
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
39791
ns39625
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82958
ns82250
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
39195
ns38551
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1326636
ns1316937
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
77861
ns79411
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1927333.5
ns1922646
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1977312
ns1979292
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1955167
ns1942292
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1892417
ns1900917
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
217765.5
ns210456
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
33483865
ns33978774
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1004015.5
ns1015680
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
267875
ns267333
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
277417
ns269625
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
270958
ns270729.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
278250
ns269645.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
198525
ns192987.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7684906
ns7844239
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
283563
ns285143
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
614937.5
ns698604
ns0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
658104
ns671916.5
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
590146
ns667416
ns0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
646750.5
ns626771
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1004951
ns985897
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
44721716
ns45574369
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
899859
ns913670
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2206250
ns2218667
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2176625
ns2215687
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2107416
ns2220312.5
ns0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2210708
ns2213250
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
158799
ns157769
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8305150
ns8237698
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
412934
ns425304
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5495166.5
ns5486562
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5498084
ns5529917
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5497292
ns5524333.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5479145.5
ns5488625
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
942447
ns927722
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
52379643
ns53249072
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1717957
ns1555466
ns1.10
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
476375
ns478042
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
344833
ns346167
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
255667
ns257167
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
909083
ns909250
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46257.5
ns46497
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI
876632
ns825183
ns1.06
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU
245143
ns245473
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2148125
ns2167292
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1855417
ns1862208
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1588042
ns1591771
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3122292
ns3122542
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
253305
ns255431
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI
13286897
ns12961347
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
772413
ns773598
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57958.5
ns57520.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
45791.5
ns46708
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
39417
ns39292
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82625
ns82500
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
28551
ns28213
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1363872
ns1370930
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
74231
ns76011
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2040292
ns2032125
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2064375
ns2090250
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2084167
ns2068583
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1983271
ns1997000
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
225739
ns223132
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
35716396.5
ns35910018
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1031871
ns1194083
ns0.86
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58333
ns57812.5
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46834
ns46708
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
39667
ns39583
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83000
ns82375
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
48471
ns48361
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
789293.5
ns762273.5
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
71026
ns80795.5
ns0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1926917
ns1928084
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1963709
ns1964958
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1974354
ns1966541.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1891625
ns1886625
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
232200
ns230366
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
17717639
ns16959659
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
916564
ns920174
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
291
ns333
ns0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
416
ns375
ns1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
33909
ns33705
ns1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
1226571
ns1253501.5
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
45910
ns45940
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
5916
ns6646
ns0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7187.5
ns7395.5
ns0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7459
ns7292
ns1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6333
ns6417
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
201066
ns201838.5
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
20694042
ns21257580
ns0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
365424
ns371664
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32008
ns32336
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI
1150720
ns1213220
ns0.95
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU
37940
ns37120
ns1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2709
ns3292
ns0.82
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
3041
ns3000
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
3708
ns3125
ns1.19
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
3500
ns2666
ns1.31
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
181870
ns182468
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI
7654347.5
ns7479362
ns1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU
149631
ns151261
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
491875
ns502687.5
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
465938
ns491916.5
ns0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
469979
ns465083.5
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
495375
ns498417
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
134587.5
ns134412
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6261994
ns5713043
ns1.10
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
348083
ns367259
ns0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4056250
ns4072041
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4071312.5
ns4093021
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4083458.5
ns4069979
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4067500
ns4043667
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
675142
ns669547
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
34719295
ns34596141
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1296728
ns1474565
ns0.88
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
49815354
ns49859062
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
35531875
ns35504667
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
25976083
ns26029000
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
96976979
ns96942959
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1620332
ns1621240
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/oneAPI
55439103
ns55961032
ns0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU
1059456
ns1046111
ns1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
154432166.5
ns154467896
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
112364500.5
ns112182625
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
88728958
ns89208292
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
298587354.5
ns294884062.5
ns1.01
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6497993.5
ns6486949
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI
126106582
ns128111295
ns0.98
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU
5589506
ns5579662.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
18292
ns19541
ns0.94
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
17542
ns18625
ns0.94
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
13625
ns13917
ns0.98
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
16583.5
ns15458.5
ns1.07
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
19675
ns20271
ns0.97
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI
1142269.5
ns1104775.5
ns1.03
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU
27480
ns26071
ns1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
11000
ns10729.5
ns1.03
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
9020.5
ns9000
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
7792
ns8125
ns0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
17375
ns17291
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
242665
ns244379
ns0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI
10148653
ns10081500
ns1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU
144671.5
ns148582
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7958.5
ns8374.5
ns0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
9125
ns8750
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
10375
ns10833
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7833.5
ns9104.5
ns0.86
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
117743.5
ns120247
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
3571636.5
ns3746738
ns0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
238312
ns239122.5
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9083
ns9437.5
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10188
ns9708
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11500
ns11792
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9500
ns9500
ns1
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
580494.5
ns585732.5
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
24076504
ns22572008
ns1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
649931.5
ns659212
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9416
ns9083.5
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9709
ns9833.5
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10458
ns10375
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9396
ns9438
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
114984
ns116564
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
3341616
ns3425324
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
71321
ns75361
ns0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13916.5
ns13958
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13541.5
ns13291.5
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
17208.5
ns16625
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13187.5
ns13750
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
552056
ns556648.5
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20781499.5
ns19935565.5
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
344233
ns351184
ns0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
625
ns625
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
542
ns500
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
33628
ns33504
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI
1186325
ns1200134
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
207932
ns207882
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7437
ns7542
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8584
ns7958
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9666
ns9542
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7354.5
ns7625
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
221757
ns223084.5
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI
22841477
ns21568038
ns1.06
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
657467
ns665587
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
16583
ns17958
ns0.92
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
16958
ns17584
ns0.96
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
12354
ns13334
ns0.93
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
11625
ns10833.5
ns1.07
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
19779
ns20393
ns0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI
1178666.5
ns1168335
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU
191642
ns191442
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
35375
ns35542
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
35479
ns35583
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
35479.5
ns36208
ns0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
35584
ns35500
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
258411
ns258577
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI
11074698.5
ns11381817
ns0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
591756
ns591656
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
449333
ns511813
ns0.88
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
450125
ns447292
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
463875
ns456792
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
486917
ns517125
ns0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194667
ns194619
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5885088
ns5685561
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
347133
ns368453.5
ns0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4054500
ns4055479
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4060604.5
ns4065479.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4063834
ns4057292
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4052291.5
ns4051125
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
510233
ns506270
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
28172431.5
ns28041384.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1353408.5
ns1368029
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
780318375
ns786875042
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
543371375
ns540385750
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
415007687
ns417627729
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
1572225062.5
ns1558687604
ns1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22558969
ns22789985.5
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/oneAPI
174041531
ns176484643
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU
14555295
ns14667995.5
ns0.99
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
2500858833
ns2512454792
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1786181583
ns1772086292
ns1.01
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1510021583
ns1545039084
ns0.98
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
6317458166
ns6322382417
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
119503116
ns118300758
ns1.01
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI
931368955.5
ns918719991.5
ns1.01
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU
87832876
ns87803948.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76375
ns76458.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
77083
ns76958
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
83334
ns78437
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
75354
ns76937.5
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
194473.5
ns191503.5
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
8155928
ns8039760
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
106291
ns106691
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
277375
ns279042
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
193666.5
ns208625
ns0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
291542
ns282125
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
203875
ns196250
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
999103
ns989645.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
42482446
ns44408111.5
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
628231.5
ns636782
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
199366166.5
ns199893333
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
139444084
ns139025625
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
103950000
ns104051042
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
388306958
ns388708625
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5837076.5
ns5839621
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/oneAPI
78178829
ns79074303
ns0.99
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU
3620336
ns3603877.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
617703104.5
ns619152625
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
438890042
ns439143666
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
352507250
ns353463000
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
1183186458
ns1177182375
ns1.01
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
26786910.5
ns26537180.5
ns1.01
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI
274964991
ns276530657.5
ns0.99
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU
21952578.5
ns22057437
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7291
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6125
ns6167
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5417
ns5375
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9917
ns9792
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26517
ns26296
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1160586
ns1196971
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
46431
ns46670
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
224854
ns212500
ns1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
230541
ns219917
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229812.5
ns223521
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207958
ns208917
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
215879.5
ns213879
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
20490896
ns20926055
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
528825
ns531735
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6458
ns8104
ns0.80
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9000
ns8709
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9750
ns10791.5
ns0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8770.5
ns9229
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
109989.5
ns112861.5
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
3318372
ns3389305
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
72691
ns73211
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7666.5
ns7542
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8417
ns7542
ns1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
11750
ns10229.5
ns1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7562.5
ns7834
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
485874.5
ns490362
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
19877956
ns19246537
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
315043
ns323133
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
458
ns458
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
750
ns500
ns1.50
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
750
ns708
ns1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
459
ns459
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
25151
ns24659
ns1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
1214235
ns1256249
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
48561
ns48770
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
8833
ns9250
ns0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9542
ns8479.5
ns1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
11834
ns12291
ns0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
8250
ns9083
ns0.91
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
245667
ns245415
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
23350383.5
ns24116959
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
388103
ns395734
ns0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
111708
ns112500.5
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
101708
ns103271
ns0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
87542
ns88333
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
154542
ns154625
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
22556
ns23200
ns0.97
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI
822944.5
ns818562
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU
200302
ns193152
ns1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
576604.5
ns578000
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
577208
ns534875
ns1.08
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
579583
ns548917
ns1.06
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
535334
ns535333
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
215893
ns215198
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI
11598893
ns11436046
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
606916
ns610641.5
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
5500
ns5000
ns1.10
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
6187.5
ns5416.5
ns1.14
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
7583
ns7604.5
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
5646
ns6625
ns0.85
batchedmm(16, Bsize=32)/forward/GPU/CUDA
16999
ns17413
ns0.98
batchedmm(16, Bsize=32)/forward/GPU/oneAPI
71875004
ns72455521
ns0.99
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU
71250
ns80361
ns0.89
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
12166.5
ns11792
ns1.03
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
10833.5
ns10791.5
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
11104
ns11208
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
16667
ns17000
ns0.98
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
203355.5
ns203659.5
ns1.00
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI
97881235
ns98210292
ns1.00
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU
362713
ns381654
ns0.95
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
40375
ns39542
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
51334
ns51459
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
51083
ns51333
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
13625
ns13520.5
ns1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA
21217
ns19998
ns1.06
batchedmm(16, Bsize=128)/forward/GPU/oneAPI
78292175
ns76386107.5
ns1.02
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU
81245.5
ns89551
ns0.91
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
37437.5
ns36229.5
ns1.03
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
31833.5
ns31458
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
30145.5
ns30250
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
57333
ns57167
ns1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
180954
ns180703
ns1.00
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI
111821475
ns112491463
ns0.99
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU
393694
ns412909.5
ns0.95
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
1667
ns1791
ns0.93
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1875
ns0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
2583
ns2125
ns1.22
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1813
ns0.87
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
19103
ns19867
ns0.96
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI
1181507
ns1142759
ns1.03
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU
29580
ns34540
ns0.86
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
2291
ns2042
ns1.12
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
2167
ns2167
ns1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
2541
ns2500
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
2125
ns2062.5
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
192587.5
ns193884
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI
9137253
ns9110958
ns1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU
137661
ns138796.5
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5041
ns5791
ns0.87
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4792
ns4916
ns0.97
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6708
ns6312.5
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5292
ns4937.5
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
139532
ns140483
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
5873388
ns5688843
ns1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
61421
ns70765.5
ns0.87
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8250
ns8375
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8583
ns8292
ns1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9333
ns9917
ns0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8062.5
ns8291
ns0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
812160
ns811929.5
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
39105619
ns40105318
ns0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
390114
ns393874
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
55000
ns55000
ns1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
55875
ns55833
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
54333
ns54292
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
56208
ns56167
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
36258
ns36588.5
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1233762
ns1189517
ns1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
217247.5
ns206632.5
ns1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
523187.5
ns486646
ns1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
495646
ns497020.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
509125
ns505500
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
508354
ns504479.5
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
258312
ns256235
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27334844
ns27551860
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
802628
ns837064
ns0.96
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
3307500
ns3311209
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
2332208.5
ns2324917
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
1767750
ns1764917
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
6289687.5
ns6305667
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
205336
ns204534
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/oneAPI
78138642
ns77630538
ns1.01
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU
213372
ns220612.5
ns0.97
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
11443687
ns11424750.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
8355854.5
ns8337875
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
6598583.5
ns6554562.5
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
21066479
ns21046187.5
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
735491
ns736592
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI
121355919
ns121665223
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU
1063901
ns1067736
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7208
ns6375
ns1.13
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6604
ns5146
ns1.28
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7708
ns7333
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4708
ns4917
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
130238.5
ns130414
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
5600093
ns5600903.5
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
55701
ns56000
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7604
ns7500
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7562.5
ns7104.5
ns1.06
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8083
ns7833
ns1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7292
ns6917
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
714522
ns716948.5
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
35658157
ns34048818
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
368784
ns377284
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
98292
ns100375
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
103667
ns98042
ns1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
127291
ns101229
ns1.26
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
122417
ns121958
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
149309
ns148678
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5831672
ns5976414.5
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
183632
ns203162
ns0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2028041
ns2025979.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2022292
ns2023750
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2031625
ns2027979
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2019021
ns2028208
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
669751
ns667124
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
34116389.5
ns32503605.5
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1113696
ns1113981
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
32999.5
ns34896
ns0.95
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
36208
ns36541.5
ns0.99
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
33125
ns33000
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
542
ns667
ns0.81
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15437
ns15608
ns0.99
batchedmm(2, Bsize=4)/forward/GPU/oneAPI
72358742
ns72119754.5
ns1.00
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU
84900
ns83761
ns1.01
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2667
ns2542
ns1.05
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
3000
ns2875
ns1.04
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
3208
ns3042
ns1.05
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2250
ns2125
ns1.06
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
136315
ns136848
ns1.00
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI
92893398
ns92906510
ns1.00
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU
350423
ns357139
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7208
ns7250
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6083
ns6000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5416
ns5417
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10167
ns9875
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35436
ns35691
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1228537
ns1119535
ns1.10
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
49691
ns49751
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
232749.5
ns239895.5
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
221125
ns219708
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
227541.5
ns222104
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
205750
ns206166
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
240533
ns239376
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26122810
ns27974510.5
ns0.93
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
509435
ns574776
ns0.89
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3750
ns3958
ns0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3750
ns1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3958
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3958
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
21412
ns22068
ns0.97
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI
2114597
ns2145282
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU
42980
ns42250
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14542
ns14958
ns0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14917
ns14541
ns1.03
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14792
ns14750
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14917
ns14875
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
297410.5
ns298530
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI
10838818
ns11632418
ns0.93
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU
196172
ns196947
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
97937
ns145083
ns0.68
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
102750
ns103646
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
130333
ns105729.5
ns1.23
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
127709
ns113042
ns1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132466
ns132784
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
5909094
ns6087845
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
182122
ns204547
ns0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1924333
ns1918083
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1920667
ns1923042
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1921792
ns1921375
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1912771
ns1925292
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
659652
ns658916
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31062786
ns30625432
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1217372
ns1069806
ns1.14
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17625
ns20959
ns0.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18666.5
ns17979.5
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21834
ns22125
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17125
ns18125
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
103789.5
ns104444.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3441121
ns3374722
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
75841
ns81701
ns0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
229375
ns229875
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
217917
ns223646
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
226458.5
ns218125.5
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215521
ns225125
ns0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
496186
ns492479
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
18765642
ns19457097
ns0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
473665
ns483554.5
ns0.98
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
24313
ns27374.5
ns0.89
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
29875
ns31063
ns0.96
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
27375
ns26708
ns1.02
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
1250
ns1458
ns0.86
batchedmm(16, Bsize=4)/forward/GPU/CUDA
15897
ns15690
ns1.01
batchedmm(16, Bsize=4)/forward/GPU/oneAPI
71655631.5
ns73206765
ns0.98
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU
87071
ns89171
ns0.98
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
5375.5
ns4875
ns1.10
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
5083.5
ns4896
ns1.04
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
5459
ns5250
ns1.04
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
4834
ns4542
ns1.06
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
200684.5
ns200612
ns1.00
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI
92849344
ns94501114
ns0.98
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU
389014
ns394774
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
222083
ns221875
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
223166
ns223209
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
224916.5
ns225917
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
227000
ns223750
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
219523
ns216221
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
7712821.5
ns7634874
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
274002.5
ns277862
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
495292
ns535958
ns0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
549771
ns499104
ns1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
507520.5
ns510167
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
497583
ns508166
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1034369
ns1024022
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
42519004
ns45569833
ns0.93
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
850318.5
ns864044
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19708
ns25166
ns0.78
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
21375
ns20166.5
ns1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22292
ns21750
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24792
ns19167
ns1.29
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
111603.5
ns111455.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3581394.5
ns3479193
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
77006
ns78821
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218812
ns245354
ns0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
213041.5
ns223375
ns0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221958.5
ns225417
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
250667
ns218541
ns1.15
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
710892
ns707911
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
24867084.5
ns25617389
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
532655
ns538875
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5959
ns7125
ns0.84
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6917
ns6250
ns1.11
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
8708
ns8666
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5917
ns6458
ns0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
131648
ns132297.5
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
5786966.5
ns5594794
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
65661
ns67671
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10584
ns10583
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10729.5
ns10250
ns1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11541
ns10958
ns1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10541
ns10875
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
772200
ns778959.5
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
37330612
ns37279902
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
385494
ns393784
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4833
ns5250
ns0.92
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6354.5
ns6167
ns1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6604.5
ns7583
ns0.87
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5041
ns5208
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
133064
ns134141.5
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
5822443
ns5548829
ns1.05
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
57140
ns69361
ns0.82
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7209
ns7834
ns0.92
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7666
ns7667
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8042
ns8125
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7500
ns7458
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
738153
ns742994
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
40138762.5
ns37148580
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
395034
ns400934
ns0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
14423167
ns14518042
ns0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
10121834
ns10053875
ns1.01
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
7695041.5
ns7724104
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
27731208
ns27741083
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
530060
ns554321.5
ns0.96
batchedmm(128, Bsize=512)/forward/GPU/oneAPI
94502665
ns94275820
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU
400144
ns399814.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
46295271.5
ns46185458.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
33585729.5
ns33419604
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
26523271
ns26602708.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
85105834
ns85208959
ns1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2636621
ns2813842
ns0.94
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI
190779173
ns194819687
ns0.98
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU
3293333
ns3323814
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
67125
ns69583
ns0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
68791
ns66979
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
69875
ns70292
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
67541
ns67625
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
116341
ns102627
ns1.13
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3481863
ns3515302.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
238303
ns232062
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
467979.5
ns520062.5
ns0.90
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
468833
ns473208
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
479729
ns482063
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
467333.5
ns474708
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
704065
ns703393
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
26310960
ns26797269
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
795648
ns793873
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
542
ns500
ns1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns541
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32111
ns31962
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI
1221683
ns1180122
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
47180
ns47320
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8375
ns8583
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9417
ns9583.5
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9584
ns9541
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8416
ns9667
ns0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
277435.5
ns278738.5
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI
20099617
ns21728099.5
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
375813.5
ns381274
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9459
ns9666
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
9625
ns9459
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
9708
ns9667
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
9625
ns9666
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
22950
ns23100
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI
2089156.5
ns2057483
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU
212492
ns212922
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
50167
ns50458
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
50292
ns50875
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
50541
ns50375
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
50375
ns50209
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
272026
ns273986
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI
11125411
ns11648854
ns0.96
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
611216
ns610646
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
55250
ns54917
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
55917
ns55708
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
54375
ns54292
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
56041
ns55875
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
27749
ns27572
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1229944.5
ns1222185
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
214587
ns206592
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
485479
ns522166
ns0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
496084
ns504250
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
537000.5
ns503500
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
461291.5
ns472833.5
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
237315
ns236683
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
32908722.5
ns32890414.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
839118
ns889849
ns0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
651166.5
ns653833
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
645917
ns639812.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
662000
ns654166.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
641417
ns643729
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
190601
ns186765
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8668801
ns8191594
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
229822
ns303073
ns0.76
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2241917
ns2228375
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2232875
ns2240916.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2250458.5
ns2265312.5
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2234417
ns2228084
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
914905
ns907493
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
49141404
ns49570533.5
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1359913
ns1227082.5
ns1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
21375
ns22083
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
20938
ns21333
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22583
ns21416.5
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
19167
ns20208
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
109650
ns108981.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
3622083
ns3615898
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
75660
ns81661
ns0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218833.5
ns232104.5
ns0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
221084
ns222250
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
235688
ns228583
ns1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
221125.5
ns259708
ns0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
709252
ns701359
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
25088612.5
ns27641264
ns0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
553695
ns557775.5
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
542
ns541
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23372.5
ns22562
ns1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI
1180770.5
ns1174965
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
49900
ns48641
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9874.5
ns9896
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9708
ns10166
ns0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10229.5
ns9979.5
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9334
ns10646
ns0.88
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
259739
ns259541
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI
25804898
ns25096956
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
401304
ns406314
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
9541
ns10000
ns0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9187.5
ns8875
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10833
ns10333
ns1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8875
ns9625
ns0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
113457.5
ns114946
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI
3378008
ns3356422
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
69850
ns75001
ns0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7625
ns7312.5
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7937.5
ns7833
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8375
ns7833
ns1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7541.5
ns7645.5
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
474853
ns479855
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI
17576598
ns17554055
ns1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
322123
ns327064
ns0.98
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1500
ns1375
ns1.09
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1666.5
ns1834
ns0.91
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2187.5
ns2125
ns1.03
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1542
ns1708
ns0.90
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
19317
ns19733
ns0.98
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI
1172938.5
ns1143637.5
ns1.03
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU
192092
ns192542
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
3542
ns3542
ns1
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
3625
ns3584
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
3833
ns3875
ns0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
3500
ns3500
ns1
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
209093.5
ns210034.5
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI
10006581.5
ns10599117
ns0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
581056
ns584616
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
148416
ns148333.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
127541.5
ns129000
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
107500
ns107396
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
225042
ns233604.5
ns0.96
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
22459
ns23312
ns0.96
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI
1201113
ns1181923
ns1.02
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU
37415.5
ns41095.5
ns0.91
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
143666.5
ns161208.5
ns0.89
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
110916
ns140708
ns0.79
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
100875
ns104000
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
250834
ns259375
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
206476
ns208046
ns0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI
10778609
ns11091691.5
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU
220822
ns267983
ns0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7334
ns7270.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns5959
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5375
ns5333
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10041
ns9959
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33038
ns32872
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
1161067.5
ns1199319
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
48271
ns50331
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220021
ns258729
ns0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
227708
ns234500
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
243333
ns238125
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212750
ns253021
ns0.84
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
256906
ns256256.5
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
27263274.5
ns27890996
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
522055
ns595296
ns0.88
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12333
ns13000
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
13020.5
ns12396
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14333.5
ns14500
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
12917
ns12500
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
131126.5
ns131871
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI
5521631
ns5626771
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
235402
ns236102
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24520.5
ns23854.5
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24187
ns24500
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25354.5
ns25187.5
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
23625
ns24750
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
816371.5
ns821231
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI
39369345
ns40073814
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
684572
ns689137
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
9208
ns9167
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
10042
ns9834
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
11167
ns11417
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9625
ns8999.5
ns1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
116949.5
ns119274.5
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI
3478536
ns3523753.5
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
70201
ns76811
ns0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14250
ns14083
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13771
ns14166.5
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15416
ns15104
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13958
ns14083
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
627909.5
ns630553.5
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI
21438120
ns21897908
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
377354
ns373463
ns1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
8958
ns9021
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10437.5
ns9875
ns1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
11750
ns11250
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9166
ns9750
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
115964
ns117966.5
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI
3401614.5
ns3400750
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
72371
ns77501
ns0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13208
ns12854
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12854
ns12937
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13958
ns13187.5
ns1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12416
ns13166
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
516349.5
ns522874
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI
19477250
ns19612958
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
339683.5
ns349524
ns0.97
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
30291.5
ns30958.5
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
34041.5
ns34895.5
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
30042
ns30208
ns0.99
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
2083
ns2042
ns1.02
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16187
ns16552
ns0.98
batchedmm(2, Bsize=128)/forward/GPU/oneAPI
75928615
ns76609794
ns0.99
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU
78561
ns87451
ns0.90
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
5291.5
ns5375
ns0.98
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
5499.5
ns5229
ns1.05
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
5375
ns5395.5
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
6375
ns6417
ns0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
135964
ns135958
ns1.00
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI
110752109
ns111332262.5
ns0.99
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU
382864
ns390584
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
291
ns291
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
417
ns375
ns1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns291
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
24855
ns24266
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
1239551
ns1220615
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
48910
ns49051
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6459
ns6458
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6604
ns6792
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7208.5
ns6875
ns1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6125
ns6375
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
180794
ns181716
ns0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
24106911.5
ns22738910
ns1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
390139
ns394694
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
2000
ns2000
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
2125
ns2125
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
2125
ns2125
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
2042
ns1959
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
25818
ns25193
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI
1193002
ns1233759.5
ns0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
219547
ns207422
ns1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17500.5
ns16937.5
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17833.5
ns17583
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18437.5
ns17666
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17500
ns17167
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
264425
ns266060
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI
24505308
ns25037224.5
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
705652
ns702687
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
178208
ns177959
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
165145.5
ns151000
ns1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
179042
ns151250
ns1.18
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
151292
ns156666
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
187400
ns185813
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
7801096
ns8186035
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
191502
ns213762
ns0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1317104
ns1294417
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1320125
ns1322667
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1331937
ns1326979.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1318125.5
ns1325125
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
859849
ns850017
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
43918638
ns46207436
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1005140
ns1106552
ns0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24084
ns25687.5
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24708
ns25000
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
28063
ns27125
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
26291.5
ns27375
ns0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
226248
ns226385
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI
8086333
ns7541451
ns1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
115141
ns115741
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
160416.5
ns180771
ns0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
132958
ns134583.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
127937.5
ns175167
ns0.73
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
124437.5
ns164479
ns0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
978646
ns971603.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI
45755327
ns45326263
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
587856
ns614401.5
ns0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
250
ns292
ns0.86
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
22971
ns22475
ns1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI
1181802
ns1258351.5
ns0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
48630
ns48960
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6333
ns6458.5
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6729.5
ns6875
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7291
ns6875
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6541.5
ns6458.5
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
197400
ns197699
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI
24703832
ns25220935
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
392804
ns395854
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5584
ns5666
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6958
ns6542
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8021
ns6416
ns1.25
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5875
ns6167
ns0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
135487.5
ns136571.5
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
5687030
ns5759376
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
235072
ns236832
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10083.5
ns10167
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10458.5
ns10250
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10500
ns10708.5
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9833.5
ns10021
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
841087.5
ns843659.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
41023608
ns42177959
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
675251.5
ns680842
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
708
ns708
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
667
ns708
ns0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
750
ns750
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
708
ns667
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
22206
ns22622
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI
2616381.5
ns2092408
ns1.25
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU
209832
ns211377.5
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4875
ns4958
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4917
ns5167
ns0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5208
ns5125
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4834
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
215367.5
ns217676
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI
11863776
ns10379046
ns1.14
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
591926
ns586156
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7729.5
ns7646
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7958
ns8458
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9833
ns10000.5
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7750.5
ns8625
ns0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
115622
ns117310.5
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI
3536818
ns3542404
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
71851
ns77011
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8542
ns8167
ns1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8687.5
ns8792
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9520.5
ns9541
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8520.5
ns8500
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
552863
ns559897.5
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI
20606879
ns21100984
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
343673.5
ns351894
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
127854
ns129875
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
128834
ns131334
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
96354
ns98500
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
183167
ns183000
ns1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA
45982
ns45933
ns1.00
batchedmm(128, Bsize=4)/forward/GPU/oneAPI
72286847
ns73470628
ns0.98
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU
95811
ns104986
ns0.91
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
330459
ns320833
ns1.03
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
332334
ns340500
ns0.98
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
197417
ns196229
ns1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
571042
ns614646
ns0.93
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
183822.5
ns184661
ns1.00
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI
93731117
ns95503191
ns0.98
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU
473290
ns520426
ns0.91
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397125
ns397833
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288229
ns287792
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
215375
ns215167
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
756375
ns756459
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43348
ns43884
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI
1384285
ns1380208.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU
79971
ns82001
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1459375
ns1449083
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1132396
ns1131416
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
862770.5
ns862375
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2442500
ns2444146
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
239777
ns248740
ns0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI
13231788
ns11082909
ns1.19
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU
351138.5
ns350333
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
647458
ns652083
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
649666
ns652854
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
655021
ns654417
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
641583.5
ns661125
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
178508
ns184615
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8381344
ns8038741
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
240322
ns311568
ns0.77
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2454875
ns2443958.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2450333
ns2461416.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2461646
ns2443812.5
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2458334
ns2444771
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
938639.5
ns932610
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
52014786
ns51927904
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1448719
ns1324133
ns1.09
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
33146
ns34083.5
ns0.97
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
35708
ns36437.5
ns0.98
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
32000
ns33771
ns0.95
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
875
ns834
ns1.05
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15683
ns15954
ns0.98
batchedmm(2, Bsize=32)/forward/GPU/oneAPI
73122838
ns74465713
ns0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU
71645.5
ns84121
ns0.85
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
3187.5
ns3042
ns1.05
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
3458
ns3208
ns1.08
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
3541
ns3416
ns1.04
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
3083
ns3084
ns1.00
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
134592
ns134871
ns1.00
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI
97284653
ns101832238
ns0.96
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU
337323.5
ns355194
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
439375
ns435000
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
440583
ns441208
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
431375
ns431291
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
450375
ns449458
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
42224
ns42183
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1392161
ns1418032
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
237893
ns241737
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4138958
ns4139000
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4247291.5
ns4281375
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4262792
ns4272125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4028416.5
ns4043500
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
233746
ns231383.5
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
36534446
ns38875009
ns0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1234322
ns1238087.5
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3709
ns3917
ns0.95
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3750
ns1.04
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3917
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3916
ns3916
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
34090
ns34290
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI
1239089
ns1242809
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU
40520
ns40730
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15291
ns15750
ns0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15958
ns15500
ns1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15750
ns15708
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15667
ns15667
ns1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
251120.5
ns253133
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI
8891050
ns8969271
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU
171192
ns178362
ns0.96
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
404125
ns404000
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
295250
ns295666
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
220625
ns221167
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
760666
ns760500
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113428
ns113399
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI
1051037
ns1019290
ns1.03
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU
89110.5
ns89320
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1479125
ns1474312.5
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1156270.5
ns1157021
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
886792
ns884958
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2464333
ns2465875
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
227639.5
ns244167
ns0.93
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI
12228324
ns11671477
ns1.05
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU
352474
ns354019
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
625
ns666
ns0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
625
ns584
ns1.07
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
542
ns500
ns1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
24868
ns24808
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI
1263047
ns1214092.5
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
214292
ns210112
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7541
ns7916
ns0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
7917
ns8167
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8250
ns8125
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7667
ns7459
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
202491
ns203590.5
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI
25565257
ns24613685
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
687187
ns690937
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
830417
ns832166.5
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
617334
ns619583
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
467125
ns472250
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
1539875
ns1542500
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA
130469
ns130624
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/oneAPI
74138060
ns75509279
ns0.98
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU
167662
ns236082
ns0.71
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
2680895.5
ns2694208.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1979750
ns1991375
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1532167
ns1537625
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
4935708
ns4930000
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
233179
ns233850
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI
101283369
ns102808354
ns0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU
855698
ns768638
ns1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns250
ns1.17
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31956
ns31761
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI
1162026.5
ns1224489
ns0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
49090
ns47050
ns1.04
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6187
ns6417
ns0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6770.5
ns6792
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7042
ns6666
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6375
ns6375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
217529.5
ns219075.5
ns0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI
22613407
ns23474742
ns0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
355723.5
ns362424
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1750042
ns1776458
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1774250
ns1755459
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1759417
ns1754000
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1775625
ns1755666
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
177451
ns183229.5
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
8059544
ns8315915
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
355403
ns375104
ns0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4352125
ns4353771
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4360770.5
ns4398479
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4377083.5
ns4376083
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4357583
ns4351333
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
843625
ns833369
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
47645217
ns47106002
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1390698
ns1251643
ns1.11
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
14562.5
ns7083.5
ns2.06
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
9667
ns7104
ns1.36
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
8292
ns7375
ns1.12
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6666.5
ns6834
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
22207
ns22695
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI
1231018
ns1216626
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU
37720
ns37200
ns1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
64458.5
ns48479.5
ns1.33
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
70792
ns50874.5
ns1.39
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
45708
ns47979
ns0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
49521
ns47208
ns1.05
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
204835
ns207872
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI
10627124.5
ns10801241
ns0.98
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU
233202
ns234813
ns0.99
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
21292
ns22854
ns0.93
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
24770.5
ns26375
ns0.94
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
22334
ns23146
ns0.96
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
7416
ns5333
ns1.39
batchedmm(2, Bsize=512)/forward/GPU/CUDA
17630
ns17805
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/oneAPI
87889435
ns89168517
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU
90301
ns90691
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
12187
ns12083
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
10625
ns10208.5
ns1.04
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
9750
ns9583
ns1.02
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
18041.5
ns18104.5
ns1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
216733.5
ns217973
ns0.99
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI
151365483
ns150119195
ns1.01
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU
384574
ns389829
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
405417
ns405958
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
297333
ns297166.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
223417
ns223625
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
762625
ns762167
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46368
ns46720
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI
1390104
ns1360027
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU
90091
ns90521
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1487792
ns1491042
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1159187.5
ns1165750
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
892375
ns892791.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2470895.5
ns2470333
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
267416
ns279542.5
ns0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI
13880635
ns11213824.5
ns1.24
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU
378473.5
ns375414
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
436458
ns436000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
438916
ns440750
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
431708
ns432000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
450167
ns449042
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
53539
ns54332
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
1016245
ns999725
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
235682
ns237743
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4143041
ns4137041.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4257999.5
ns4271042
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4266292
ns4270646
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4032437.5
ns4030959
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
253837
ns253348
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
31122046.5
ns32411933.5
ns0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1206682
ns1223273
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
9208
ns9458
ns0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
8167
ns8000
ns1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
7208
ns7209
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
13416
ns13458
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
23370
ns24044
ns0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI
2190811
ns2135292
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU
212852
ns214732
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
49416
ns49833
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
50083
ns49750
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
49541
ns49458
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
49667
ns49500
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
331181
ns335918.5
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI
12227793
ns12693187
ns0.96
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
657676
ns656617
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
123458
ns136583
ns0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
85271
ns82145.5
ns1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
127292
ns85583
ns1.49
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
108541.5
ns83104
ns1.31
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
191180.5
ns191318.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI
6110005
ns5843078
ns1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
200667
ns205972
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2014999.5
ns2013959
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1877583
ns2017792
ns0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2016083
ns2022958
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2015916
ns2019333
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
510301
ns508706
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI
27606531
ns28081381
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
943229
ns1089431
ns0.87
This comment was automatically generated by workflow using github-action-benchmark.