Skip to content

Commit

Permalink
CM7: Simplify Dilithium iNTT code
Browse files Browse the repository at this point in the history
* This commit simplifies the naive Dilithium iNTT implementations by reverting modifications to the code originally taken from pqm4 that were introduced only to accommodate shortcomings of slothy.
* The address offset fixup can now be enabled in more cases, because this commit also switches the loop type and uses the `before` tag. This improves performance.
  • Loading branch information
dop-amin committed Jan 10, 2025
1 parent 0f18e13 commit 630bf95
Show file tree
Hide file tree
Showing 3 changed files with 869 additions and 847 deletions.
6 changes: 3 additions & 3 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,7 +1605,7 @@ def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=Non
def core(self, slothy):
slothy.config.constraints.stalls_first_attempt = 16

slothy.config.unsafe_address_offset_fixup = False
slothy.config.unsafe_address_offset_fixup = True


slothy.config.variable_size = True
Expand All @@ -1616,12 +1616,12 @@ def core(self, slothy):
slothy.config.sw_pipelining.optimize_postamble = True
slothy.config.sw_pipelining.allow_pre = True

slothy.optimize_loop("layer123_loop")
slothy.optimize_loop("layer123_loop", forced_loop_type=Arch_Armv7M.BranchLoop)
slothy.optimize_loop("layer456_first_loop")
slothy.optimize_loop("layer456_loop")

slothy.config.inputs_are_outputs = True
slothy.optimize_loop("layer78_loop")
slothy.optimize_loop("layer78_loop", forced_loop_type=Arch_Armv7M.BranchLoop)

class pointwise_montgomery_dilithium(Example):
def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
Expand Down
68 changes: 34 additions & 34 deletions examples/naive/armv7m/intt_dilithium_123_456_78.s
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,9 @@ pqcrystals_dilithium_invntt_tomont:
str.w pol5, [ptr_p, #5*distance/4]
str.w pol6, [ptr_p, #6*distance/4]
str.w pol7, [ptr_p, #7*distance/4]
str.w pol0, [ptr_p], #strincr
str.w pol0, [ptr_p], #strincr // @slothy:before=cmp
vmov temp_l, s9
cmp.w ptr_p, temp_l
cmp.w ptr_p, temp_l // @slothy:id=cmp
bne.w layer123_loop

sub ptr_p, #32*strincr
Expand All @@ -248,21 +248,21 @@ pqcrystals_dilithium_invntt_tomont:
ldr.w pol3, [ptr_p, #7*distance2/4]
_3_layer_inv_butterfly_light_fast_first pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l

ldr.w pol0, [ptr_p], #128
ldr pol1, [ptr_p, #1*distance2/4-128]
ldr pol2, [ptr_p, #2*distance2/4-128]
ldr pol3, [ptr_p, #3*distance2/4-128]
ldr.w pol0, [ptr_p]
ldr pol1, [ptr_p, #1*distance2/4]
ldr pol2, [ptr_p, #2*distance2/4]
ldr pol3, [ptr_p, #3*distance2/4]
_3_layer_inv_butterfly_light_fast_second pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l

str pol1, [ptr_p, #1*distance2/4-128]
str pol2, [ptr_p, #2*distance2/4-128]
str pol3, [ptr_p, #3*distance2/4-128]
str.w pol5, [ptr_p, #5*distance2/4-128]
str.w pol6, [ptr_p, #6*distance2/4-128]
str.w pol7, [ptr_p, #7*distance2/4-128]
str pol0, [ptr_p, #-128]
str.w pol4, [ptr_p], #128
//add.w ptr_p, #strincr2
str pol1, [ptr_p, #1*distance2/4]
str pol2, [ptr_p, #2*distance2/4]
str pol3, [ptr_p, #3*distance2/4]
str.w pol4, [ptr_p, #4*distance2/4]
str.w pol5, [ptr_p, #5*distance2/4]
str.w pol6, [ptr_p, #6*distance2/4]
str.w pol7, [ptr_p, #7*distance2/4]
str pol0, [ptr_p]
add.w ptr_p, ptr_p, #strincr2

vmov temp_l, s10
cmp.w ptr_p, temp_l
Expand All @@ -281,26 +281,26 @@ pqcrystals_dilithium_invntt_tomont:
vldm ptr_zeta!, {s2-s8}
vmov s0, ptr_zeta
layer456_loop:
ldr.w pol0, [ptr_p], #128
ldr pol1, [ptr_p, #1*distance2/4-128]
ldr pol2, [ptr_p, #2*distance2/4-128]
ldr pol3, [ptr_p, #3*distance2/4-128]
ldr.w pol4, [ptr_p, #4*distance2/4-128]
ldr.w pol5, [ptr_p, #5*distance2/4-128]
ldr.w pol6, [ptr_p, #6*distance2/4-128]
ldr.w pol7, [ptr_p, #7*distance2/4-128]
ldr.w pol0, [ptr_p]
ldr pol1, [ptr_p, #1*distance2/4]
ldr pol2, [ptr_p, #2*distance2/4]
ldr pol3, [ptr_p, #3*distance2/4]
ldr.w pol4, [ptr_p, #4*distance2/4]
ldr.w pol5, [ptr_p, #5*distance2/4]
ldr.w pol6, [ptr_p, #6*distance2/4]
ldr.w pol7, [ptr_p, #7*distance2/4]

_3_layer_inv_CT_32 pol0, pol1, pol2, pol3, pol4, pol5, pol6, pol7, s2, s3, s4, s5, s6, s7, s8, zeta, qinv, q, temp_h, temp_l

str pol1, [ptr_p, #1*distance2/4-128]
str pol2, [ptr_p, #2*distance2/4-128]
str pol3, [ptr_p, #3*distance2/4-128]
str.w pol5, [ptr_p, #5*distance2/4-128]
str.w pol6, [ptr_p, #6*distance2/4-128]
str.w pol7, [ptr_p, #7*distance2/4-128]
str pol0, [ptr_p, #-128]
str.w pol4, [ptr_p], #128
//add.w ptr_p, #strincr2
str pol1, [ptr_p, #1*distance2/4]
str pol2, [ptr_p, #2*distance2/4]
str pol3, [ptr_p, #3*distance2/4]
str.w pol4, [ptr_p, #4*distance2/4]
str.w pol5, [ptr_p, #5*distance2/4]
str.w pol6, [ptr_p, #6*distance2/4]
str.w pol7, [ptr_p, #7*distance2/4]
str pol0, [ptr_p]
add.w ptr_p, ptr_p, #strincr2

vmov temp_l, s10
cmp.w ptr_p, temp_l
Expand Down Expand Up @@ -342,10 +342,10 @@ pqcrystals_dilithium_invntt_tomont:
str.w pol1, [ptr_p, #256]
str.w pol2, [ptr_p, #512]
str.w pol3, [ptr_p, #768]
str pol0, [ptr_p], #strincr3 // @slothy:core
str pol0, [ptr_p], #strincr3 // @slothy:core // @slothy:before=cmp

vmov cntr, s9
cmp.w ptr_p, cntr
cmp.w ptr_p, cntr // @slothy:id=cmp
bne.w layer78_loop

//restore registers
Expand Down
Loading

0 comments on commit 630bf95

Please sign in to comment.