From c2b610a3fb34fed80e35901168887e4b95ebe781 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 9 Jan 2025 11:39:50 +1000 Subject: [PATCH] Poly1305 ARM32 NEON: add implementation Add assembly for Poly1305 using ARM32 NEON instruction set. For Poly1305 ARM32 Base: Change name from poly1305_blocks_arm32_16 to poly1305_arm32_blocks_16 poly1305.c: ARM32 NEON - buffer up to 4 blocks x86_64 - only calculate powers of r once after key is set. test.c: poly1305 testing with multiple updates. benchmark: chacha20-poly1305 now uses AAD --- wolfcrypt/benchmark/benchmark.c | 17 +- wolfcrypt/src/poly1305.c | 48 +- .../src/port/arm/armv8-32-poly1305-asm.S | 944 ++++++++++++++++- .../src/port/arm/armv8-32-poly1305-asm_c.c | 973 +++++++++++++++++- wolfcrypt/src/port/arm/armv8-poly1305.c | 20 +- wolfcrypt/test/test.c | 25 + wolfssl/wolfcrypt/poly1305.h | 13 +- 7 files changed, 2024 insertions(+), 16 deletions(-) diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index fa18aa0bbd..aa9e47fe7d 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -770,7 +770,8 @@ #define BENCH_RNG 0x00000001 #define BENCH_SCRYPT 0x00000002 -#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) +#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \ + (defined(HAVE_CHACHA) && defined(HAVE_POLY1305)) /* Define AES_AUTH_ADD_SZ already here, since it's used in the * static declaration of `bench_Usage_msg1`. */ #if !defined(AES_AUTH_ADD_SZ) && \ @@ -1945,10 +1946,13 @@ static const char* bench_result_words2[][5] = { #define BENCH_MIN_RUNTIME_SEC 1.0F #endif +#if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) || \ + (defined(HAVE_CHACHA) && defined(HAVE_POLY1305)) + static word32 aesAuthAddSz = AES_AUTH_ADD_SZ; +#endif #if defined(HAVE_AESGCM) || defined(HAVE_AESCCM) #define AES_AUTH_TAG_SZ 16 #define BENCH_CIPHER_ADD AES_AUTH_TAG_SZ - static word32 aesAuthAddSz = AES_AUTH_ADD_SZ; #if !defined(AES_AAD_OPTIONS_DEFAULT) #if !defined(NO_MAIN_DRIVER) #define AES_AAD_OPTIONS_DEFAULT 0x1U @@ -6059,15 +6063,19 @@ void bench_chacha20_poly1305_aead(void) int ret = 0, i, count; DECLARE_MULTI_VALUE_STATS_VARS() + WC_DECLARE_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT); WC_DECLARE_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT); + WC_ALLOC_VAR(bench_additional, byte, AES_AUTH_ADD_SZ, HEAP_HINT); WC_ALLOC_VAR(authTag, byte, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE, HEAP_HINT); + XMEMSET(bench_additional, 0, AES_AUTH_ADD_SZ); XMEMSET(authTag, 0, CHACHA20_POLY1305_AEAD_AUTHTAG_SIZE); bench_stats_start(&count, &start); do { for (i = 0; i < numBlocks; i++) { - ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv, NULL, 0, - bench_plain, bench_size, bench_cipher, authTag); + ret = wc_ChaCha20Poly1305_Encrypt(bench_key, bench_iv, + bench_additional, aesAuthAddSz, bench_plain, bench_size, + bench_cipher, authTag); if (ret < 0) { printf("wc_ChaCha20Poly1305_Encrypt error: %d\n", ret); goto exit; @@ -6089,6 +6097,7 @@ void bench_chacha20_poly1305_aead(void) exit: WC_FREE_VAR(authTag, HEAP_HINT); + WC_FREE_VAR(bench_additional, HEAP_HINT); } #endif /* HAVE_CHACHA && HAVE_POLY1305 */ diff --git a/wolfcrypt/src/poly1305.c b/wolfcrypt/src/poly1305.c index 718289c4fd..48d9001dd8 100644 --- a/wolfcrypt/src/poly1305.c +++ b/wolfcrypt/src/poly1305.c @@ -529,6 +529,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) #endif poly1305_setkey_avx(ctx, key); RESTORE_VECTOR_REGISTERS(); + ctx->started = 0; #elif defined(POLY130564) /* r &= 
0xffffffc0ffffffc0ffffffc0fffffff */ @@ -813,13 +814,49 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) printf("\n"); #endif +#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_THUMB2) && \ + !defined(WOLFSSL_ARMASM_NO_NEON) + /* handle leftover */ + if (ctx->leftover) { + size_t want = sizeof(ctx->buffer) - ctx->leftover; + if (want > bytes) + want = bytes; + + for (i = 0; i < want; i++) + ctx->buffer[ctx->leftover + i] = m[i]; + bytes -= (word32)want; + m += want; + ctx->leftover += want; + if (ctx->leftover < sizeof(ctx->buffer)) { + return 0; + } + + poly1305_blocks(ctx, ctx->buffer, sizeof(ctx->buffer)); + ctx->leftover = 0; + } + + /* process full blocks */ + if (bytes >= sizeof(ctx->buffer)) { + size_t want = bytes & ~((size_t)POLY1305_BLOCK_SIZE - 1); + + poly1305_blocks(ctx, m, want); + m += want; + bytes -= (word32)want; + } + + /* store leftover */ + if (bytes) { + for (i = 0; i < bytes; i++) + ctx->buffer[ctx->leftover + i] = m[i]; + ctx->leftover += bytes; + } +#else #ifdef USE_INTEL_POLY1305_SPEEDUP #ifdef HAVE_INTEL_AVX2 if (IS_INTEL_AVX2(intel_flags)) { SAVE_VECTOR_REGISTERS(return _svr_ret;); /* handle leftover */ - if (ctx->leftover) { size_t want = sizeof(ctx->buffer) - ctx->leftover; if (want > bytes) @@ -835,8 +872,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) return 0; } - if (!ctx->started) + if (!ctx->started) { poly1305_calc_powers_avx2(ctx); + ctx->started = 1; + } poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer)); ctx->leftover = 0; } @@ -845,8 +884,10 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) if (bytes >= sizeof(ctx->buffer)) { size_t want = bytes & ~(sizeof(ctx->buffer) - 1); - if (!ctx->started) + if (!ctx->started) { poly1305_calc_powers_avx2(ctx); + ctx->started = 1; + } poly1305_blocks_avx2(ctx, m, want); m += want; bytes -= (word32)want; @@ -902,6 +943,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) ctx->leftover += bytes; } } +#endif return 0; } diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S index 731836b9e8..12dab53412 100644 --- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S @@ -34,11 +34,12 @@ #if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2) #ifndef WOLFSSL_ARMASM_INLINE #ifdef HAVE_POLY1305 +#ifdef WOLFSSL_ARMASM_NO_NEON .text .align 4 - .globl poly1305_blocks_arm32_16 - .type poly1305_blocks_arm32_16, %function -poly1305_blocks_arm32_16: + .globl poly1305_arm32_blocks_16 + .type poly1305_arm32_blocks_16, %function +poly1305_arm32_blocks_16: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} sub sp, sp, #28 cmp r2, #0 @@ -247,7 +248,7 @@ L_poly1305_arm32_16_loop: L_poly1305_arm32_16_done: add sp, sp, #28 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} - .size poly1305_blocks_arm32_16,.-poly1305_blocks_arm32_16 + .size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16 .text .type L_poly1305_arm32_clamp, %object .size L_poly1305_arm32_clamp, 16 @@ -347,6 +348,941 @@ poly1305_final: stm r9, {r4, r5, r6, r7} pop {r4, r5, r6, r7, r8, r9, pc} .size poly1305_final,.-poly1305_final +#else + .text + .align 4 + .globl poly1305_arm32_blocks_16 + .type poly1305_arm32_blocks_16, %function +poly1305_arm32_blocks_16: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub sp, sp, #28 + cmp r2, #0 + beq L_poly1305_arm32_16_done + add lr, sp, #12 + stm lr, {r0, r1, r2, r3} + # Get h pointer + add lr, r0, #16 + ldm lr, {r4, r5, r6, r7, r8} 
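The loop below implements the serial Poly1305 recurrence: each iteration adds one 16-byte block of the message (plus the padding bit 2^128 when notLast is set) into the accumulator h and multiplies by r modulo 2^130 - 5. For reference, here is a minimal portable C sketch of that per-block step using the classic five-26-bit-limb layout; the limb split and the helper names (le32, poly1305_block_ref) are illustrative and deliberately do not mirror the 32-bit register layout the assembly works with.

#include <stdint.h>

static uint32_t le32(const unsigned char* p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* h[5] and r[5] hold 26-bit limbs; hibit is (1 << 24) for full blocks and 0
 * for the padded final block (the notLast argument in the assembly). */
static void poly1305_block_ref(uint32_t h[5], const uint32_t r[5],
                               const unsigned char m[16], uint32_t hibit)
{
    const uint32_t s1 = r[1] * 5, s2 = r[2] * 5, s3 = r[3] * 5, s4 = r[4] * 5;
    uint64_t d0, d1, d2, d3, d4, c;

    /* h += m, with the padding bit on top. */
    h[0] += (le32(m +  0)     ) & 0x3ffffff;
    h[1] += (le32(m +  3) >> 2) & 0x3ffffff;
    h[2] += (le32(m +  6) >> 4) & 0x3ffffff;
    h[3] += (le32(m +  9) >> 6) & 0x3ffffff;
    h[4] += (le32(m + 12) >> 8) | hibit;

    /* h *= r, folding 2^130 == 5 (mod p) via the s* = 5*r multiples. */
    d0 = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4 + (uint64_t)h[2]*s3 +
         (uint64_t)h[3]*s2 + (uint64_t)h[4]*s1;
    d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s4 +
         (uint64_t)h[3]*s3 + (uint64_t)h[4]*s2;
    d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0] +
         (uint64_t)h[3]*s4 + (uint64_t)h[4]*s3;
    d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1] +
         (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
    d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2] +
         (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];

    /* Carry back down to 26-bit limbs (partial reduction). */
    c = d0 >> 26; h[0] = (uint32_t)d0 & 0x3ffffff; d1 += c;
    c = d1 >> 26; h[1] = (uint32_t)d1 & 0x3ffffff; d2 += c;
    c = d2 >> 26; h[2] = (uint32_t)d2 & 0x3ffffff; d3 += c;
    c = d3 >> 26; h[3] = (uint32_t)d3 & 0x3ffffff; d4 += c;
    c = d4 >> 26; h[4] = (uint32_t)d4 & 0x3ffffff;
    h[0] += (uint32_t)c * 5;
    h[1] += h[0] >> 26;
    h[0] &= 0x3ffffff;
}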
+L_poly1305_arm32_16_loop: + # Add m to h + ldr r1, [sp, #16] + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r9, [r1, #8] + ldr r10, [r1, #12] + ldr r11, [sp, #24] + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r9 + adcs r7, r7, r10 + add r1, r1, #16 + adc r8, r8, r11 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + stm lr, {r4, r5, r6, r7, r8} +#else + # h[0]-h[2] in r4-r6 for multiplication. + str r7, [lr, #12] + str r8, [lr, #16] +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + str r1, [sp, #16] + ldr r1, [sp, #12] + # Multiply h by r +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] + ldr r3, [r1] + eor r0, r0, r0 + # r[0] * h[0] + # h[0] in r4 + umull r4, r5, r3, r4 + # r[0] * h[2] + # h[2] in r6 + umull r6, r7, r3, r6 + # r[0] * h[4] + # h[4] in r8 + mul r8, r3, r8 + # r[0] * h[1] + ldr r2, [lr, #4] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[0] * h[3] + ldr r2, [lr, #12] + adds r6, r6, r12 + adc r7, r7, r0 + umlal r7, r8, r3, r2 + # r[1] * h[0] + ldr r3, [r1, #4] + ldr r2, [lr] + mov r12, r0 + umlal r5, r12, r3, r2 + # r[1] * h[1] + ldr r2, [lr, #4] + adds r6, r6, r12 + adc r12, r0, r0 + umlal r6, r12, r3, r2 + # r[1] * h[2] + ldr r2, [lr, #8] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[1] * h[3] + ldr r2, [lr, #12] + adds r8, r8, r12 + adc r9, r0, r0 + umlal r8, r9, r3, r2 + # r[1] * h[4] + ldr r2, [lr, #16] + mla r9, r3, r2, r9 + # r[2] * h[0] + ldr r3, [r1, #8] + ldr r2, [lr] + mov r12, r0 + umlal r6, r12, r3, r2 + # r[2] * h[1] + ldr r2, [lr, #4] + adds r7, r7, r12 + adc r12, r0, r0 + umlal r7, r12, r3, r2 + # r[2] * h[2] + ldr r2, [lr, #8] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[2] * h[3] + ldr r2, [lr, #12] + adds r9, r9, r12 + adc r10, r0, r0 + umlal r9, r10, r3, r2 + # r[2] * h[4] + ldr r2, [lr, #16] + mla r10, r3, r2, r10 + # r[3] * h[0] + ldr r3, [r1, #12] + ldr r2, [lr] + mov r12, r0 + umlal r7, r12, r3, r2 + # r[3] * h[1] + ldr r2, [lr, #4] + adds r8, r8, r12 + adc r12, r0, r0 + umlal r8, r12, r3, r2 + # r[3] * h[2] + ldr r2, [lr, #8] + adds r9, r9, r12 + adc r10, r10, r0 + umlal r9, r10, r3, r2 + # r[3] * h[3] + ldr r2, [lr, #12] + mov r11, r0 + umlal r10, r11, r3, r2 + # r[3] * h[4] + ldr r2, [lr, #16] + mov r12, r0 + mla r11, r3, r2, r11 +#else + ldm r1, {r0, r1, r2, r3} + # r[0] * h[0] + umull r10, r11, r0, r4 + # r[1] * h[0] + umull r12, r7, r1, r4 + # r[0] * h[1] + umaal r11, r12, r0, r5 + # r[2] * h[0] + umull r8, r9, r2, r4 + # r[1] * h[1] + umaal r12, r8, r1, r5 + # r[0] * h[2] + umaal r12, r7, r0, r6 + # r[3] * h[0] + umaal r8, r9, r3, r4 + stm sp, {r10, r11, r12} + # r[2] * h[1] + umaal r7, r8, r2, r5 + # Replace h[0] with h[3] + ldr r4, [lr, #12] + # r[1] * h[2] + umull r10, r11, r1, r6 + # r[2] * h[2] + umaal r8, r9, r2, r6 + # r[0] * h[3] + umaal r7, r10, r0, r4 + # r[3] * h[1] + umaal r8, r11, r3, r5 + # r[1] * h[3] + umaal r8, r10, r1, r4 + # r[3] * h[2] + umaal r9, r11, r3, r6 + # r[2] * h[3] + umaal r9, r10, r2, r4 + # Replace h[1] with h[4] + ldr r5, [lr, #16] + # r[3] * h[3] + umaal r10, r11, r3, r4 + mov r12, #0 + # r[0] * h[4] + umaal r8, r12, r0, r5 + # r[1] * h[4] + umaal r9, r12, r1, r5 + # r[2] * h[4] + umaal r10, r12, r2, r5 + # r[3] * h[4] + umaal r11, r12, r3, r5 + # DONE + ldm sp, {r4, r5, r6} +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + # r12 will be zero because r is masked. 
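The reduction that follows relies on the identity 2^130 == 5 (mod 2^130 - 5): the product is split into its low 130 bits and the part above, and the high part is folded back in as 4*hi + hi, so no multiplication is needed. A hedged C sketch of that fold, with illustrative names and assuming (as the comment above notes) that the clamping of r keeps the top product word zero:

#include <stdint.h>

/* x[0..7] are the 32-bit limbs of the 256-bit product; res[0..4] receives the
 * partially reduced value, with res[4] left small. */
static void poly1305_reduce_sketch(uint32_t res[5], const uint32_t x[8])
{
    uint32_t hi4[4], hi[4];
    uint64_t c = 0;
    int i;

    /* 4*hi: the limbs of x above bit 128 with their low two bits cleared. */
    hi4[0] = x[4] & ~(uint32_t)3;
    hi4[1] = x[5];
    hi4[2] = x[6];
    hi4[3] = x[7];
    /* hi itself: 4*hi shifted right by two bits. */
    for (i = 0; i < 3; i++)
        hi[i] = (hi4[i] >> 2) | (hi4[i + 1] << 30);
    hi[3] = hi4[3] >> 2;

    /* res = lo + 4*hi + hi == x (mod 2^130 - 5), carry kept in res[4]. */
    for (i = 0; i < 4; i++) {
        c += (uint64_t)x[i] + hi4[i] + hi[i];
        res[i] = (uint32_t)c;
        c >>= 32;
    }
    res[4] = (uint32_t)((x[4] & 3) + c);
}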
+ # Load length + ldr r2, [sp, #20] + # Reduce mod 2^130 - 5 + bic r3, r8, #0x3 + and r8, r8, #3 + adds r4, r4, r3 + lsr r3, r3, #2 + adcs r5, r5, r9 + orr r3, r3, r9, LSL #30 + adcs r6, r6, r10 + lsr r9, r9, #2 + adcs r7, r7, r11 + orr r9, r9, r10, LSL #30 + adc r8, r8, r12 + lsr r10, r10, #2 + adds r4, r4, r3 + orr r10, r10, r11, LSL #30 + adcs r5, r5, r9 + lsr r11, r11, #2 + adcs r6, r6, r10 + adcs r7, r7, r11 + adc r8, r8, r12 + # Sub 16 from length. + subs r2, r2, #16 + # Store length. + str r2, [sp, #20] + # Loop again if more message to do. + bgt L_poly1305_arm32_16_loop + stm lr, {r4, r5, r6, r7, r8} +L_poly1305_arm32_16_done: + add sp, sp, #28 + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16 + .text + .align 4 + .globl poly1305_arm32_blocks + .type poly1305_arm32_blocks, %function +poly1305_arm32_blocks: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vpush {d8-d15} + cmp r2, #16 + add r12, r0, #16 + bgt L_poly1305_arm32_blocks_begin_neon + ldm r12, {r7, r8, r9, r10, r11} + b L_poly1305_arm32_blocks_start_1 +L_poly1305_arm32_blocks_begin_neon: + vmov.i16 q15, #0xffff + vshr.u64 q15, q15, #38 + vld1.64 {d0-d2}, [r12] + vshl.u64 d4, d2, #24 + vsri.u64 d4, d1, #40 + vshr.u64 d3, d1, #14 + vshl.u64 d2, d1, #12 + vsri.u64 d1, d0, #26 + vsri.u64 d2, d0, #52 + vand.u64 d0, d0, d31 + vand.u64 d3, d3, d31 + vand.u64 d2, d2, d31 + vand.u64 d1, d1, d31 + add r3, r0, #0x7c + vldm.32 r3, {d20-d24} + cmp r2, #0x40 + bge L_poly1305_arm32_blocks_begin_4 + vshl.u32 d6, d21, #2 + vshl.u32 d7, d22, #2 + vshl.u32 d8, d23, #2 + vshl.u32 d9, d24, #2 + vadd.u32 d6, d6, d21 + vadd.u32 d7, d7, d22 + vadd.u32 d8, d8, d23 + vadd.u32 d9, d9, d24 + b L_poly1305_arm32_blocks_start_2 +L_poly1305_arm32_blocks_begin_4: + add r3, r0, #0xa4 + vldm.32 r3, {d26-d30} +L_poly1305_arm32_blocks_start_4: + sub r2, #0x40 + vld4.32 {d10-d13}, [r1]! + vshl.u32 d6, d27, #2 + vshl.u32 d7, d28, #2 + vshl.u32 d8, d29, #2 + vshl.u32 d9, d30, #2 + vadd.u32 d6, d6, d27 + vadd.u32 d7, d7, d28 + vadd.u32 d8, d8, d29 + vadd.u32 d9, d9, d30 + vshr.u32 d14, d13, #8 + vshl.u32 d13, d13, #18 + vorr.i32 d14, d14, #0x1000000 + vsri.u32 d13, d12, #14 + vshl.u32 d12, d12, #12 + vand.i32 d13, d13, #0x3ffffff + vsri.u32 d12, d11, #20 + vshl.u32 d11, d11, #6 + vand.i32 d12, d12, #0x3ffffff + vsri.u32 d11, d10, #26 + vand.i32 d10, d10, #0x3ffffff + vand.i32 d11, d11, #0x3ffffff + vadd.u32 d4, d4, d14 + vadd.u32 q1, q1, q6 + vadd.u32 q0, q0, q5 + vmull.u32 q5, d0, d26 + vmull.u32 q6, d0, d27 + vmull.u32 q7, d0, d28 + vmull.u32 q8, d0, d29 + vmull.u32 q9, d0, d30 + vmlal.u32 q5, d1, d9 + vmlal.u32 q6, d1, d26 + vmlal.u32 q7, d1, d27 + vmlal.u32 q8, d1, d28 + vmlal.u32 q9, d1, d29 + vmlal.u32 q5, d2, d8 + vmlal.u32 q6, d2, d9 + vmlal.u32 q7, d2, d26 + vmlal.u32 q8, d2, d27 + vmlal.u32 q9, d2, d28 + vmlal.u32 q5, d3, d7 + vmlal.u32 q6, d3, d8 + vmlal.u32 q7, d3, d9 + vmlal.u32 q8, d3, d26 + vmlal.u32 q9, d3, d27 + vmlal.u32 q5, d4, d6 + vmlal.u32 q6, d4, d7 + vmlal.u32 q7, d4, d8 + vmlal.u32 q8, d4, d9 + vmlal.u32 q9, d4, d26 + vld4.32 {d0-d3}, [r1]! 
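Each vld4.32 above pulls in two more 16-byte blocks with their words de-interleaved across NEON lanes; the shift/insert sequence a few lines down (vshr/vshl/vsri/vand plus the vorr with 0x1000000) splits every block into five base-2^26 limbs and sets the 2^128 padding bit. A scalar C sketch of that split for one block, with illustrative names (w[] holds the block's four little-endian 32-bit words):

#include <stdint.h>

/* t[0..4] become the five 26-bit limbs of the 128-bit block plus 2^128. */
static void poly1305_block_to_limbs(uint32_t t[5], const uint32_t w[4])
{
    t[0] =   w[0]                         & 0x3ffffff;
    t[1] = ((w[0] >> 26) | (w[1] <<  6))  & 0x3ffffff;
    t[2] = ((w[1] >> 20) | (w[2] << 12))  & 0x3ffffff;
    t[3] = ((w[2] >> 14) | (w[3] << 18))  & 0x3ffffff;
    t[4] =  (w[3] >>  8) | 0x1000000;   /* 2^128 lands at bit 24 of t[4] */
}

The same radix conversion, done with lsr/eor on the scalar side, appears in poly1305_set_key below when r and r^2 are converted into the 26-bit form the NEON multiplies use.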
+ vshl.u32 d6, d21, #2 + vshl.u32 d7, d22, #2 + vshl.u32 d8, d23, #2 + vshl.u32 d9, d24, #2 + vadd.u32 d6, d6, d21 + vadd.u32 d7, d7, d22 + vadd.u32 d8, d8, d23 + vadd.u32 d9, d9, d24 + vshr.u32 d4, d3, #8 + vshl.u32 d3, d3, #18 + vorr.i32 d4, d4, #0x1000000 + vsri.u32 d3, d2, #14 + vshl.u32 d2, d2, #12 + vand.i32 d3, d3, #0x3ffffff + vsri.u32 d2, d1, #20 + vshl.u32 d1, d1, #6 + vand.i32 d2, d2, #0x3ffffff + vsri.u32 d1, d0, #26 + vand.i32 d0, d0, #0x3ffffff + vand.i32 d1, d1, #0x3ffffff + vmlal.u32 q5, d0, d20 + vmlal.u32 q6, d0, d21 + vmlal.u32 q7, d0, d22 + vmlal.u32 q8, d0, d23 + vmlal.u32 q9, d0, d24 + vmlal.u32 q5, d1, d9 + vmlal.u32 q6, d1, d20 + vmlal.u32 q7, d1, d21 + vmlal.u32 q8, d1, d22 + vmlal.u32 q9, d1, d23 + vmlal.u32 q5, d2, d8 + vmlal.u32 q6, d2, d9 + vmlal.u32 q7, d2, d20 + vmlal.u32 q8, d2, d21 + vmlal.u32 q9, d2, d22 + vmlal.u32 q5, d3, d7 + vmlal.u32 q6, d3, d8 + vmlal.u32 q7, d3, d9 + vmlal.u32 q8, d3, d20 + vmlal.u32 q9, d3, d21 + vmlal.u32 q5, d4, d6 + vmlal.u32 q6, d4, d7 + vmlal.u32 q7, d4, d8 + vmlal.u32 q8, d4, d9 + vmlal.u32 q9, d4, d20 + vadd.u64 d0, d10, d11 + vadd.u64 d1, d12, d13 + vadd.u64 d2, d14, d15 + vadd.u64 d3, d16, d17 + vadd.u64 d4, d18, d19 + vsra.u64 d1, d0, #26 + vand.u64 d0, d0, d31 + vsra.u64 d2, d1, #26 + vand.u64 d1, d1, d31 + vsra.u64 d3, d2, #26 + vand.u64 d2, d2, d31 + vsra.u64 d4, d3, #26 + vand.u64 d3, d3, d31 + vshr.u64 d15, d4, #26 + vand.u64 d4, d4, d31 + vadd.u64 d0, d0, d15 + vshl.u64 d15, d15, #2 + vadd.u64 d0, d0, d15 + vsra.u64 d1, d0, #26 + vand.u64 d0, d0, d31 + cmp r2, #0x40 + bge L_poly1305_arm32_blocks_start_4 + cmp r2, #32 + blt L_poly1305_arm32_blocks_done_neon +L_poly1305_arm32_blocks_start_2: + sub r2, #32 + vld4.32 {d10-d13}, [r1]! + vshr.u32 d14, d13, #8 + vshl.u32 d13, d13, #18 + vorr.i32 d14, d14, #0x1000000 + vsri.u32 d13, d12, #14 + vshl.u32 d12, d12, #12 + vand.i32 d13, d13, #0x3ffffff + vsri.u32 d12, d11, #20 + vshl.u32 d11, d11, #6 + vand.i32 d12, d12, #0x3ffffff + vsri.u32 d11, d10, #26 + vand.i32 d10, d10, #0x3ffffff + vand.i32 d11, d11, #0x3ffffff + vadd.u32 d4, d4, d14 + vadd.u32 q1, q1, q6 + vadd.u32 q0, q0, q5 + vmull.u32 q5, d0, d20 + vmull.u32 q6, d0, d21 + vmull.u32 q7, d0, d22 + vmull.u32 q8, d0, d23 + vmull.u32 q9, d0, d24 + vmlal.u32 q5, d1, d9 + vmlal.u32 q6, d1, d20 + vmlal.u32 q7, d1, d21 + vmlal.u32 q8, d1, d22 + vmlal.u32 q9, d1, d23 + vmlal.u32 q5, d2, d8 + vmlal.u32 q6, d2, d9 + vmlal.u32 q7, d2, d20 + vmlal.u32 q8, d2, d21 + vmlal.u32 q9, d2, d22 + vmlal.u32 q5, d3, d7 + vmlal.u32 q6, d3, d8 + vmlal.u32 q7, d3, d9 + vmlal.u32 q8, d3, d20 + vmlal.u32 q9, d3, d21 + vmlal.u32 q5, d4, d6 + vmlal.u32 q6, d4, d7 + vmlal.u32 q7, d4, d8 + vmlal.u32 q8, d4, d9 + vmlal.u32 q9, d4, d20 + vadd.u64 d0, d10, d11 + vadd.u64 d1, d12, d13 + vadd.u64 d2, d14, d15 + vadd.u64 d3, d16, d17 + vadd.u64 d4, d18, d19 + vsra.u64 d1, d0, #26 + vand.u64 d0, d0, d31 + vsra.u64 d2, d1, #26 + vand.u64 d1, d1, d31 + vsra.u64 d3, d2, #26 + vand.u64 d2, d2, d31 + vsra.u64 d4, d3, #26 + vand.u64 d3, d3, d31 + vshr.u64 d5, d4, #26 + vand.u64 d4, d4, d31 + vadd.u64 d0, d0, d5 + vshl.u64 d5, d5, #2 + vadd.u64 d0, d0, d5 + vsra.u64 d1, d0, #26 + vand.u64 d0, d0, d31 +L_poly1305_arm32_blocks_done_neon: + cmp r2, #16 + beq L_poly1305_arm32_blocks_begin_1 + add r12, r0, #16 + vsli.u64 d0, d1, #26 + vsli.u64 d0, d2, #52 + vshr.u64 d1, d2, #12 + vsli.u64 d1, d3, #14 + vsli.u64 d1, d4, #40 + vshr.u64 d2, d4, #24 + vst1.64 {d0-d2}, [r12] + b L_poly1305_arm32_blocks_done +L_poly1305_arm32_blocks_begin_1: + vsli.u64 d0, d1, #26 + 
vsli.u64 d0, d2, #52 + vshr.u64 d1, d2, #12 + vsli.u64 d1, d3, #14 + vsli.u64 d1, d4, #40 + vshr.u64 d2, d4, #24 + vmov r7, r8, d0 + vmov r9, r10, d1 + vmov r11, d2[0] +L_poly1305_arm32_blocks_start_1: + mov r12, #1 + push {r2} + # Load message + ldm r1, {r2, r3, r4, r5} + # Add message + adds r7, r7, r2 + adcs r8, r8, r3 + adcs r9, r9, r4 + adcs r10, r10, r5 + adc r11, r11, r12 + push {r0, r1} + add r1, r0, #0 + add lr, r0, #16 +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + stm lr, {r7, r8, r9, r10, r11} +#else + # h[0]-h[2] in r4-r6 for multiplication. + str r10, [lr, #12] + str r11, [lr, #16] +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] + ldr r3, [r1] + eor r0, r0, r0 + # r[0] * h[0] + # h[0] in r4 + umull r7, r8, r3, r7 + # r[0] * h[2] + # h[2] in r6 + umull r9, r10, r3, r9 + # r[0] * h[4] + # h[4] in r8 + mul r11, r3, r11 + # r[0] * h[1] + ldr r2, [lr, #4] + mov r12, r0 + umlal r8, r12, r3, r2 + # r[0] * h[3] + ldr r2, [lr, #12] + adds r9, r9, r12 + adc r10, r10, r0 + umlal r10, r11, r3, r2 + # r[1] * h[0] + ldr r3, [r1, #4] + ldr r2, [lr] + mov r12, r0 + umlal r8, r12, r3, r2 + # r[1] * h[1] + ldr r2, [lr, #4] + adds r9, r9, r12 + adc r12, r0, r0 + umlal r9, r12, r3, r2 + # r[1] * h[2] + ldr r2, [lr, #8] + adds r10, r10, r12 + adc r12, r0, r0 + umlal r10, r12, r3, r2 + # r[1] * h[3] + ldr r2, [lr, #12] + adds r11, r11, r12 + adc r4, r0, r0 + umlal r11, r4, r3, r2 + # r[1] * h[4] + ldr r2, [lr, #16] + mla r4, r3, r2, r4 + # r[2] * h[0] + ldr r3, [r1, #8] + ldr r2, [lr] + mov r12, r0 + umlal r9, r12, r3, r2 + # r[2] * h[1] + ldr r2, [lr, #4] + adds r10, r10, r12 + adc r12, r0, r0 + umlal r10, r12, r3, r2 + # r[2] * h[2] + ldr r2, [lr, #8] + adds r11, r11, r12 + adc r12, r0, r0 + umlal r11, r12, r3, r2 + # r[2] * h[3] + ldr r2, [lr, #12] + adds r4, r4, r12 + adc r5, r0, r0 + umlal r4, r5, r3, r2 + # r[2] * h[4] + ldr r2, [lr, #16] + mla r5, r3, r2, r5 + # r[3] * h[0] + ldr r3, [r1, #12] + ldr r2, [lr] + mov r12, r0 + umlal r10, r12, r3, r2 + # r[3] * h[1] + ldr r2, [lr, #4] + adds r11, r11, r12 + adc r12, r0, r0 + umlal r11, r12, r3, r2 + # r[3] * h[2] + ldr r2, [lr, #8] + adds r4, r4, r12 + adc r5, r5, r0 + umlal r4, r5, r3, r2 + # r[3] * h[3] + ldr r2, [lr, #12] + mov r6, r0 + umlal r5, r6, r3, r2 + # r[3] * h[4] + ldr r2, [lr, #16] + mov r12, r0 + mla r6, r3, r2, r6 +#else + sub sp, sp, #12 + ldm r1, {r0, r1, r2, r3} + # r[0] * h[0] + umull r5, r6, r0, r7 + # r[1] * h[0] + umull r12, r10, r1, r7 + # r[0] * h[1] + umaal r6, r12, r0, r8 + # r[2] * h[0] + umull r11, r4, r2, r7 + # r[1] * h[1] + umaal r12, r11, r1, r8 + # r[0] * h[2] + umaal r12, r10, r0, r9 + # r[3] * h[0] + umaal r11, r4, r3, r7 + stm sp, {r5, r6, r12} + # r[2] * h[1] + umaal r10, r11, r2, r8 + # Replace h[0] with h[3] + ldr r7, [lr, #12] + # r[1] * h[2] + umull r5, r6, r1, r9 + # r[2] * h[2] + umaal r11, r4, r2, r9 + # r[0] * h[3] + umaal r10, r5, r0, r7 + # r[3] * h[1] + umaal r11, r6, r3, r8 + # r[1] * h[3] + umaal r11, r5, r1, r7 + # r[3] * h[2] + umaal r4, r6, r3, r9 + # r[2] * h[3] + umaal r4, r5, r2, r7 + # Replace h[1] with h[4] + ldr r8, [lr, #16] + # r[3] * h[3] + umaal r5, r6, r3, r7 + mov r12, #0 + # r[0] * h[4] + umaal r11, r12, r0, r8 + # r[1] * h[4] + umaal r4, r12, r1, r8 + # r[2] * h[4] + umaal r5, r12, r2, r8 + # r[3] * h[4] + umaal r6, r12, r3, r8 + # DONE + ldm sp, {r7, r8, r9} + add sp, sp, #12 +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + # Reduce mod 2^130 - 
5 + bic r3, r11, #0x3 + and r11, r11, #3 + adds r7, r7, r3 + lsr r3, r3, #2 + adcs r8, r8, r4 + orr r3, r3, r4, LSL #30 + adcs r9, r9, r5 + lsr r4, r4, #2 + adcs r10, r10, r6 + orr r4, r4, r5, LSL #30 + adc r11, r11, r12 + lsr r5, r5, #2 + adds r7, r7, r3 + orr r5, r5, r6, LSL #30 + adcs r8, r8, r4 + lsr r6, r6, #2 + adcs r9, r9, r5 + adcs r10, r10, r6 + adc r11, r11, r12 + pop {r0, r1} + pop {r2} + add r12, r0, #16 + stm r12, {r7, r8, r9, r10, r11} +L_poly1305_arm32_blocks_done: + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size poly1305_arm32_blocks,.-poly1305_arm32_blocks + .text + .type L_poly1305_arm32_clamp, %object + .size L_poly1305_arm32_clamp, 16 + .align 4 +L_poly1305_arm32_clamp: + .word 0xfffffff + .word 0xffffffc + .word 0xffffffc + .word 0xffffffc + .text + .align 4 + .globl poly1305_set_key + .type poly1305_set_key, %function +poly1305_set_key: + push {r4, r5, r6, r7, r8, r9, r10, r11, lr} + vpush {d8-d15} + # Load mask. + adr lr, L_poly1305_arm32_clamp + ldm lr, {r6, r7, r8, r9} + # Load and cache padding. + ldr r2, [r1, #16] + ldr r3, [r1, #20] + ldr r4, [r1, #24] + ldr r5, [r1, #28] + add lr, r0, #40 + stm lr, {r2, r3, r4, r5} + # Load, mask and store r. + ldr r2, [r1] + ldr r3, [r1, #4] + ldr r4, [r1, #8] + ldr r5, [r1, #12] + and r2, r2, r6 + and r3, r3, r7 + and r4, r4, r8 + and r5, r5, r9 + add lr, r0, #0 + stm lr, {r2, r3, r4, r5} + vmov.i16 q10, #0xffff + vshr.u64 q10, q10, #38 + lsr r8, r2, #26 + lsr r9, r3, #20 + lsr r10, r4, #14 + lsr r11, r5, #8 + eor r8, r8, r3, lsl #6 + eor r9, r9, r4, lsl #12 + eor r10, r10, r5, lsl #18 + and r7, r2, #0x3ffffff + and r8, r8, #0x3ffffff + and r9, r9, #0x3ffffff + and r10, r10, #0x3ffffff + vmov.i32 s1, r7 + vmov.i32 s3, r8 + vmov.i32 s5, r9 + vmov.i32 s7, r10 + vmov.i32 s9, r11 + push {r0, r1} +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + # Square r + umull r1, r6, r2, r3 + mov r12, #0 + umull r7, r8, r2, r5 + mov lr, r12 + umlal r6, lr, r2, r4 + adds r7, r7, lr + adc lr, r12, r12 + umlal r7, lr, r3, r4 + mov r9, r12 + umlal lr, r9, r3, r5 + adds r8, r8, lr + adcs r9, r9, r12 + adc r10, r12, r12 + umlal r9, r10, r4, r5 + adds r1, r1, r1 + adcs r6, r6, r6 + adcs r7, r7, r7 + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + adc r11, r12, r12 + umull r0, lr, r2, r2 + adds r1, r1, lr + adcs r6, r6, r12 + adc lr, r12, r12 + umlal r6, lr, r3, r3 + adds r7, r7, lr + adcs r8, r8, r12 + adc lr, r12, r12 + umlal r8, lr, r4, r4 + adds r9, r9, lr + adcs r10, r10, r12 + adc r11, r11, r12 + umlal r10, r11, r5, r5 +#else + umull r0, r1, r2, r2 + umull r6, r7, r2, r3 + adds r6, r6, r6 + mov r12, #0 + umaal r1, r6, r12, r12 + mov r8, r12 + umaal r8, r7, r2, r4 + adcs r8, r8, r8 + umaal r6, r8, r3, r3 + umull r9, r10, r2, r5 + umaal r7, r9, r3, r4 + adcs r7, r7, r7 + umaal r7, r8, r12, r12 + umaal r10, r9, r3, r5 + adcs r10, r10, r10 + umaal r8, r10, r4, r4 + mov r11, r12 + umaal r9, r11, r4, r5 + adcs r9, r9, r9 + umaal r9, r10, r12, r12 + adcs r11, r11, r11 + umaal r10, r11, r5, r5 + adc r11, r11, r12 +#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */ + # Reduce mod 2^130 - 5 + bic r2, r8, #0x3 + and r8, r8, #3 + adds r0, r0, r2 + lsr r2, r2, #2 + adcs r1, r1, r9 + orr r2, r2, r9, LSL #30 + adcs r6, r6, r10 + lsr r9, r9, #2 + adcs r7, r7, r11 + orr r9, r9, r10, LSL #30 + adc r8, r8, r12 + lsr r10, r10, #2 + adds r0, r0, r2 + orr r10, r10, r11, LSL #30 + adcs r1, r1, r9 + lsr r11, r11, #2 + adcs r6, r6, r10 + adcs r7, r7, r11 + adc r8, r8, r12 + lsr r3, r0, #26 + lsr r4, r1, #20 + lsr r5, 
r6, #14 + lsr r10, r7, #8 + eor r3, r3, r1, lsl #6 + eor r4, r4, r6, lsl #12 + eor r5, r5, r7, lsl #18 + eor r10, r10, r8, lsl #24 + and r2, r0, #0x3ffffff + and r3, r3, #0x3ffffff + and r4, r4, #0x3ffffff + and r5, r5, #0x3ffffff + vmov.i32 s0, r2 + vmov.i32 s2, r3 + vmov.i32 s4, r4 + vmov.i32 s6, r5 + vmov.i32 s8, r10 + pop {r0, r1} + add lr, r0, #0x7c + vstm.32 lr, {d0-d4} + # Multiply r^2, r by r^2 + vshl.u32 d6, d1, #2 + vshl.u32 d7, d2, #2 + vshl.u32 d8, d3, #2 + vshl.u32 d9, d4, #2 + vadd.u32 d6, d6, d1 + vadd.u32 d7, d7, d2 + vadd.u32 d8, d8, d3 + vadd.u32 d9, d9, d4 + vmull.u32 q5, d0, d0[0] + vmull.u32 q6, d0, d1[0] + vmull.u32 q7, d0, d2[0] + vmull.u32 q8, d0, d3[0] + vmull.u32 q9, d0, d4[0] + vmlal.u32 q5, d1, d9[0] + vmlal.u32 q6, d1, d0[0] + vmlal.u32 q7, d1, d1[0] + vmlal.u32 q8, d1, d2[0] + vmlal.u32 q9, d1, d3[0] + vmlal.u32 q5, d2, d8[0] + vmlal.u32 q6, d2, d9[0] + vmlal.u32 q7, d2, d0[0] + vmlal.u32 q8, d2, d1[0] + vmlal.u32 q9, d2, d2[0] + vmlal.u32 q5, d3, d7[0] + vmlal.u32 q6, d3, d8[0] + vmlal.u32 q7, d3, d9[0] + vmlal.u32 q8, d3, d0[0] + vmlal.u32 q9, d3, d1[0] + vmlal.u32 q5, d4, d6[0] + vmlal.u32 q6, d4, d7[0] + vmlal.u32 q7, d4, d8[0] + vmlal.u32 q8, d4, d9[0] + vmlal.u32 q9, d4, d0[0] + vsra.u64 q6, q5, #26 + vand.u64 q5, q5, q10 + vsra.u64 q7, q6, #26 + vand.u64 q6, q6, q10 + vsra.u64 q8, q7, #26 + vand.u64 q7, q7, q10 + vsra.u64 q9, q8, #26 + vand.u64 q8, q8, q10 + vshr.u64 q3, q9, #26 + vand.u64 q9, q9, q10 + vadd.u64 q5, q5, q3 + vshl.u64 q3, q3, #2 + vadd.u64 q5, q5, q3 + vsra.u64 q6, q5, #26 + vand.u64 q5, q5, q10 + vmovn.i64 d10, q5 + vmovn.i64 d11, q6 + vmovn.i64 d12, q7 + vmovn.i64 d13, q8 + vmovn.i64 d14, q9 + add lr, r0, #0xa4 + vstm.32 lr, {d10-d14} + # h (accumulator) = 0 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + eor r9, r9, r9 + add lr, r0, #16 + eor r4, r4, r4 + eor r5, r5, r5 + stm lr, {r4, r5, r6, r7, r8, r9} + # Zero leftover + str r5, [r0, #56] + vpop {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} + .size poly1305_set_key,.-poly1305_set_key + .text + .align 4 + .globl poly1305_final + .type poly1305_final, %function +poly1305_final: + push {r4, r5, r6, r7, r8, r9, lr} + add r9, r0, #16 + ldm r9, {r4, r5, r6, r7, r8} + # Add 5 and check for h larger than p. + adds r2, r4, #5 + adcs r2, r5, #0 + adcs r2, r6, #0 + adcs r2, r7, #0 + adc r2, r8, #0 + sub r2, r2, #4 + lsr r2, r2, #31 + sub r2, r2, #1 + and r2, r2, #5 + # Add 0/5 to h. + adds r4, r4, r2 + adcs r5, r5, #0 + adcs r6, r6, #0 + adc r7, r7, #0 + # Add padding + add r9, r0, #40 + ldm r9, {r2, r3, r12, lr} + adds r4, r4, r2 + adcs r5, r5, r3 + adcs r6, r6, r12 + adc r7, r7, lr + # Store MAC + str r4, [r1] + str r5, [r1, #4] + str r6, [r1, #8] + str r7, [r1, #12] + # Zero out h. + eor r4, r4, r4 + eor r5, r5, r5 + eor r6, r6, r6 + eor r7, r7, r7 + eor r8, r8, r8 + add r9, r0, #16 + stm r9, {r4, r5, r6, r7, r8} + # Zero out r. + add r9, r0, #0 + stm r9, {r4, r5, r6, r7} + # Zero out padding. 
+ add r9, r0, #40 + stm r9, {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r9, pc} + .size poly1305_final,.-poly1305_final +#endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* HAVE_POLY1305 */ #endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c index d12e4c19bf..1c53b71806 100644 --- a/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-32-poly1305-asm_c.c @@ -50,9 +50,10 @@ #define __volatile__ volatile #endif /* __KEIL__ */ #ifdef HAVE_POLY1305 +#ifdef WOLFSSL_ARMASM_NO_NEON #include -void poly1305_blocks_arm32_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, +void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, int notLast_p) { register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; @@ -383,6 +384,976 @@ void poly1305_final(Poly1305* ctx_p, byte* mac_p) ); } +#else +void poly1305_arm32_blocks_16(Poly1305* ctx_p, const byte* m_p, word32 len_p, + int notLast_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* m asm ("r1") = (const byte*)m_p; + register word32 len asm ("r2") = (word32)len_p; + register int notLast asm ("r3") = (int)notLast_p; + + __asm__ __volatile__ ( + "sub sp, sp, #28\n\t" + "cmp %[len], #0\n\t" + "beq L_poly1305_arm32_16_done_%=\n\t" + "add lr, sp, #12\n\t" + "stm lr, {r0, r1, r2, r3}\n\t" + /* Get h pointer */ + "add lr, %[ctx], #16\n\t" + "ldm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_loop_%=: \n\t" + /* Add m to h */ + "ldr %[m], [sp, #16]\n\t" + "ldr %[len], [%[m]]\n\t" + "ldr %[notLast], [%[m], #4]\n\t" + "ldr r9, [%[m], #8]\n\t" + "ldr r10, [%[m], #12]\n\t" + "ldr r11, [sp, #24]\n\t" + "adds r4, r4, %[len]\n\t" + "adcs r5, r5, %[notLast]\n\t" + "adcs r6, r6, r9\n\t" + "adcs r7, r7, r10\n\t" + "add %[m], %[m], #16\n\t" + "adc r8, r8, r11\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "stm lr, {r4, r5, r6, r7, r8}\n\t" +#else + /* h[0]-h[2] in r4-r6 for multiplication. 
*/ + "str r7, [lr, #12]\n\t" + "str r8, [lr, #16]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + "str %[m], [sp, #16]\n\t" + "ldr %[m], [sp, #12]\n\t" + /* Multiply h by r */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + "ldr %[notLast], [%[m]]\n\t" + "eor %[ctx], %[ctx], %[ctx]\n\t" + /* r[0] * h[0] */ + /* h[0] in r4 */ + "umull r4, r5, %[notLast], r4\n\t" + /* r[0] * h[2] */ + /* h[2] in r6 */ + "umull r6, r7, %[notLast], r6\n\t" + /* r[0] * h[4] */ + /* h[4] in r8 */ + "mul r8, %[notLast], r8\n\t" + /* r[0] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[0] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r6, r6, r12\n\t" + "adc r7, r7, %[ctx]\n\t" + "umlal r7, r8, %[notLast], %[len]\n\t" + /* r[1] * h[0] */ + "ldr %[notLast], [%[m], #4]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r5, r12, %[notLast], %[len]\n\t" + /* r[1] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r6, r6, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[1] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[1] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r8, r8, r12\n\t" + "adc r9, %[ctx], %[ctx]\n\t" + "umlal r8, r9, %[notLast], %[len]\n\t" + /* r[1] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r9, %[notLast], %[len], r9\n\t" + /* r[2] * h[0] */ + "ldr %[notLast], [%[m], #8]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r6, r12, %[notLast], %[len]\n\t" + /* r[2] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r7, r7, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[2] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[2] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, %[ctx], %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[2] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mla r10, %[notLast], %[len], r10\n\t" + /* r[3] * h[0] */ + "ldr %[notLast], [%[m], #12]\n\t" + "ldr %[len], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r7, r12, %[notLast], %[len]\n\t" + /* r[3] * h[1] */ + "ldr %[len], [lr, #4]\n\t" + "adds r8, r8, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r8, r12, %[notLast], %[len]\n\t" + /* r[3] * h[2] */ + "ldr %[len], [lr, #8]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, r10, %[ctx]\n\t" + "umlal r9, r10, %[notLast], %[len]\n\t" + /* r[3] * h[3] */ + "ldr %[len], [lr, #12]\n\t" + "mov r11, %[ctx]\n\t" + "umlal r10, r11, %[notLast], %[len]\n\t" + /* r[3] * h[4] */ + "ldr %[len], [lr, #16]\n\t" + "mov r12, %[ctx]\n\t" + "mla r11, %[notLast], %[len], r11\n\t" +#else + "ldm %[m], {r0, r1, r2, r3}\n\t" + /* r[0] * h[0] */ + "umull r10, r11, %[ctx], r4\n\t" + /* r[1] * h[0] */ + "umull r12, r7, %[m], r4\n\t" + /* r[0] * h[1] */ + "umaal r11, r12, %[ctx], r5\n\t" + /* r[2] * h[0] */ + "umull r8, r9, %[len], r4\n\t" + /* r[1] * h[1] */ + "umaal r12, r8, %[m], r5\n\t" + /* r[0] * h[2] */ + "umaal r12, r7, %[ctx], r6\n\t" + /* r[3] * h[0] */ + "umaal r8, r9, %[notLast], r4\n\t" + "stm sp, {r10, r11, r12}\n\t" + /* r[2] * h[1] */ + "umaal r7, r8, %[len], r5\n\t" + /* Replace h[0] with h[3] */ + "ldr r4, [lr, #12]\n\t" + /* r[1] * h[2] */ + "umull r10, r11, %[m], r6\n\t" + /* r[2] * h[2] */ + "umaal r8, r9, %[len], r6\n\t" + /* 
r[0] * h[3] */ + "umaal r7, r10, %[ctx], r4\n\t" + /* r[3] * h[1] */ + "umaal r8, r11, %[notLast], r5\n\t" + /* r[1] * h[3] */ + "umaal r8, r10, %[m], r4\n\t" + /* r[3] * h[2] */ + "umaal r9, r11, %[notLast], r6\n\t" + /* r[2] * h[3] */ + "umaal r9, r10, %[len], r4\n\t" + /* Replace h[1] with h[4] */ + "ldr r5, [lr, #16]\n\t" + /* r[3] * h[3] */ + "umaal r10, r11, %[notLast], r4\n\t" + "mov r12, #0\n\t" + /* r[0] * h[4] */ + "umaal r8, r12, %[ctx], r5\n\t" + /* r[1] * h[4] */ + "umaal r9, r12, %[m], r5\n\t" + /* r[2] * h[4] */ + "umaal r10, r12, %[len], r5\n\t" + /* r[3] * h[4] */ + "umaal r11, r12, %[notLast], r5\n\t" + /* DONE */ + "ldm sp, {r4, r5, r6}\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + /* r12 will be zero because r is masked. */ + /* Load length */ + "ldr %[len], [sp, #20]\n\t" + /* Reduce mod 2^130 - 5 */ + "bic %[notLast], r8, #0x3\n\t" + "and r8, r8, #3\n\t" + "adds r4, r4, %[notLast]\n\t" + "lsr %[notLast], %[notLast], #2\n\t" + "adcs r5, r5, r9\n\t" + "orr %[notLast], %[notLast], r9, LSL #30\n\t" + "adcs r6, r6, r10\n\t" + "lsr r9, r9, #2\n\t" + "adcs r7, r7, r11\n\t" + "orr r9, r9, r10, LSL #30\n\t" + "adc r8, r8, r12\n\t" + "lsr r10, r10, #2\n\t" + "adds r4, r4, %[notLast]\n\t" + "orr r10, r10, r11, LSL #30\n\t" + "adcs r5, r5, r9\n\t" + "lsr r11, r11, #2\n\t" + "adcs r6, r6, r10\n\t" + "adcs r7, r7, r11\n\t" + "adc r8, r8, r12\n\t" + /* Sub 16 from length. */ + "subs %[len], %[len], #16\n\t" + /* Store length. */ + "str %[len], [sp, #20]\n\t" + /* Loop again if more message to do. */ + "bgt L_poly1305_arm32_16_loop_%=\n\t" + "stm lr, {r4, r5, r6, r7, r8}\n\t" + "\n" + "L_poly1305_arm32_16_done_%=: \n\t" + "add sp, sp, #28\n\t" + : [ctx] "+r" (ctx), [m] "+r" (m), [len] "+r" (len), + [notLast] "+r" (notLast) + : + : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11" + ); +} + +static void poly1305_arm32_blocks(Poly1305* ctx_p, const unsigned char* m_p, + size_t bytes_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const unsigned char* m asm ("r1") = (const unsigned char*)m_p; + register size_t bytes asm ("r2") = (size_t)bytes_p; + + __asm__ __volatile__ ( + "cmp %[bytes], #16\n\t" + "add r12, %[ctx], #16\n\t" + "bgt L_poly1305_arm32_blocks_begin_neon_%=\n\t" + "ldm r12, {r7, r8, r9, r10, r11}\n\t" + "b L_poly1305_arm32_blocks_start_1_%=\n\t" + "\n" + "L_poly1305_arm32_blocks_begin_neon_%=: \n\t" + "vmov.i16 q15, #0xffff\n\t" + "vshr.u64 q15, q15, #38\n\t" + "vld1.64 {d0-d2}, [r12]\n\t" + "vshl.u64 d4, d2, #24\n\t" + "vsri.u64 d4, d1, #40\n\t" + "vshr.u64 d3, d1, #14\n\t" + "vshl.u64 d2, d1, #12\n\t" + "vsri.u64 d1, d0, #26\n\t" + "vsri.u64 d2, d0, #52\n\t" + "vand.u64 d0, d0, d31\n\t" + "vand.u64 d3, d3, d31\n\t" + "vand.u64 d2, d2, d31\n\t" + "vand.u64 d1, d1, d31\n\t" + "add r3, %[ctx], #0x7c\n\t" + "vldm.32 r3, {d20-d24}\n\t" + "cmp %[bytes], #0x40\n\t" + "bge L_poly1305_arm32_blocks_begin_4_%=\n\t" + "vshl.u32 d6, d21, #2\n\t" + "vshl.u32 d7, d22, #2\n\t" + "vshl.u32 d8, d23, #2\n\t" + "vshl.u32 d9, d24, #2\n\t" + "vadd.u32 d6, d6, d21\n\t" + "vadd.u32 d7, d7, d22\n\t" + "vadd.u32 d8, d8, d23\n\t" + "vadd.u32 d9, d9, d24\n\t" + "b L_poly1305_arm32_blocks_start_2_%=\n\t" + "\n" + "L_poly1305_arm32_blocks_begin_4_%=: \n\t" + "add r3, %[ctx], #0xa4\n\t" + "vldm.32 r3, {d26-d30}\n\t" + "\n" + "L_poly1305_arm32_blocks_start_4_%=: \n\t" + "sub %[bytes], #0x40\n\t" + "vld4.32 {d10-d13}, [%[m]]!\n\t" + "vshl.u32 d6, d27, #2\n\t" + "vshl.u32 d7, d28, #2\n\t" + "vshl.u32 d8, d29, #2\n\t" + "vshl.u32 d9, 
d30, #2\n\t" + "vadd.u32 d6, d6, d27\n\t" + "vadd.u32 d7, d7, d28\n\t" + "vadd.u32 d8, d8, d29\n\t" + "vadd.u32 d9, d9, d30\n\t" + "vshr.u32 d14, d13, #8\n\t" + "vshl.u32 d13, d13, #18\n\t" + "vorr.i32 d14, d14, #0x1000000\n\t" + "vsri.u32 d13, d12, #14\n\t" + "vshl.u32 d12, d12, #12\n\t" + "vand.i32 d13, d13, #0x3ffffff\n\t" + "vsri.u32 d12, d11, #20\n\t" + "vshl.u32 d11, d11, #6\n\t" + "vand.i32 d12, d12, #0x3ffffff\n\t" + "vsri.u32 d11, d10, #26\n\t" + "vand.i32 d10, d10, #0x3ffffff\n\t" + "vand.i32 d11, d11, #0x3ffffff\n\t" + "vadd.u32 d4, d4, d14\n\t" + "vadd.u32 q1, q1, q6\n\t" + "vadd.u32 q0, q0, q5\n\t" + "vmull.u32 q5, d0, d26\n\t" + "vmull.u32 q6, d0, d27\n\t" + "vmull.u32 q7, d0, d28\n\t" + "vmull.u32 q8, d0, d29\n\t" + "vmull.u32 q9, d0, d30\n\t" + "vmlal.u32 q5, d1, d9\n\t" + "vmlal.u32 q6, d1, d26\n\t" + "vmlal.u32 q7, d1, d27\n\t" + "vmlal.u32 q8, d1, d28\n\t" + "vmlal.u32 q9, d1, d29\n\t" + "vmlal.u32 q5, d2, d8\n\t" + "vmlal.u32 q6, d2, d9\n\t" + "vmlal.u32 q7, d2, d26\n\t" + "vmlal.u32 q8, d2, d27\n\t" + "vmlal.u32 q9, d2, d28\n\t" + "vmlal.u32 q5, d3, d7\n\t" + "vmlal.u32 q6, d3, d8\n\t" + "vmlal.u32 q7, d3, d9\n\t" + "vmlal.u32 q8, d3, d26\n\t" + "vmlal.u32 q9, d3, d27\n\t" + "vmlal.u32 q5, d4, d6\n\t" + "vmlal.u32 q6, d4, d7\n\t" + "vmlal.u32 q7, d4, d8\n\t" + "vmlal.u32 q8, d4, d9\n\t" + "vmlal.u32 q9, d4, d26\n\t" + "vld4.32 {d0-d3}, [%[m]]!\n\t" + "vshl.u32 d6, d21, #2\n\t" + "vshl.u32 d7, d22, #2\n\t" + "vshl.u32 d8, d23, #2\n\t" + "vshl.u32 d9, d24, #2\n\t" + "vadd.u32 d6, d6, d21\n\t" + "vadd.u32 d7, d7, d22\n\t" + "vadd.u32 d8, d8, d23\n\t" + "vadd.u32 d9, d9, d24\n\t" + "vshr.u32 d4, d3, #8\n\t" + "vshl.u32 d3, d3, #18\n\t" + "vorr.i32 d4, d4, #0x1000000\n\t" + "vsri.u32 d3, d2, #14\n\t" + "vshl.u32 d2, d2, #12\n\t" + "vand.i32 d3, d3, #0x3ffffff\n\t" + "vsri.u32 d2, d1, #20\n\t" + "vshl.u32 d1, d1, #6\n\t" + "vand.i32 d2, d2, #0x3ffffff\n\t" + "vsri.u32 d1, d0, #26\n\t" + "vand.i32 d0, d0, #0x3ffffff\n\t" + "vand.i32 d1, d1, #0x3ffffff\n\t" + "vmlal.u32 q5, d0, d20\n\t" + "vmlal.u32 q6, d0, d21\n\t" + "vmlal.u32 q7, d0, d22\n\t" + "vmlal.u32 q8, d0, d23\n\t" + "vmlal.u32 q9, d0, d24\n\t" + "vmlal.u32 q5, d1, d9\n\t" + "vmlal.u32 q6, d1, d20\n\t" + "vmlal.u32 q7, d1, d21\n\t" + "vmlal.u32 q8, d1, d22\n\t" + "vmlal.u32 q9, d1, d23\n\t" + "vmlal.u32 q5, d2, d8\n\t" + "vmlal.u32 q6, d2, d9\n\t" + "vmlal.u32 q7, d2, d20\n\t" + "vmlal.u32 q8, d2, d21\n\t" + "vmlal.u32 q9, d2, d22\n\t" + "vmlal.u32 q5, d3, d7\n\t" + "vmlal.u32 q6, d3, d8\n\t" + "vmlal.u32 q7, d3, d9\n\t" + "vmlal.u32 q8, d3, d20\n\t" + "vmlal.u32 q9, d3, d21\n\t" + "vmlal.u32 q5, d4, d6\n\t" + "vmlal.u32 q6, d4, d7\n\t" + "vmlal.u32 q7, d4, d8\n\t" + "vmlal.u32 q8, d4, d9\n\t" + "vmlal.u32 q9, d4, d20\n\t" + "vadd.u64 d0, d10, d11\n\t" + "vadd.u64 d1, d12, d13\n\t" + "vadd.u64 d2, d14, d15\n\t" + "vadd.u64 d3, d16, d17\n\t" + "vadd.u64 d4, d18, d19\n\t" + "vsra.u64 d1, d0, #26\n\t" + "vand.u64 d0, d0, d31\n\t" + "vsra.u64 d2, d1, #26\n\t" + "vand.u64 d1, d1, d31\n\t" + "vsra.u64 d3, d2, #26\n\t" + "vand.u64 d2, d2, d31\n\t" + "vsra.u64 d4, d3, #26\n\t" + "vand.u64 d3, d3, d31\n\t" + "vshr.u64 d15, d4, #26\n\t" + "vand.u64 d4, d4, d31\n\t" + "vadd.u64 d0, d0, d15\n\t" + "vshl.u64 d15, d15, #2\n\t" + "vadd.u64 d0, d0, d15\n\t" + "vsra.u64 d1, d0, #26\n\t" + "vand.u64 d0, d0, d31\n\t" + "cmp %[bytes], #0x40\n\t" + "bge L_poly1305_arm32_blocks_start_4_%=\n\t" + "cmp %[bytes], #32\n\t" + "blt L_poly1305_arm32_blocks_done_neon_%=\n\t" + "\n" + "L_poly1305_arm32_blocks_start_2_%=: \n\t" + "sub %[bytes], 
#32\n\t" + "vld4.32 {d10-d13}, [%[m]]!\n\t" + "vshr.u32 d14, d13, #8\n\t" + "vshl.u32 d13, d13, #18\n\t" + "vorr.i32 d14, d14, #0x1000000\n\t" + "vsri.u32 d13, d12, #14\n\t" + "vshl.u32 d12, d12, #12\n\t" + "vand.i32 d13, d13, #0x3ffffff\n\t" + "vsri.u32 d12, d11, #20\n\t" + "vshl.u32 d11, d11, #6\n\t" + "vand.i32 d12, d12, #0x3ffffff\n\t" + "vsri.u32 d11, d10, #26\n\t" + "vand.i32 d10, d10, #0x3ffffff\n\t" + "vand.i32 d11, d11, #0x3ffffff\n\t" + "vadd.u32 d4, d4, d14\n\t" + "vadd.u32 q1, q1, q6\n\t" + "vadd.u32 q0, q0, q5\n\t" + "vmull.u32 q5, d0, d20\n\t" + "vmull.u32 q6, d0, d21\n\t" + "vmull.u32 q7, d0, d22\n\t" + "vmull.u32 q8, d0, d23\n\t" + "vmull.u32 q9, d0, d24\n\t" + "vmlal.u32 q5, d1, d9\n\t" + "vmlal.u32 q6, d1, d20\n\t" + "vmlal.u32 q7, d1, d21\n\t" + "vmlal.u32 q8, d1, d22\n\t" + "vmlal.u32 q9, d1, d23\n\t" + "vmlal.u32 q5, d2, d8\n\t" + "vmlal.u32 q6, d2, d9\n\t" + "vmlal.u32 q7, d2, d20\n\t" + "vmlal.u32 q8, d2, d21\n\t" + "vmlal.u32 q9, d2, d22\n\t" + "vmlal.u32 q5, d3, d7\n\t" + "vmlal.u32 q6, d3, d8\n\t" + "vmlal.u32 q7, d3, d9\n\t" + "vmlal.u32 q8, d3, d20\n\t" + "vmlal.u32 q9, d3, d21\n\t" + "vmlal.u32 q5, d4, d6\n\t" + "vmlal.u32 q6, d4, d7\n\t" + "vmlal.u32 q7, d4, d8\n\t" + "vmlal.u32 q8, d4, d9\n\t" + "vmlal.u32 q9, d4, d20\n\t" + "vadd.u64 d0, d10, d11\n\t" + "vadd.u64 d1, d12, d13\n\t" + "vadd.u64 d2, d14, d15\n\t" + "vadd.u64 d3, d16, d17\n\t" + "vadd.u64 d4, d18, d19\n\t" + "vsra.u64 d1, d0, #26\n\t" + "vand.u64 d0, d0, d31\n\t" + "vsra.u64 d2, d1, #26\n\t" + "vand.u64 d1, d1, d31\n\t" + "vsra.u64 d3, d2, #26\n\t" + "vand.u64 d2, d2, d31\n\t" + "vsra.u64 d4, d3, #26\n\t" + "vand.u64 d3, d3, d31\n\t" + "vshr.u64 d5, d4, #26\n\t" + "vand.u64 d4, d4, d31\n\t" + "vadd.u64 d0, d0, d5\n\t" + "vshl.u64 d5, d5, #2\n\t" + "vadd.u64 d0, d0, d5\n\t" + "vsra.u64 d1, d0, #26\n\t" + "vand.u64 d0, d0, d31\n\t" + "\n" + "L_poly1305_arm32_blocks_done_neon_%=: \n\t" + "cmp %[bytes], #16\n\t" + "beq L_poly1305_arm32_blocks_begin_1_%=\n\t" + "add r12, %[ctx], #16\n\t" + "vsli.u64 d0, d1, #26\n\t" + "vsli.u64 d0, d2, #52\n\t" + "vshr.u64 d1, d2, #12\n\t" + "vsli.u64 d1, d3, #14\n\t" + "vsli.u64 d1, d4, #40\n\t" + "vshr.u64 d2, d4, #24\n\t" + "vst1.64 {d0-d2}, [r12]\n\t" + "b L_poly1305_arm32_blocks_done_%=\n\t" + "\n" + "L_poly1305_arm32_blocks_begin_1_%=: \n\t" + "vsli.u64 d0, d1, #26\n\t" + "vsli.u64 d0, d2, #52\n\t" + "vshr.u64 d1, d2, #12\n\t" + "vsli.u64 d1, d3, #14\n\t" + "vsli.u64 d1, d4, #40\n\t" + "vshr.u64 d2, d4, #24\n\t" + "vmov r7, r8, d0\n\t" + "vmov r9, r10, d1\n\t" + "vmov r11, d2[0]\n\t" + "\n" + "L_poly1305_arm32_blocks_start_1_%=: \n\t" + "mov r12, #1\n\t" + "push {r2}\n\t" + /* Load message */ + "ldm %[m], {r2, r3, r4, r5}\n\t" + /* Add message */ + "adds r7, r7, %[bytes]\n\t" + "adcs r8, r8, r3\n\t" + "adcs r9, r9, r4\n\t" + "adcs r10, r10, r5\n\t" + "adc r11, r11, r12\n\t" + "push {r0-r1}\n\t" + "add %[m], %[ctx], #0\n\t" + "add lr, %[ctx], #16\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + "stm lr, {r7, r8, r9, r10, r11}\n\t" +#else + /* h[0]-h[2] in r4-r6 for multiplication. 
*/ + "str r10, [lr, #12]\n\t" + "str r11, [lr, #16]\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */ + "ldr r3, [%[m]]\n\t" + "eor %[ctx], %[ctx], %[ctx]\n\t" + /* r[0] * h[0] */ + /* h[0] in r4 */ + "umull r7, r8, r3, r7\n\t" + /* r[0] * h[2] */ + /* h[2] in r6 */ + "umull r9, r10, r3, r9\n\t" + /* r[0] * h[4] */ + /* h[4] in r8 */ + "mul r11, r3, r11\n\t" + /* r[0] * h[1] */ + "ldr %[bytes], [lr, #4]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r8, r12, r3, %[bytes]\n\t" + /* r[0] * h[3] */ + "ldr %[bytes], [lr, #12]\n\t" + "adds r9, r9, r12\n\t" + "adc r10, r10, %[ctx]\n\t" + "umlal r10, r11, r3, %[bytes]\n\t" + /* r[1] * h[0] */ + "ldr r3, [%[m], #4]\n\t" + "ldr %[bytes], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r8, r12, r3, %[bytes]\n\t" + /* r[1] * h[1] */ + "ldr %[bytes], [lr, #4]\n\t" + "adds r9, r9, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r9, r12, r3, %[bytes]\n\t" + /* r[1] * h[2] */ + "ldr %[bytes], [lr, #8]\n\t" + "adds r10, r10, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r10, r12, r3, %[bytes]\n\t" + /* r[1] * h[3] */ + "ldr %[bytes], [lr, #12]\n\t" + "adds r11, r11, r12\n\t" + "adc r4, %[ctx], %[ctx]\n\t" + "umlal r11, r4, r3, %[bytes]\n\t" + /* r[1] * h[4] */ + "ldr %[bytes], [lr, #16]\n\t" + "mla r4, r3, %[bytes], r4\n\t" + /* r[2] * h[0] */ + "ldr r3, [%[m], #8]\n\t" + "ldr %[bytes], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r9, r12, r3, %[bytes]\n\t" + /* r[2] * h[1] */ + "ldr %[bytes], [lr, #4]\n\t" + "adds r10, r10, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r10, r12, r3, %[bytes]\n\t" + /* r[2] * h[2] */ + "ldr %[bytes], [lr, #8]\n\t" + "adds r11, r11, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r11, r12, r3, %[bytes]\n\t" + /* r[2] * h[3] */ + "ldr %[bytes], [lr, #12]\n\t" + "adds r4, r4, r12\n\t" + "adc r5, %[ctx], %[ctx]\n\t" + "umlal r4, r5, r3, %[bytes]\n\t" + /* r[2] * h[4] */ + "ldr %[bytes], [lr, #16]\n\t" + "mla r5, r3, %[bytes], r5\n\t" + /* r[3] * h[0] */ + "ldr r3, [%[m], #12]\n\t" + "ldr %[bytes], [lr]\n\t" + "mov r12, %[ctx]\n\t" + "umlal r10, r12, r3, %[bytes]\n\t" + /* r[3] * h[1] */ + "ldr %[bytes], [lr, #4]\n\t" + "adds r11, r11, r12\n\t" + "adc r12, %[ctx], %[ctx]\n\t" + "umlal r11, r12, r3, %[bytes]\n\t" + /* r[3] * h[2] */ + "ldr %[bytes], [lr, #8]\n\t" + "adds r4, r4, r12\n\t" + "adc r5, r5, %[ctx]\n\t" + "umlal r4, r5, r3, %[bytes]\n\t" + /* r[3] * h[3] */ + "ldr %[bytes], [lr, #12]\n\t" + "mov r6, %[ctx]\n\t" + "umlal r5, r6, r3, %[bytes]\n\t" + /* r[3] * h[4] */ + "ldr %[bytes], [lr, #16]\n\t" + "mov r12, %[ctx]\n\t" + "mla r6, r3, %[bytes], r6\n\t" +#else + "sub sp, sp, #12\n\t" + "ldm %[m], {r0, r1, r2, r3}\n\t" + /* r[0] * h[0] */ + "umull r5, r6, %[ctx], r7\n\t" + /* r[1] * h[0] */ + "umull r12, r10, %[m], r7\n\t" + /* r[0] * h[1] */ + "umaal r6, r12, %[ctx], r8\n\t" + /* r[2] * h[0] */ + "umull r11, r4, %[bytes], r7\n\t" + /* r[1] * h[1] */ + "umaal r12, r11, %[m], r8\n\t" + /* r[0] * h[2] */ + "umaal r12, r10, %[ctx], r9\n\t" + /* r[3] * h[0] */ + "umaal r11, r4, r3, r7\n\t" + "stm sp, {r5, r6, r12}\n\t" + /* r[2] * h[1] */ + "umaal r10, r11, %[bytes], r8\n\t" + /* Replace h[0] with h[3] */ + "ldr r7, [lr, #12]\n\t" + /* r[1] * h[2] */ + "umull r5, r6, %[m], r9\n\t" + /* r[2] * h[2] */ + "umaal r11, r4, %[bytes], r9\n\t" + /* r[0] * h[3] */ + "umaal r10, r5, %[ctx], r7\n\t" + /* r[3] * h[1] */ + "umaal r11, r6, r3, r8\n\t" + /* r[1] * h[3] */ + "umaal r11, r5, %[m], r7\n\t" + /* r[3] * h[2] */ 
+ "umaal r4, r6, r3, r9\n\t" + /* r[2] * h[3] */ + "umaal r4, r5, %[bytes], r7\n\t" + /* Replace h[1] with h[4] */ + "ldr r8, [lr, #16]\n\t" + /* r[3] * h[3] */ + "umaal r5, r6, r3, r7\n\t" + "mov r12, #0\n\t" + /* r[0] * h[4] */ + "umaal r11, r12, %[ctx], r8\n\t" + /* r[1] * h[4] */ + "umaal r4, r12, %[m], r8\n\t" + /* r[2] * h[4] */ + "umaal r5, r12, %[bytes], r8\n\t" + /* r[3] * h[4] */ + "umaal r6, r12, r3, r8\n\t" + /* DONE */ + "ldm sp, {r7, r8, r9}\n\t" + "add sp, sp, #12\n\t" +#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */ + /* Reduce mod 2^130 - 5 */ + "bic r3, r11, #0x3\n\t" + "and r11, r11, #3\n\t" + "adds r7, r7, r3\n\t" + "lsr r3, r3, #2\n\t" + "adcs r8, r8, r4\n\t" + "orr r3, r3, r4, LSL #30\n\t" + "adcs r9, r9, r5\n\t" + "lsr r4, r4, #2\n\t" + "adcs r10, r10, r6\n\t" + "orr r4, r4, r5, LSL #30\n\t" + "adc r11, r11, r12\n\t" + "lsr r5, r5, #2\n\t" + "adds r7, r7, r3\n\t" + "orr r5, r5, r6, LSL #30\n\t" + "adcs r8, r8, r4\n\t" + "lsr r6, r6, #2\n\t" + "adcs r9, r9, r5\n\t" + "adcs r10, r10, r6\n\t" + "adc r11, r11, r12\n\t" + "pop {r0-r1}\n\t" + "pop {r2}\n\t" + "add r12, %[ctx], #16\n\t" + "stm r12, {r7, r8, r9, r10, r11}\n\t" + "\n" + "L_poly1305_arm32_blocks_done_%=: \n\t" + : [ctx] "+r" (ctx), [m] "+r" (m), [bytes] "+r" (bytes) + : + : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", + "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", + "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", + "d28", "d29", "d30", "d31" + ); +} + +static const word32 L_poly1305_arm32_clamp[] = { + 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, +}; + +void poly1305_set_key(Poly1305* ctx_p, const byte* key_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register const byte* key asm ("r1") = (const byte*)key_p; + register word32* L_poly1305_arm32_clamp_c asm ("r2") = + (word32*)&L_poly1305_arm32_clamp; + + __asm__ __volatile__ ( + /* Load mask. */ + "mov lr, %[L_poly1305_arm32_clamp]\n\t" + "ldm lr, {r6, r7, r8, r9}\n\t" + /* Load and cache padding. */ + "ldr r2, [%[key], #16]\n\t" + "ldr r3, [%[key], #20]\n\t" + "ldr r4, [%[key], #24]\n\t" + "ldr r5, [%[key], #28]\n\t" + "add lr, %[ctx], #40\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + /* Load, mask and store r. 
*/ + "ldr r2, [%[key]]\n\t" + "ldr r3, [%[key], #4]\n\t" + "ldr r4, [%[key], #8]\n\t" + "ldr r5, [%[key], #12]\n\t" + "and r2, r2, r6\n\t" + "and r3, r3, r7\n\t" + "and r4, r4, r8\n\t" + "and r5, r5, r9\n\t" + "add lr, %[ctx], #0\n\t" + "stm lr, {r2, r3, r4, r5}\n\t" + "vmov.i16 q10, #0xffff\n\t" + "vshr.u64 q10, q10, #38\n\t" + "lsr r8, r2, #26\n\t" + "lsr r9, r3, #20\n\t" + "lsr r10, r4, #14\n\t" + "lsr r11, r5, #8\n\t" + "eor r8, r8, r3, lsl #6\n\t" + "eor r9, r9, r4, lsl #12\n\t" + "eor r10, r10, r5, lsl #18\n\t" + "and r7, r2, #0x3ffffff\n\t" + "and r8, r8, #0x3ffffff\n\t" + "and r9, r9, #0x3ffffff\n\t" + "and r10, r10, #0x3ffffff\n\t" + "vmov.i32 s1, r7\n\t" + "vmov.i32 s3, r8\n\t" + "vmov.i32 s5, r9\n\t" + "vmov.i32 s7, r10\n\t" + "vmov.i32 s9, r11\n\t" + "push {%[ctx]-%[key]}\n\t" +#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) + /* Square r */ + "umull %[key], r6, r2, r3\n\t" + "mov r12, #0\n\t" + "umull r7, r8, r2, r5\n\t" + "mov lr, r12\n\t" + "umlal r6, lr, r2, r4\n\t" + "adds r7, r7, lr\n\t" + "adc lr, r12, r12\n\t" + "umlal r7, lr, r3, r4\n\t" + "mov r9, r12\n\t" + "umlal lr, r9, r3, r5\n\t" + "adds r8, r8, lr\n\t" + "adcs r9, r9, r12\n\t" + "adc r10, r12, r12\n\t" + "umlal r9, r10, r4, r5\n\t" + "adds %[key], %[key], %[key]\n\t" + "adcs r6, r6, r6\n\t" + "adcs r7, r7, r7\n\t" + "adcs r8, r8, r8\n\t" + "adcs r9, r9, r9\n\t" + "adcs r10, r10, r10\n\t" + "adc r11, r12, r12\n\t" + "umull %[ctx], lr, r2, r2\n\t" + "adds %[key], %[key], lr\n\t" + "adcs r6, r6, r12\n\t" + "adc lr, r12, r12\n\t" + "umlal r6, lr, r3, r3\n\t" + "adds r7, r7, lr\n\t" + "adcs r8, r8, r12\n\t" + "adc lr, r12, r12\n\t" + "umlal r8, lr, r4, r4\n\t" + "adds r9, r9, lr\n\t" + "adcs r10, r10, r12\n\t" + "adc r11, r11, r12\n\t" + "umlal r10, r11, r5, r5\n\t" +#else + "umull %[ctx], %[key], r2, r2\n\t" + "umull r6, r7, r2, r3\n\t" + "adds r6, r6, r6\n\t" + "mov r12, #0\n\t" + "umaal %[key], r6, r12, r12\n\t" + "mov r8, r12\n\t" + "umaal r8, r7, r2, r4\n\t" + "adcs r8, r8, r8\n\t" + "umaal r6, r8, r3, r3\n\t" + "umull r9, r10, r2, r5\n\t" + "umaal r7, r9, r3, r4\n\t" + "adcs r7, r7, r7\n\t" + "umaal r7, r8, r12, r12\n\t" + "umaal r10, r9, r3, r5\n\t" + "adcs r10, r10, r10\n\t" + "umaal r8, r10, r4, r4\n\t" + "mov r11, r12\n\t" + "umaal r9, r11, r4, r5\n\t" + "adcs r9, r9, r9\n\t" + "umaal r9, r10, r12, r12\n\t" + "adcs r11, r11, r11\n\t" + "umaal r10, r11, r5, r5\n\t" + "adc r11, r11, r12\n\t" +#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */ + /* Reduce mod 2^130 - 5 */ + "bic r2, r8, #0x3\n\t" + "and r8, r8, #3\n\t" + "adds %[ctx], %[ctx], r2\n\t" + "lsr r2, r2, #2\n\t" + "adcs %[key], %[key], r9\n\t" + "orr r2, r2, r9, LSL #30\n\t" + "adcs r6, r6, r10\n\t" + "lsr r9, r9, #2\n\t" + "adcs r7, r7, r11\n\t" + "orr r9, r9, r10, LSL #30\n\t" + "adc r8, r8, r12\n\t" + "lsr r10, r10, #2\n\t" + "adds %[ctx], %[ctx], r2\n\t" + "orr r10, r10, r11, LSL #30\n\t" + "adcs %[key], %[key], r9\n\t" + "lsr r11, r11, #2\n\t" + "adcs r6, r6, r10\n\t" + "adcs r7, r7, r11\n\t" + "adc r8, r8, r12\n\t" + "lsr r3, %[ctx], #26\n\t" + "lsr r4, %[key], #20\n\t" + "lsr r5, r6, #14\n\t" + "lsr r10, r7, #8\n\t" + "eor r3, r3, %[key], lsl #6\n\t" + "eor r4, r4, r6, lsl #12\n\t" + "eor r5, r5, r7, lsl #18\n\t" + "eor r10, r10, r8, lsl #24\n\t" + "and r2, %[ctx], #0x3ffffff\n\t" + "and r3, r3, #0x3ffffff\n\t" + "and r4, r4, #0x3ffffff\n\t" + "and r5, r5, #0x3ffffff\n\t" + "vmov.i32 s0, r2\n\t" + "vmov.i32 s2, r3\n\t" + "vmov.i32 s4, r4\n\t" + "vmov.i32 s6, r5\n\t" + "vmov.i32 s8, r10\n\t" + "pop {%[ctx]-%[key]}\n\t" + 
"add lr, %[ctx], #0x7c\n\t" + "vstm.32 lr, {d0-d4}\n\t" + /* Multiply r^2, r by r^2 */ + "vshl.u32 d6, d1, #2\n\t" + "vshl.u32 d7, d2, #2\n\t" + "vshl.u32 d8, d3, #2\n\t" + "vshl.u32 d9, d4, #2\n\t" + "vadd.u32 d6, d6, d1\n\t" + "vadd.u32 d7, d7, d2\n\t" + "vadd.u32 d8, d8, d3\n\t" + "vadd.u32 d9, d9, d4\n\t" + "vmull.u32 q5, d0, d0[0]\n\t" + "vmull.u32 q6, d0, d1[0]\n\t" + "vmull.u32 q7, d0, d2[0]\n\t" + "vmull.u32 q8, d0, d3[0]\n\t" + "vmull.u32 q9, d0, d4[0]\n\t" + "vmlal.u32 q5, d1, d9[0]\n\t" + "vmlal.u32 q6, d1, d0[0]\n\t" + "vmlal.u32 q7, d1, d1[0]\n\t" + "vmlal.u32 q8, d1, d2[0]\n\t" + "vmlal.u32 q9, d1, d3[0]\n\t" + "vmlal.u32 q5, d2, d8[0]\n\t" + "vmlal.u32 q6, d2, d9[0]\n\t" + "vmlal.u32 q7, d2, d0[0]\n\t" + "vmlal.u32 q8, d2, d1[0]\n\t" + "vmlal.u32 q9, d2, d2[0]\n\t" + "vmlal.u32 q5, d3, d7[0]\n\t" + "vmlal.u32 q6, d3, d8[0]\n\t" + "vmlal.u32 q7, d3, d9[0]\n\t" + "vmlal.u32 q8, d3, d0[0]\n\t" + "vmlal.u32 q9, d3, d1[0]\n\t" + "vmlal.u32 q5, d4, d6[0]\n\t" + "vmlal.u32 q6, d4, d7[0]\n\t" + "vmlal.u32 q7, d4, d8[0]\n\t" + "vmlal.u32 q8, d4, d9[0]\n\t" + "vmlal.u32 q9, d4, d0[0]\n\t" + "vsra.u64 q6, q5, #26\n\t" + "vand.u64 q5, q5, q10\n\t" + "vsra.u64 q7, q6, #26\n\t" + "vand.u64 q6, q6, q10\n\t" + "vsra.u64 q8, q7, #26\n\t" + "vand.u64 q7, q7, q10\n\t" + "vsra.u64 q9, q8, #26\n\t" + "vand.u64 q8, q8, q10\n\t" + "vshr.u64 q3, q9, #26\n\t" + "vand.u64 q9, q9, q10\n\t" + "vadd.u64 q5, q5, q3\n\t" + "vshl.u64 q3, q3, #2\n\t" + "vadd.u64 q5, q5, q3\n\t" + "vsra.u64 q6, q5, #26\n\t" + "vand.u64 q5, q5, q10\n\t" + "vmovn.i64 d10, q5\n\t" + "vmovn.i64 d11, q6\n\t" + "vmovn.i64 d12, q7\n\t" + "vmovn.i64 d13, q8\n\t" + "vmovn.i64 d14, q9\n\t" + "add lr, %[ctx], #0xa4\n\t" + "vstm.32 lr, {d10-d14}\n\t" + /* h (accumulator) = 0 */ + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "eor r9, r9, r9\n\t" + "add lr, %[ctx], #16\n\t" + "eor r4, r4, r4\n\t" + "eor r5, r5, r5\n\t" + "stm lr, {r4, r5, r6, r7, r8, r9}\n\t" + /* Zero leftover */ + "str r5, [%[ctx], #56]\n\t" + : [ctx] "+r" (ctx), [key] "+r" (key), + [L_poly1305_arm32_clamp] "+r" (L_poly1305_arm32_clamp_c) + : + : "memory", "cc", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", + "r10", "r11", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", + "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", + "d19", "d20", "d21" + ); +} + +void poly1305_final(Poly1305* ctx_p, byte* mac_p) +{ + register Poly1305* ctx asm ("r0") = (Poly1305*)ctx_p; + register byte* mac asm ("r1") = (byte*)mac_p; + + __asm__ __volatile__ ( + "add r9, %[ctx], #16\n\t" + "ldm r9, {r4, r5, r6, r7, r8}\n\t" + /* Add 5 and check for h larger than p. */ + "adds r2, r4, #5\n\t" + "adcs r2, r5, #0\n\t" + "adcs r2, r6, #0\n\t" + "adcs r2, r7, #0\n\t" + "adc r2, r8, #0\n\t" + "sub r2, r2, #4\n\t" + "lsr r2, r2, #31\n\t" + "sub r2, r2, #1\n\t" + "and r2, r2, #5\n\t" + /* Add 0/5 to h. */ + "adds r4, r4, r2\n\t" + "adcs r5, r5, #0\n\t" + "adcs r6, r6, #0\n\t" + "adc r7, r7, #0\n\t" + /* Add padding */ + "add r9, %[ctx], #40\n\t" + "ldm r9, {r2, r3, r12, lr}\n\t" + "adds r4, r4, r2\n\t" + "adcs r5, r5, r3\n\t" + "adcs r6, r6, r12\n\t" + "adc r7, r7, lr\n\t" + /* Store MAC */ + "str r4, [%[mac]]\n\t" + "str r5, [%[mac], #4]\n\t" + "str r6, [%[mac], #8]\n\t" + "str r7, [%[mac], #12]\n\t" + /* Zero out h. */ + "eor r4, r4, r4\n\t" + "eor r5, r5, r5\n\t" + "eor r6, r6, r6\n\t" + "eor r7, r7, r7\n\t" + "eor r8, r8, r8\n\t" + "add r9, %[ctx], #16\n\t" + "stm r9, {r4, r5, r6, r7, r8}\n\t" + /* Zero out r. 
*/ + "add r9, %[ctx], #0\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + /* Zero out padding. */ + "add r9, %[ctx], #40\n\t" + "stm r9, {r4, r5, r6, r7}\n\t" + : [ctx] "+r" (ctx), [mac] "+r" (mac) + : + : "memory", "cc", "r2", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", + "r9" + ); +} + +#endif /* WOLFSSL_ARMASM_NO_NEON */ #endif /* HAVE_POLY1305 */ #endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */ #endif /* WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-poly1305.c b/wolfcrypt/src/port/arm/armv8-poly1305.c index fc0c39e638..b3d9df0bc0 100644 --- a/wolfcrypt/src/port/arm/armv8-poly1305.c +++ b/wolfcrypt/src/port/arm/armv8-poly1305.c @@ -1150,7 +1150,11 @@ void poly1305_block_thumb2(Poly1305* ctx, const unsigned char* m) */ void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes) { - poly1305_blocks_arm32_16(ctx, m, bytes, 1); +#ifndef WOLFSSL_ARMASM_NO_NEON + poly1305_arm32_blocks(ctx, m, bytes); +#else + poly1305_arm32_blocks_16(ctx, m, bytes, 1); +#endif } /* Process 16 bytes of message. @@ -1160,7 +1164,7 @@ void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char* m, size_t bytes) */ void poly1305_block_arm32(Poly1305* ctx, const unsigned char* m) { - poly1305_blocks_arm32_16(ctx, m, POLY1305_BLOCK_SIZE, 1); + poly1305_arm32_blocks_16(ctx, m, POLY1305_BLOCK_SIZE, 1); } #endif @@ -1219,6 +1223,16 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) /* Process the remaining partial block - last block. */ if (ret == 0) { + #if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) + if (ctx->leftover >= POLY1305_BLOCK_SIZE) { + size_t len = ctx->leftover & (~(POLY1305_BLOCK_SIZE - 1)); + poly1305_arm32_blocks(ctx, ctx->buffer, len); + ctx->leftover -= len; + if (ctx->leftover) { + XMEMCPY(ctx->buffer, ctx->buffer + len, ctx->leftover); + } + } + #endif if (ctx->leftover) { size_t i = ctx->leftover; ctx->buffer[i++] = 1; @@ -1229,7 +1243,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac) poly1305_blocks_thumb2_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0); #else - poly1305_blocks_arm32_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0); + poly1305_arm32_blocks_16(ctx, ctx->buffer, POLY1305_BLOCK_SIZE, 0); #endif } diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 9a1d501d85..4d1afc02ca 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -8201,6 +8201,31 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t poly1305_test(void) return WC_TEST_RET_ENC_I(i); } + /* Testing multiple updates with various sizes works. 
*/ + for (i = 1; i < (int)sizeof(msg6); i++) { + int j; + + ret = wc_Poly1305SetKey(&enc, key, 32); + if (ret != 0) + return WC_TEST_RET_ENC_I(i); + + for (j = 0; j < (int)sizeof(msg6); j += i) { + int len = (int)sizeof(msg6) - j; + if (len > i) + len = i; + ret = wc_Poly1305Update(&enc, msg6 + j, len); + if (ret != 0) + return WC_TEST_RET_ENC_I(j); + } + + ret = wc_Poly1305Final(&enc, tag); + if (ret != 0) + return WC_TEST_RET_ENC_I(i); + + if (XMEMCMP(tag, correct6, sizeof(tag))) + return WC_TEST_RET_ENC_I(i); + } + /* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */ XMEMSET(tag, 0, sizeof(tag)); ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4)); diff --git a/wolfssl/wolfcrypt/poly1305.h b/wolfssl/wolfcrypt/poly1305.h index d4db48762e..dfb349b777 100644 --- a/wolfssl/wolfcrypt/poly1305.h +++ b/wolfssl/wolfcrypt/poly1305.h @@ -100,10 +100,20 @@ typedef struct Poly1305 { unsigned char finished; #elif defined(WOLFSSL_ARMASM) word32 r[4]; +#if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) + word32 h[6]; +#else word32 h[5]; +#endif word32 pad[4]; word32 leftover; +#if !defined(WOLFSSL_ARMASM_THUMB2) && !defined(WOLFSSL_ARMASM_NO_NEON) + unsigned char buffer[4*POLY1305_BLOCK_SIZE]; + word32 r_21[10]; + word32 r_43[10]; +#else unsigned char buffer[POLY1305_BLOCK_SIZE]; +#endif #elif defined(WOLFSSL_RISCV_ASM) word64 r[2]; #ifdef WOLFSSL_RISCV_VECTOR @@ -173,7 +183,8 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const unsigned char* m, void poly1305_blocks_arm32(Poly1305* ctx, const unsigned char *m, size_t bytes); void poly1305_block_arm32(Poly1305* ctx, const unsigned char *m); -void poly1305_blocks_arm32_16(Poly1305* ctx, const unsigned char* m, word32 len, +void poly1305_arm32_blocks(Poly1305* ctx, const unsigned char* m, word32 len); +void poly1305_arm32_blocks_16(Poly1305* ctx, const unsigned char* m, word32 len, int notLast); #endif void poly1305_set_key(Poly1305* ctx, const byte* key);
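The new test above checks that feeding msg6 in pieces of every size produces the same tag (correct6) as a single update. The same property, written as a self-contained usage sketch against the public API (32-byte key and 16-byte tag as in the test; poly1305_split_matches is an illustrative helper, not part of wolfSSL):

#include <string.h>
#include <wolfssl/wolfcrypt/poly1305.h>

/* Returns 1 when the chunked MAC matches the one-shot MAC, 0 otherwise. */
static int poly1305_split_matches(const byte* key, const byte* msg,
                                  word32 msgSz, word32 chunkSz)
{
    Poly1305 ctx;
    byte tagOne[16], tagSplit[16];
    word32 i;
    int ret;

    if (chunkSz == 0)
        return 0;

    /* One-shot MAC. */
    ret = wc_Poly1305SetKey(&ctx, key, 32);
    if (ret == 0)
        ret = wc_Poly1305Update(&ctx, msg, msgSz);
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, tagOne);

    /* Same message fed chunkSz bytes at a time. */
    if (ret == 0)
        ret = wc_Poly1305SetKey(&ctx, key, 32);
    for (i = 0; ret == 0 && i < msgSz; i += chunkSz) {
        word32 len = msgSz - i;
        if (len > chunkSz)
            len = chunkSz;
        ret = wc_Poly1305Update(&ctx, msg + i, len);
    }
    if (ret == 0)
        ret = wc_Poly1305Final(&ctx, tagSplit);

    return (ret == 0) && (memcmp(tagOne, tagSplit, 16) == 0);
}

With the ARM32 NEON build this works because the context now buffers up to four blocks (buffer[4*POLY1305_BLOCK_SIZE] in poly1305.h above), so short updates are coalesced before poly1305_arm32_blocks is invoked.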