From 9cae4efd07d6b3482a657ea0eb1f96e9e23b68a2 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 9 Sep 2024 14:06:51 +0100
Subject: [PATCH] [Compiler-rt] Add AArch64 routines for __arm_agnostic("sme_za_state")

The specification of these routines can be found here:
https://github.com/ARM-software/abi-aa/pull/264
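For illustration, the expected usage pattern is roughly the following sketch
(not part of this patch; the prototypes, `opaque_call` and
`agnostic_za_function` are assumptions made for the example, the compiler
normally emits these calls itself when lowering
__arm_agnostic("sme_za_state"), and the buffer would usually be a stack
allocation rather than malloc):

    #include <stdint.h>
    #include <stdlib.h>

    /* Assumed C prototypes; the routines pass/return their values in x0. */
    uint64_t __arm_sme_state_size(void);
    void __arm_sme_save(void *blk);
    void __arm_sme_restore(void *blk);

    extern void opaque_call(void); /* hypothetical callee, may use ZA/ZT0 */

    void agnostic_za_function(void) {
      /* The size depends on the streaming vector length and on whether
         FEAT_SME2 (ZT0) is implemented, so it is only known at run time. */
      void *blk = malloc(__arm_sme_state_size());
      __arm_sme_save(blk);    /* record ZA/ZT0 state, set up a lazy-save */
      opaque_call();
      __arm_sme_restore(blk); /* bring the saved ZA/ZT0 state back */
      free(blk);
    }

Note that the routines use a variant PCS (they preserve most registers), so
calling them directly from C as above is a simplification of what the
compiler actually emits.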
---
 compiler-rt/cmake/builtin-config-ix.cmake  |   3 +-
 .../lib/builtins/aarch64/sme-abi-init.c    |   5 +
 compiler-rt/lib/builtins/aarch64/sme-abi.S | 150 +++++++++++++++++-
 3 files changed, 155 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1f63e158409ca4..706a1ff7eeb6db 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
 
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(void) __arm_streaming_compatible {
-  asm(\".arch armv9-a+sme\");
+  asm(\".arch armv9-a+sme2\");
   asm(\"smstart\");
+  asm(\"ldr zt0, [sp]\");
 }
 ")
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c
index d3cd8278a5d214..ff11e1be5e8465 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c
@@ -48,3 +48,8 @@
 __attribute__((constructor(90))) static void init_aarch64_has_sme(void) {
   __aarch64_has_sme_and_tpidr2_el0 = has_sme();
 }
+
+#include "../cpu_model/AArch64CPUFeatures.inc"
+_Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
+_Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
+_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 3e9bd2c23b2fc0..404bc5e7d89af7 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -8,6 +8,10 @@
 
 #include "../assembly.h"
 
+#define FEAT_SVE_BIT 30
+#define FEAT_SME_BIT 42
+#define FEAT_SME2_BIT 57
+#define FEAT_SME2_MASK 0x200000000000000
 #if !defined(__APPLE__)
 #define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
 #define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)
@@ -23,7 +27,7 @@ #define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)
 #define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
-.arch armv9-a+sme
+.arch armv9-a+sme2
 
 // Utility function which calls a system's abort() routine. Because the function
 // is streaming-compatible it should disable streaming-SVE mode before calling
@@ -196,7 +200,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
   .cfi_offset w29, -16
   adrp x17, CPU_FEATS_SYMBOL
   ldr w17, [x17, CPU_FEATS_SYMBOL_OFFSET]
-  tbnz w17, #30, 0f
+  tbnz w17, #FEAT_SVE_BIT, 0f
   adrp x16, TPIDR2_SYMBOL
   ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET]
   cbz w16, 1f
@@ -224,6 +228,148 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
   ret
 END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
 
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state_size)
+  .variant_pcs __arm_sme_state_size
+  BTI_C
+
+  // Test if SME is available and PSTATE.ZA = 1.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 0f
+  mrs x16, SVCR
+  tbz x16, #1, 0f
+
+  // Size = HAS_FEAT_SME2 ? 96 : 32
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tst x16, #FEAT_SME2_MASK
+  mov w17, #32
+  mov w16, #96
+  csel x16, x17, x16, eq
+
+  // Size = Size + (SVL.B * SVL.B)
+  rdsvl x17, #1
+  madd x0, x17, x17, x16
+  ret
+
+0:
+  // Default case: 16 bytes is the minimum (encodes the VALID bit; sizes are multiples of 16 bytes)
+  mov w0, #16
+  ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_save)
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  // Clear internal state bits
+  stp xzr, xzr, [x0]
+
+  // Return if SME is not available, PSTATE.ZA = 0, or TPIDR2_EL0 != 0.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 0f
+  mrs x16, SVCR
+  tbz x16, #1, 0f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
+
+  // ZA or ZT0 needs saving; we can now set the internal VALID bit to 1.
+  mov w16, #1
+  str x16, [x0]
+
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME2_BIT, 2f
+
+  // Store ZT0 and ZA
+  add x16, x0, #32
+  str zt0, [x16]
+  add x18, x0, #96
+  b 3f
+
+2:
+  // Has SME only
+  add x18, x0, #32
+
+3:
+  // Set up lazy-save (x18 = pointer to buffer)
+  rdsvl x17, #1
+  str x18, [x0, #16]!
+  strh w17, [x0, #8]
+  stur wzr, [x0, #10]
+  strh wzr, [x0, #14]
+  msr TPIDR2_EL0, x0
+  ret
+
+0:
+  // Nothing to save: the internal state bits were already cleared above.
+  ret
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_restore)
+  .variant_pcs __arm_sme_restore
+  BTI_C
+
+  stp x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+
+  // If the VALID bit is 0, return early.
+  ldr x16, [x0]
+  tbz x16, #0, 2f
+
+  // If SME is not available, abort.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME_BIT, 3f
+
+  // If TPIDR2_EL0 != nullptr, the lazy-save was not committed; skip to reloading ZT0.
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
+
+  // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
+  // abort.
+  mrs x16, SVCR
+  tbnz x16, #1, 3f
+
+  // Restore ZA from the lazy-save buffer.
+  smstart za
+  mov x16, x0
+  add x0, x0, #16
+  bl __arm_tpidr2_restore
+  mov x0, x16
+  msr TPIDR2_EL0, xzr
+
+0:
+  smstart za
+
+1:
+  // Check if ZT0 needs restoring.
+  adrp x16, CPU_FEATS_SYMBOL
+  ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x16, #FEAT_SME2_BIT, 2f
+
+  // Restore ZT0.
+  add x16, x0, #32
+  ldr zt0, [x16]
+
+2:
+  // Nothing (more) to restore; unwind the frame and return.
+  .cfi_def_cfa wsp, 16
+  ldp x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_restore)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC