From d240aa2db5e3ea12e22a4f3f5d4dcaa3162c4e73 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 24 Nov 2024 19:18:11 +0000
Subject: [PATCH] Fix: Avoid AVX-512BW on Skylake-X

The previous AVX-512 implementations of complex products used an extra
ZMM register for the `swap_adjacent_vec` shuffle mask. Moreover, they
relied on `vpshufb` (`_mm512_shuffle_epi8`), an AVX-512BW instruction
that this Skylake-X code path is meant to avoid. The replacement uses
`_mm512_permute_ps` and its double-precision variant
`_mm512_permute_pd`, which require only AVX-512F.
---
Both permute immediates are sanity-checked in the standalone sketches
after the diff.

 include/simsimd/dot.h | 56 +++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 37 deletions(-)

diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h
index 21445b50..7a005840 100644
--- a/include/simsimd/dot.h
+++ b/include/simsimd/dot.h
@@ -1332,13 +1332,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32_t const *a, simsimd_f32
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
-        59, 58, 57, 56, 63, 62, 61, 60, 51, 50, 49, 48, 55, 54, 53, 52, // 4th 128-bit lane
-        43, 42, 41, 40, 47, 46, 45, 44, 35, 34, 33, 32, 39, 38, 37, 36, // 3rd 128-bit lane
-        27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, // 2nd 128-bit lane
-        11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 // 1st 128-bit lane
-    );
+    __m512i const sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
 simsimd_dot_f32c_skylake_cycle:
     if (n < 16) {
         __mmask16 mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -1352,8 +1346,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_skylake(simsimd_f32_t const *a, simsimd_f32
         a += 16, b += 16, n -= 16;
     }
     ab_real_vec = _mm512_fmadd_ps(b_vec, a_vec, ab_real_vec);
-    ab_imag_vec = _mm512_fmadd_ps(
-        _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b_vec), swap_adjacent_vec)), a_vec, ab_imag_vec);
+    b_vec = _mm512_permute_ps(b_vec, 0xB1); //? Swap adjacent entries within each pair
+    ab_imag_vec = _mm512_fmadd_ps(b_vec, a_vec, ab_imag_vec);
     if (n) goto simsimd_dot_f32c_skylake_cycle;

     // Flip the sign bit in every second scalar before accumulation:
@@ -1376,8 +1370,8 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32_t const *a, simsimd_f3
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
+    __m512i const sign_flip_vec = _mm512_set1_epi64(0x8000000000000000);
+    __m512i const swap_adjacent_vec = _mm512_set_epi8( //
         59, 58, 57, 56, 63, 62, 61, 60, 51, 50, 49, 48, 55, 54, 53, 52, // 4th 128-bit lane
         43, 42, 41, 40, 47, 46, 45, 44, 35, 34, 33, 32, 39, 38, 37, 36, // 3rd 128-bit lane
         27, 26, 25, 24, 31, 30, 29, 28, 19, 18, 17, 16, 23, 22, 21, 20, // 2nd 128-bit lane
         11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 // 1st 128-bit lane
@@ -1396,7 +1390,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_skylake(simsimd_f32_t const *a, simsimd_f3
         a += 16, b += 16, n -= 16;
     }
     ab_real_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_real_vec);
-    b_vec = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b_vec), swap_adjacent_vec));
+    b_vec = _mm512_permute_ps(b_vec, 0xB1); //? Swap adjacent entries within each pair
     ab_imag_vec = _mm512_fmadd_ps(a_vec, b_vec, ab_imag_vec);
     if (n) goto simsimd_vdot_f32c_skylake_cycle;

@@ -1420,16 +1414,10 @@ SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64_t const *a, simsimd_f64
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set_epi64( //
+    __m512i const sign_flip_vec = _mm512_set_epi64( //
         0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
         0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
     );
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
-        55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56, // 4th 128-bit lane
-        39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, // 3rd 128-bit lane
-        23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, // 2nd 128-bit lane
-        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 // 1st 128-bit lane
-    );
 simsimd_dot_f64c_skylake_cycle:
     if (n < 8) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
@@ -1443,8 +1431,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f64c_skylake(simsimd_f64_t const *a, simsimd_f64
         a += 8, b += 8, n -= 8;
     }
     ab_real_vec = _mm512_fmadd_pd(b_vec, a_vec, ab_real_vec);
-    ab_imag_vec = _mm512_fmadd_pd(
-        _mm512_castsi512_pd(_mm512_shuffle_epi8(_mm512_castpd_si512(b_vec), swap_adjacent_vec)), a_vec, ab_imag_vec);
+    b_vec = _mm512_permute_pd(b_vec, 0x55); //? Same as 0b01010101.
+    ab_imag_vec = _mm512_fmadd_pd(b_vec, a_vec, ab_imag_vec);
     if (n) goto simsimd_dot_f64c_skylake_cycle;

     // Flip the sign bit in every second scalar before accumulation:
@@ -1467,16 +1455,10 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64_t const *a, simsimd_f6
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set_epi64( //
+    __m512i const sign_flip_vec = _mm512_set_epi64( //
         0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000, //
         0x8000000000000000, 0x0000000000000000, 0x8000000000000000, 0x0000000000000000 //
     );
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
-        55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56, // 4th 128-bit lane
-        39, 38, 37, 36, 35, 34, 33, 32, 47, 46, 45, 44, 43, 42, 41, 40, // 3rd 128-bit lane
-        23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24, // 2nd 128-bit lane
-        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 // 1st 128-bit lane
-    );
 simsimd_vdot_f64c_skylake_cycle:
     if (n < 8) {
         __mmask8 mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
@@ -1490,7 +1472,7 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_skylake(simsimd_f64_t const *a, simsimd_f6
         a += 8, b += 8, n -= 8;
     }
     ab_real_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_real_vec);
-    b_vec = _mm512_castsi512_pd(_mm512_shuffle_epi8(_mm512_castpd_si512(b_vec), swap_adjacent_vec));
+    b_vec = _mm512_permute_pd(b_vec, 0x55); //? Same as 0b01010101.
     ab_imag_vec = _mm512_fmadd_pd(a_vec, b_vec, ab_imag_vec);
     if (n) goto simsimd_vdot_f64c_skylake_cycle;

@@ -1547,8 +1529,8 @@ SIMSIMD_PUBLIC void simsimd_dot_bf16c_genoa(simsimd_bf16_t const *a, simsimd_bf1
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi32(0x80000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
+    __m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
+    __m512i const swap_adjacent_vec = _mm512_set_epi8( //
         61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
         45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
         29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
@@ -1589,8 +1571,8 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16_t const *a, simsimd_bf
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi32(0x80000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
+    __m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
+    __m512i const swap_adjacent_vec = _mm512_set_epi8( //
         61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
         45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
         29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
@@ -1665,8 +1647,8 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_sapphire(simsimd_f16_t const *a, simsimd_f1
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi32(0x80000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
+    __m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
+    __m512i const swap_adjacent_vec = _mm512_set_epi8( //
         61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
         45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
         29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
@@ -1710,8 +1692,8 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sapphire(simsimd_f16_t const *a, simsimd_f
     // This way we can avoid the shuffling and the need for separate real and imaginary parts.
     // For the imaginary part of the product, we would need to swap the real and imaginary parts of
     // one of the vectors.
-    __m512i sign_flip_vec = _mm512_set1_epi32(0x80000000);
-    __m512i swap_adjacent_vec = _mm512_set_epi8( //
+    __m512i const sign_flip_vec = _mm512_set1_epi32(0x80000000);
+    __m512i const swap_adjacent_vec = _mm512_set_epi8( //
         61, 60, 63, 62, 57, 56, 59, 58, 53, 52, 55, 54, 49, 48, 51, 50, // 4th 128-bit lane
         45, 44, 47, 46, 41, 40, 43, 42, 37, 36, 39, 38, 33, 32, 35, 34, // 3rd 128-bit lane
         29, 28, 31, 30, 25, 24, 27, 26, 21, 20, 23, 22, 17, 16, 19, 18, // 2nd 128-bit lane
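
P.S. A minimal standalone check of the single-precision permute, written
as a hypothetical test harness for illustration, not part of SimSIMD. The
immediate 0xB1 (0b10'11'00'01) reads lane order 1, 0, 3, 2, so every
(re, im) pair of 32-bit floats comes back as (im, re) using only AVX-512F:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        float in[16], out[16];
        for (int i = 0; i < 16; ++i) in[i] = (float)i;
        __m512 v = _mm512_loadu_ps(in);
        // vpermilps zmm, zmm, 0xB1: AVX-512F only, no AVX-512BW needed.
        v = _mm512_permute_ps(v, 0xB1); // dst[2k] = src[2k+1], dst[2k+1] = src[2k]
        _mm512_storeu_ps(out, v);
        for (int i = 0; i < 16; ++i) printf("%g ", out[i]);
        printf("\n"); // expect: 1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14
        return 0;
    }

Compile with `gcc -O2 -mavx512f` and run on any AVX-512F machine.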
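
The double-precision immediate deserves the same sanity check, because
`_mm512_permute_pd` consumes one selector bit per 64-bit element rather
than two: the pair-swap immediate is 0x55 (0b01010101), while 0xAA
(0b10101010) selects every element's own position and degenerates to the
identity permutation. Another hypothetical harness under the same
AVX-512F assumption:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        double in[8], out[8];
        for (int i = 0; i < 8; ++i) in[i] = (double)i;
        __m512d v = _mm512_loadu_pd(in);
        // vpermilpd zmm, zmm, 0x55: bit j picks the upper (1) or lower (0)
        // double of the 128-bit lane holding element j.
        v = _mm512_permute_pd(v, 0x55); // dst[2k] = src[2k+1], dst[2k+1] = src[2k]
        _mm512_storeu_pd(out, v);
        for (int i = 0; i < 8; ++i) printf("%g ", out[i]);
        printf("\n"); // expect: 1 0 3 2 5 4 7 6
        return 0;
    }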