Skip to content

Commit

Permalink
[AArch64] Add custom lowering for load <3 x i8>. (llvm#78632)
Browse files Browse the repository at this point in the history
Add custom combine to lower load <3 x i8> as the more efficient sequence
below:
   ldrb wX, [x0, #2]
   ldrh wY, [x0]
   orr wX, wY, wX, lsl #16
   fmov s0, wX

At the moment, there are almost no cases in which such vector operations
will be generated automatically. The motivating case is non-power-of-2
SLP vectorization: llvm#77790
  • Loading branch information
fhahn authored Jan 30, 2024
1 parent 748c295 commit d1e162e
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 114 deletions.
65 changes: 63 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21248,6 +21248,61 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}

// Custom combine that rewrites a load of <3 x i8> into two scalar loads whose
// results are packed into one 32-bit value. The intended instruction sequence
// is:
//    ldrb wX, [x0, #2]
//    ldrh wY, [x0]
//    orr wX, wY, wX, lsl #16
//    fmov s0, wX
//
// A sequence with fewer (though usually more complex/expensive) instructions
// would also be possible:
//    ld1r.4h { v0 }, [x0], #2
//    ld1.b   { v0 }[2], [x0]
//
// That alternative is not used: it leads to noticeably worse codegen for code
// that extends the loaded v3i8, because legalization breaks vector shuffle
// detection in a way that is very difficult to work around.
// TODO: Revisit once v3i8 legalization has been improved in general.
//
// Returns an empty SDValue when the load is not a candidate (memory type is
// not v3i8, or the original alignment is 4 or more). Otherwise returns the
// merged pair {v3i8 value, output chain}.
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
  // Guard: only loads whose memory type is exactly <3 x i8> are rewritten.
  const EVT MemVT = LD->getMemoryVT();
  const EVT V3I8VT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3);
  if (MemVT != V3I8VT)
    return SDValue();

  // Guard: loads with an original alignment of 4 or more are left untouched.
  if (LD->getOriginalAlign() >= 4)
    return SDValue();

  SDLoc DL(LD);
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue InChain = LD->getChain();
  SDValue Base = LD->getBasePtr();
  MachineMemOperand *OrigMMO = LD->getMemOperand();
  assert(LD->getOffset().isUndef() && "undef offset expected");

  // First load: the two leading bytes as a single i16 from the base address.
  SDValue Low16 = DAG.getLoad(MVT::i16, DL, InChain, Base, OrigMMO);

  // Second load: the remaining byte at base + 2. Its memory operand is derived
  // from the original one with offset 2 and size 1.
  TypeSize ByteOffset = TypeSize::getFixed(2);
  SDValue BytePtr = DAG.getMemBasePlusOffset(Base, ByteOffset, DL);
  MachineMemOperand *ByteMMO = MF.getMachineMemOperand(OrigMMO, 2, 1);
  SDValue High8 = DAG.getLoad(MVT::i8, DL, InChain, BytePtr, ByteMMO);

  // Widen both pieces to i32 with zero extension.
  SDValue Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Low16);
  SDValue High32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, High8);

  // Shift the third byte above the i16 and OR the two together, then view the
  // packed i32 as v4i8.
  SDValue ShiftAmt = DAG.getConstant(16, DL, MVT::i32);
  SDValue Shifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High32, ShiftAmt);
  SDValue Packed = DAG.getNode(ISD::OR, DL, MVT::i32, Low32, Shifted);
  SDValue AsV4I8 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Packed);

  // Take the leading three lanes to recover a value of the original v3i8 type.
  SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i64);
  SDValue V3I8 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, AsV4I8, ZeroIdx);

  // Join the chain results (result #1) of both new loads so users of the
  // original load's chain are ordered after both of them.
  SDValue Low16Chain = Low16.getValue(1);
  SDValue High8Chain = High8.getValue(1);
  SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                 {Low16Chain, High8Chain});
  return DAG.getMergeValues({V3I8, OutChain}, DL);
}

// Perform TBI simplification if supported by the target, and try to break up
// nontemporal loads larger than 256 bits for odd types so that 256-bit LDNPQ
// load instructions can be selected.
Expand All @@ -21259,10 +21314,16 @@ static SDValue performLOADCombine(SDNode *N,
performTBISimplification(N->getOperand(1), DCI, DAG);

LoadSDNode *LD = cast<LoadSDNode>(N);
EVT MemVT = LD->getMemoryVT();
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
if (LD->isVolatile() || !Subtarget->isLittleEndian())
return SDValue(N, 0);

if (SDValue Res = combineV3I8LoadExt(LD, DAG))
return Res;

if (!LD->isNonTemporal())
return SDValue(N, 0);

EVT MemVT = LD->getMemoryVT();
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
MemVT.getSizeInBits() % 256 == 0 ||
256 % MemVT.getScalarSizeInBits() != 0)
Expand Down
173 changes: 61 additions & 112 deletions llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,10 @@
define <16 x i8> @load_v3i8(ptr %src) {
; CHECK-LABEL: load_v3i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: umov.h w8, v0[0]
; CHECK-NEXT: umov.h w9, v0[1]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: mov.b v0[1], w9
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8:
Expand Down Expand Up @@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #2]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32:
Expand Down Expand Up @@ -90,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #2]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_align_2:
Expand Down Expand Up @@ -160,19 +141,14 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldurh w8, [x0, #1]
; CHECK-NEXT: ldrb w8, [x0, #3]
; CHECK-NEXT: ldurh w9, [x0, #1]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #3]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_const_offset_1:
Expand Down Expand Up @@ -204,19 +180,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldurh w8, [x0, #3]
; CHECK-NEXT: ldrb w8, [x0, #5]
; CHECK-NEXT: ldurh w9, [x0, #3]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ldrsb w8, [x0, #5]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: mov.h v0[1], v0[1]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_to_4xi32_const_offset_3:
Expand Down Expand Up @@ -348,18 +319,14 @@ define <3 x i32> @load_v3i32(ptr %src) {
define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
; CHECK-LABEL: load_v3i8_zext_to_3xi32:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_zext_to_3xi32:
Expand Down Expand Up @@ -388,18 +355,14 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
; CHECK-LABEL: load_v3i8_sext_to_3xi32:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: shl.4s v0, v0, #24
; CHECK-NEXT: sshr.4s v0, v0, #24
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_v3i8_sext_to_3xi32:
Expand Down Expand Up @@ -513,19 +476,15 @@ entry:
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_to_64bits:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: strh w8, [sp, #12]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
; CHECK-NEXT: ldrb w8, [x0, #2]
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: orr w8, w9, w8, lsl #16
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; BE-LABEL: load_ext_to_64bits:
Expand Down Expand Up @@ -614,24 +573,20 @@ entry:
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: load_ext_add_to_64bits:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ldrb w9, [x0, #2]
; CHECK-NEXT: ldrh w10, [x0]
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x8, lCPI15_0@PAGE
; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr d1, [x8, lCPI15_0@PAGEOFF]
; CHECK-NEXT: add x8, x1, #4
; CHECK-NEXT: strh w9, [sp, #12]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: bic.4h v0, #255, lsl #8
; CHECK-NEXT: add.4h v0, v0, v1
; CHECK-NEXT: st1.h { v0 }[2], [x8]
; CHECK-NEXT: str s0, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
;
Expand Down Expand Up @@ -880,24 +835,21 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-LABEL: load_v3i8_zext_to_3xi32_add_trunc_store:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ldrb w9, [x0, #2]
; CHECK-NEXT: ldrh w10, [x0]
; CHECK-NEXT: Lloh4:
; CHECK-NEXT: adrp x8, lCPI22_0@PAGE
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q1, [x8, lCPI22_0@PAGEOFF]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: strh w9, [sp, #12]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
; CHECK-NEXT: st1.b { v0 }[8], [x8]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: st1.b { v0 }[0], [x0]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
;
Expand Down Expand Up @@ -936,24 +888,21 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-LABEL: load_v3i8_sext_to_3xi32_add_trunc_store:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: ldrb w9, [x0, #2]
; CHECK-NEXT: ldrh w10, [x0]
; CHECK-NEXT: Lloh6:
; CHECK-NEXT: adrp x8, lCPI23_0@PAGE
; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q1, [x8, lCPI23_0@PAGEOFF]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: strh w9, [sp, #12]
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
; CHECK-NEXT: st1.b { v0 }[8], [x8]
; CHECK-NEXT: add x8, x0, #1
; CHECK-NEXT: st1.b { v0 }[0], [x0]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
;
Expand Down

0 comments on commit d1e162e

Please sign in to comment.