Skip to content

Commit

Permalink
[InstCombine] canonicalize shifty abs(): ashr+add+xor --> cmp+neg+sel
Browse files Browse the repository at this point in the history
We want to do this for 2 reasons:
1. Value tracking does not recognize the ashr variant, so it would fail to match for cases like D39766.
2. DAGCombiner does better at producing optimal codegen when we have the cmp+sel pattern.

More detail about what happens in the backend:
1. DAGCombiner has a generic transform for all targets to convert the scalar cmp+sel variant of abs
   into the shift variant. That is the opposite of this IR canonicalization.
2. DAGCombiner has a generic transform for all targets to convert the vector cmp+sel variant of abs
   into either an ABS node or the shift variant. That is again the opposite of this IR canonicalization.
3. DAGCombiner has a generic transform for all targets to convert the exact shift variants produced by chapuni#1 or chapuni#2
   into an ISD::ABS node. Note: It would be an efficiency improvement if we had chapuni#1 go directly to an ABS node
   when that's legal/custom.
4. The pattern matching above is incomplete, so it is possible to escape the intended/optimal codegen in a
   variety of ways.
   a. For chapuni#2, the vector path is missing the case for setlt with a '1' constant.
   b. For chapuni#3, we are missing a match for commuted versions of the shift variants.
5. Therefore, this IR canonicalization can only help get us to the optimal codegen. The version of cmp+sel
   produced by this patch will be recognized in the DAG and converted to an ABS node when possible or the
   shift sequence when not.
6. In the following examples with this patch applied, we may get conditional moves rather than the shift
   produced by the generic DAGCombiner transforms. The conditional move is created using a target-specific
   decision for any given target. Whether it is optimal or not for a particular subtarget may be up for debate.

define i32 @abs_shifty(i32 %x) {
  %signbit = ashr i32 %x, 31
  %add = add i32 %signbit, %x
  %abs = xor i32 %signbit, %add
  ret i32 %abs
}

define i32 @abs_cmpsubsel(i32 %x) {
  %cmp = icmp slt i32 %x, zeroinitializer
  %sub = sub i32 zeroinitializer, %x
  %abs = select i1 %cmp, i32 %sub, i32 %x
  ret i32 %abs
}

define <4 x i32> @abs_shifty_vec(<4 x i32> %x) {
  %signbit = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %add = add <4 x i32> %signbit, %x
  %abs = xor <4 x i32> %signbit, %add
  ret <4 x i32> %abs
}

define <4 x i32> @abs_cmpsubsel_vec(<4 x i32> %x) {
  %cmp = icmp slt <4 x i32> %x, zeroinitializer
  %sub = sub <4 x i32> zeroinitializer, %x
  %abs = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %x
  ret <4 x i32> %abs
}

> $ ./opt -instcombine shiftyabs.ll -S | ./llc -o - -mtriple=x86_64 -mattr=avx
> abs_shifty:
> 	movl	%edi, %eax
> 	negl	%eax
> 	cmovll	%edi, %eax
> 	retq
>
> abs_cmpsubsel:
> 	movl	%edi, %eax
> 	negl	%eax
> 	cmovll	%edi, %eax
> 	retq
>
> abs_shifty_vec:
> 	vpabsd	%xmm0, %xmm0
> 	retq
>
> abs_cmpsubsel_vec:
> 	vpabsd	%xmm0, %xmm0
> 	retq
>
> $ ./opt -instcombine shiftyabs.ll -S | ./llc -o - -mtriple=aarch64
> abs_shifty:
> 	cmp	w0, #0                  // =0
> 	cneg	w0, w0, mi
> 	ret
>
> abs_cmpsubsel:
> 	cmp	w0, #0                  // =0
> 	cneg	w0, w0, mi
> 	ret
>
> abs_shifty_vec:
> 	abs	v0.4s, v0.4s
> 	ret
>
> abs_cmpsubsel_vec:
> 	abs	v0.4s, v0.4s
> 	ret
>
> $ ./opt -instcombine shiftyabs.ll -S | ./llc -o - -mtriple=powerpc64le
> abs_shifty:
> 	srawi 4, 3, 31
> 	add 3, 3, 4
> 	xor 3, 3, 4
> 	blr
>
> abs_cmpsubsel:
> 	srawi 4, 3, 31
> 	add 3, 3, 4
> 	xor 3, 3, 4
> 	blr
>
> abs_shifty_vec:
> 	vspltisw 3, -16
> 	vspltisw 4, 15
> 	vsubuwm 3, 4, 3
> 	vsraw 3, 2, 3
> 	vadduwm 2, 2, 3
> 	xxlxor 34, 34, 35
> 	blr
>
> abs_cmpsubsel_vec:
> 	vspltisw 3, -16
> 	vspltisw 4, 15
> 	vsubuwm 3, 4, 3
> 	vsraw 3, 2, 3
> 	vadduwm 2, 2, 3
> 	xxlxor 34, 34, 35
> 	blr
>

Differential Revision: https://reviews.llvm.org/D40984
  • Loading branch information
rotateright committed Dec 16, 2017
1 parent ec7cac8 commit e005e3c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 12 deletions.
20 changes: 20 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2397,5 +2397,25 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;

// Canonicalize the shifty way to code absolute value to the common pattern.
// There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
// We're relying on the fact that we only do this transform when the shift has
// exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
// instructions).
if (Op0->getNumUses() == 2)
std::swap(Op0, Op1);

const APInt *ShAmt;
Type *Ty = I.getType();
if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
Op1->getNumUses() == 2 && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
// B = ashr i32 A, 31 ; smear the sign bit
// xor (add A, B), B ; add -1 and flip bits if negative
// --> (A < 0) ? -A : A
Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
return SelectInst::Create(Cmp, Builder.CreateNeg(A), A);
}

return Changed ? &I : nullptr;
}
24 changes: 12 additions & 12 deletions llvm/test/Transforms/InstCombine/abs-1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ define i64 @test_llabs(i64 %x) {

define i8 @shifty_abs_commute0(i8 %x) {
; CHECK-LABEL: @shifty_abs_commute0(
; CHECK-NEXT: [[SIGNBIT:%.*]] = ashr i8 %x, 7
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SIGNBIT]], %x
; CHECK-NEXT: [[ABS:%.*]] = xor i8 [[ADD]], [[SIGNBIT]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 %x, 0
; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, %x
; CHECK-NEXT: [[ABS:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 %x
; CHECK-NEXT: ret i8 [[ABS]]
;
%signbit = ashr i8 %x, 7
Expand All @@ -61,9 +61,9 @@ define i8 @shifty_abs_commute0(i8 %x) {

define <2 x i8> @shifty_abs_commute1(<2 x i8> %x) {
; CHECK-LABEL: @shifty_abs_commute1(
; CHECK-NEXT: [[SIGNBIT:%.*]] = ashr <2 x i8> %x, <i8 7, i8 7>
; CHECK-NEXT: [[ADD:%.*]] = add <2 x i8> [[SIGNBIT]], %x
; CHECK-NEXT: [[ABS:%.*]] = xor <2 x i8> [[SIGNBIT]], [[ADD]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i8> %x, zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i8> zeroinitializer, %x
; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[TMP1]], <2 x i8> [[TMP2]], <2 x i8> %x
; CHECK-NEXT: ret <2 x i8> [[ABS]]
;
%signbit = ashr <2 x i8> %x, <i8 7, i8 7>
Expand All @@ -75,9 +75,9 @@ define <2 x i8> @shifty_abs_commute1(<2 x i8> %x) {
define <2 x i8> @shifty_abs_commute2(<2 x i8> %x) {
; CHECK-LABEL: @shifty_abs_commute2(
; CHECK-NEXT: [[Y:%.*]] = mul <2 x i8> %x, <i8 3, i8 3>
; CHECK-NEXT: [[SIGNBIT:%.*]] = ashr <2 x i8> [[Y]], <i8 7, i8 7>
; CHECK-NEXT: [[ADD:%.*]] = add <2 x i8> [[Y]], [[SIGNBIT]]
; CHECK-NEXT: [[ABS:%.*]] = xor <2 x i8> [[SIGNBIT]], [[ADD]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i8> [[Y]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i8> zeroinitializer, [[Y]]
; CHECK-NEXT: [[ABS:%.*]] = select <2 x i1> [[TMP1]], <2 x i8> [[TMP2]], <2 x i8> [[Y]]
; CHECK-NEXT: ret <2 x i8> [[ABS]]
;
%y = mul <2 x i8> %x, <i8 3, i8 3> ; extra op to thwart complexity-based canonicalization
Expand All @@ -90,9 +90,9 @@ define <2 x i8> @shifty_abs_commute2(<2 x i8> %x) {
define i8 @shifty_abs_commute3(i8 %x) {
; CHECK-LABEL: @shifty_abs_commute3(
; CHECK-NEXT: [[Y:%.*]] = mul i8 %x, 3
; CHECK-NEXT: [[SIGNBIT:%.*]] = ashr i8 [[Y]], 7
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[Y]], [[SIGNBIT]]
; CHECK-NEXT: [[ABS:%.*]] = xor i8 [[ADD]], [[SIGNBIT]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[Y]], 0
; CHECK-NEXT: [[TMP2:%.*]] = sub i8 0, [[Y]]
; CHECK-NEXT: [[ABS:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 [[Y]]
; CHECK-NEXT: ret i8 [[ABS]]
;
%y = mul i8 %x, 3 ; extra op to thwart complexity-based canonicalization
Expand Down

0 comments on commit e005e3c

Please sign in to comment.