Skip to content

Commit

Permalink
[SLP] Attempt to vectorize long stores if short ones failed.
Browse files Browse the repository at this point in the history
We can try to vectorize long store sequences if the short ones were
unsuccessful because vectorization was not profitable. This should not
increase compile time significantly (the stores are already sorted, so the
complexity is O(n log n)), but it allows extra code to be vectorized.

Metric: size..text

Program                                                                         size..text
                                                                                results     results0    diff
         test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  1088012.00  1088236.00  0.0%
                  test-suite :: SingleSource/UnitTests/matrix-types-spec.test   480396.00   480476.00  0.0%
          test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   664613.00   664661.00  0.0%
         test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   664613.00   664661.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2041105.00  2040961.00 -0.0%
                 test-suite :: MultiSource/Applications/JM/lencod/lencod.test   836563.00   836387.00 -0.0%
                 test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  1035100.00  1032140.00 -0.3%

In all benchmarks, extra code gets vectorized.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm#88563
  • Loading branch information
alexey-bataev authored Apr 16, 2024
1 parent c7657cf commit 7d4e8c1
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 65 deletions.
81 changes: 51 additions & 30 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15237,39 +15237,60 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
Size *= 2;
});
unsigned StartIdx = 0;
for (unsigned Size : CandidateVFs) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
assert(
all_of(
Slice,
[&](Value *V) {
return cast<StoreInst>(V)->getValueOperand()->getType() ==
cast<StoreInst>(Slice.front())
->getValueOperand()
->getType();
}) &&
"Expected all operands of same type.");
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
.second &&
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
// If we vectorized initial block, no need to try to vectorize it
// again.
if (Cnt == StartIdx)
StartIdx += Size;
Cnt += Size;
continue;
unsigned Repeat = 0;
constexpr unsigned MaxAttempts = 2;
while (true) {
++Repeat;
for (unsigned Size : CandidateVFs) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
assert(
all_of(
Slice,
[&](Value *V) {
return cast<StoreInst>(V)->getValueOperand()->getType() ==
cast<StoreInst>(Slice.front())
->getValueOperand()
->getType();
}) &&
"Expected all operands of same type.");
if (!VectorizedStores.count(Slice.front()) &&
!VectorizedStores.count(Slice.back()) &&
TriedSequences
.insert(std::make_pair(Slice.front(), Slice.back()))
.second &&
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
// Mark the vectorized stores so that we don't vectorize them
// again.
VectorizedStores.insert(Slice.begin(), Slice.end());
Changed = true;
// If we vectorized initial block, no need to try to vectorize
// it again.
if (Cnt == StartIdx)
StartIdx += Size;
Cnt += Size;
continue;
}
++Cnt;
}
// Check if the whole array was vectorized already - exit.
if (StartIdx >= Operands.size()) {
Repeat = MaxAttempts;
break;
}
++Cnt;
}
// Check if the whole array was vectorized already - exit.
if (StartIdx >= Operands.size())
// Check if tried all attempts or no need for the last attempts at all.
if (Repeat >= MaxAttempts)
break;
const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx);
if (MaxVF >= MaxTotalNum)
break;
// Last attempt to vectorize max number of elements, if all previous
// attempts were unsuccessful because of the cost issues.
CandidateVFs.clear();
for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) {
CandidateVFs.push_back(Size);
}
}
}
};
Expand Down
46 changes: 11 additions & 35 deletions llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
Original file line number Diff line number Diff line change
Expand Up @@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
; SSE-LABEL: @store_i64(
; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: ret void
;
; AVX-LABEL: @store_i64(
Expand Down

0 comments on commit 7d4e8c1

Please sign in to comment.