Commit 0ab0c1d

[SLP]Introduce transformNodes() and transform loads + reverse to strided loads.
Introduced a transformNodes() function to perform transformations of the graph nodes (cost-based, instruction-count based, etc.). Implemented the transformation of consecutive loads + reverse order into strided loads with stride -1, where profitable.

Reviewers: RKSimon, preames, topperc
Reviewed By: RKSimon
Pull Request: #88530
Parent: 6ad22c8
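To illustrate the rewrite at the IR level, here is a minimal before/after sketch modeled on the RISC-V test update further down; the pointer names %base and %last (= %base + 7 floats, the highest-addressed element) are hypothetical, not taken from the patch:

; Before: consecutive <8 x float> load at %base, then a reverse shuffle.
%v   = load <8 x float>, ptr %base, align 4
%rev = shufflevector <8 x float> %v, <8 x float> poison,
                     <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

; After: one strided load with a -4 byte stride (element stride -1), starting
; at the last element and yielding the reversed vector directly.
%rev.strided = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 %last, i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)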

2 files changed: +45 -3 lines

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 43 additions & 0 deletions
@@ -1126,6 +1126,9 @@ class BoUpSLP {
   void
   buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
 
+  /// Transforms graph nodes to target specific representations, if profitable.
+  void transformNodes();
+
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
     VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
   return std::make_pair(ScalarCost, VecCost);
 }
 
+void BoUpSLP::transformNodes() {
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    TreeEntry &E = *TE.get();
+    switch (E.getOpcode()) {
+    case Instruction::Load: {
+      Type *ScalarTy = E.getMainOp()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+      // Check if profitable to represent consecutive load + reverse as strided
+      // load with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+                                 BaseLI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+        if (StridedCost < OriginalVecCost)
+          // Strided load is more profitable than consecutive load + reverse -
+          // transform the node to strided load.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
 /// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   R.buildExternalUses();
 
   R.computeMinimumValueSizes();
+  R.transformNodes();
 
   InstructionCost Cost = R.getTreeCost();
 
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   R.buildExternalUses();
 
   R.computeMinimumValueSizes();
+  R.transformNodes();
   InstructionCost Cost = R.getTreeCost();
   CandidateFound = true;
   MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
       V.buildExternalUses(LocalExternallyUsedValues);
 
      V.computeMinimumValueSizes();
+      V.transformNodes();
 
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
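To make the profitability check concrete with hypothetical TTI numbers (not taken from any real target): if the contiguous <8 x float> load costs 1 and the SK_Reverse shuffle costs 2, OriginalVecCost is 3; if the strided load costs 2, then StridedCost (2) < OriginalVecCost (3), so the entry's state is switched to TreeEntry::StridedVectorize and codegen later emits the stride -1 load instead of the load + shuffle pair.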

llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll

Lines changed: 2 additions & 3 deletions
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
 ; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
