@@ -1126,6 +1126,9 @@ class BoUpSLP {
1126
1126
void
1127
1127
buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1128
1129
+ /// Transforms graph nodes to target-specific representations, if profitable.
1130
+ void transformNodes();
1131
+
1129
1132
/// Clear the internal data structures that are created by 'buildTree'.
1130
1133
void deleteTree() {
1131
1134
VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7813
7816
return std::make_pair(ScalarCost, VecCost);
7814
7817
}
7815
7818
7819
+ void BoUpSLP::transformNodes() {
7820
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7821
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822
+ TreeEntry &E = *TE.get();
7823
+ switch (E.getOpcode()) {
7824
+ case Instruction::Load: {
7825
+ Type *ScalarTy = E.getMainOp()->getType();
7826
+ auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
7827
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
7828
+ // Check if profitable to represent consecutive load + reverse as strided
7829
+ // load with stride -1.
7830
+ if (isReverseOrder(E.ReorderIndices) &&
7831
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
7832
+ SmallVector<int> Mask;
7833
+ inversePermutation(E.ReorderIndices, Mask);
7834
+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
7835
+ InstructionCost OriginalVecCost =
7836
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
7837
+ BaseLI->getPointerAddressSpace(), CostKind,
7838
+ TTI::OperandValueInfo()) +
7839
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
7840
+ InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7841
+ Instruction::Load, VecTy, BaseLI->getPointerOperand(),
7842
+ /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
7843
+ if (StridedCost < OriginalVecCost)
7844
+ // Strided load is more profitable than consecutive load + reverse -
7845
+ // transform the node to strided load.
7846
+ E.State = TreeEntry::StridedVectorize;
7847
+ }
7848
+ break;
7849
+ }
7850
+ default:
7851
+ break;
7852
+ }
7853
+ }
7854
+ }
7855
+
7816
7856
/// Merges shuffle masks and emits final shuffle instruction, if required. It
7817
7857
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
7818
7858
/// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15189
15229
R.buildExternalUses();
15190
15230
15191
15231
R.computeMinimumValueSizes();
15232
+ R.transformNodes();
15192
15233
15193
15234
InstructionCost Cost = R.getTreeCost();
15194
15235
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15567
15608
R.buildExternalUses();
15568
15609
15569
15610
R.computeMinimumValueSizes();
15611
+ R.transformNodes();
15570
15612
InstructionCost Cost = R.getTreeCost();
15571
15613
CandidateFound = true;
15572
15614
MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
16563
16605
V.buildExternalUses(LocalExternallyUsedValues);
16564
16606
16565
16607
V.computeMinimumValueSizes();
16608
+ V.transformNodes();
16566
16609
16567
16610
// Estimate cost.
16568
16611
InstructionCost TreeCost = V.getTreeCost(VL);
0 commit comments