diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8ae38550d3095..5c265fd4862cd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1126,6 +1126,9 @@ class BoUpSLP {
   void
   buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
 
+  /// Transforms graph nodes to target specific representations, if profitable.
+  void transformNodes();
+
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
     VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
   return std::make_pair(ScalarCost, VecCost);
 }
 
+void BoUpSLP::transformNodes() {
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    TreeEntry &E = *TE.get();
+    switch (E.getOpcode()) {
+    case Instruction::Load: {
+      Type *ScalarTy = E.getMainOp()->getType();
+      auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+      // Check if profitable to represent consecutive load + reverse as strided
+      // load with stride -1.
+      if (isReverseOrder(E.ReorderIndices) &&
+          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+        SmallVector<int> Mask;
+        inversePermutation(E.ReorderIndices, Mask);
+        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+        InstructionCost OriginalVecCost =
+            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+                                 BaseLI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo()) +
+            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+        if (StridedCost < OriginalVecCost)
+          // Strided load is more profitable than consecutive load + reverse -
+          // transform the node to strided load.
+          E.State = TreeEntry::StridedVectorize;
+      }
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
 /// when the actual shuffle instruction is generated only if this is actually
@@ -15135,6 +15175,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   R.buildExternalUses();
 
   R.computeMinimumValueSizes();
+  R.transformNodes();
 
   InstructionCost Cost = R.getTreeCost();
 
@@ -15534,6 +15575,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       R.buildExternalUses();
 
       R.computeMinimumValueSizes();
+      R.transformNodes();
       InstructionCost Cost = R.getTreeCost();
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
@@ -16530,6 +16572,7 @@ class HorizontalReduction {
       V.buildExternalUses(LocalExternallyUsedValues);
 
       V.computeMinimumValueSizes();
+      V.transformNodes();
 
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 03acc0009fb04..44d320c75fedd 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    ret void
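
For illustration only, a minimal hand-written IR sketch of the rewrite the new transformNodes() hook enables; the functions @load_reverse and @load_reverse_strided are hypothetical and not part of the patch. A consecutive vector load whose result is immediately reversed can instead be emitted as a single llvm.experimental.vp.strided.load with a negative byte stride, reading from the last element backwards (stride -4 for 4-byte float elements):

; Before (hypothetical): consecutive load of 4 floats followed by a reverse shuffle.
define <4 x float> @load_reverse(ptr %base) {
  %v = load <4 x float>, ptr %base, align 4
  %rev = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %rev
}

; After (hypothetical): the same elements loaded directly in reverse order, starting
; at the last element and stepping back 4 bytes per lane, with an all-true mask.
define <4 x float> @load_reverse_strided(ptr %base) {
  %last = getelementptr inbounds float, ptr %base, i64 3
  %rev = call <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i64(ptr align 4 %last, i64 -4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <4 x float> %rev
}

declare <4 x float> @llvm.experimental.vp.strided.load.v4f32.p0.i64(ptr, i64, <4 x i1>, i32)

The cost check in transformNodes() only switches a node to TreeEntry::StridedVectorize when the target reports strided load/store as legal and cheaper than the consecutive load plus reverse shuffle, as the RISC-V test above exercises.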