diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       case VPDef::VPDerivedIVSC:
       case VPDef::VPScalarIVStepsSC:
       case VPDef::VPReplicateSC:
+      case VPDef::VPReverseInterleavePtrSC:
       case VPDef::VPInstructionSC:
       case VPDef::VPCanonicalIVPHISC:
       case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPInstructionSC:
     case VPRecipeBase::VPReductionEVLSC:
     case VPRecipeBase::VPReductionSC:
+    case VPRecipeBase::VPReverseInterleavePtrSC:
     case VPRecipeBase::VPMulAccumulateReductionSC:
     case VPRecipeBase::VPExtendedReductionSC:
     case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
            R->getVPDefID() == VPRecipeBase::VPReductionSC ||
            R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
            R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+           R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
            R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
 #endif
 };
 
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+  Type *IndexedTy;
+  unsigned Factor;
+
+public:
+  VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+                               unsigned Factor, GEPNoWrapFlags GEPFlags,
+                               DebugLoc DL)
+      : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+                            ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+        IndexedTy(IndexedTy), Factor(Factor) {
+    assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+  VPValue *getPtr() const { return getOperand(0); }
+
+  VPValue *getVFValue() const { return getOperand(1); }
+
+  void execute(VPTransformState &State) override;
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
+  VPReverseInterleavePtrRecipe *clone() override {
+    return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+                                            Factor, getGEPNoWrapFlags(),
+                                            getDebugLoc());
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A pure virtual base class for all recipes modeling header phis, including
 /// phis for first order recurrences, pointer inductions and reductions. The
 /// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
-                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
-            return inferScalarType(R->getOperand(0));
-          })
+                VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+              [this](const VPRecipeBase *R) {
+                return inferScalarType(R->getOperand(0));
+              })
           // VPInstructionWithType must be handled before VPInstruction.
           .Case<VPInstructionWithType>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPDerivedIVSC:
   case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
+  case VPReverseInterleavePtrSC:
   case VPVectorEndPointerSC:
     return false;
   case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+  Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+  Type *IndexTy = Builder.getInt32Ty();
+  if (RuntimeVF->getType() != IndexTy)
+    RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+  Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+  Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+  Index = Builder.CreateNeg(Index);
+  Value *ReversePtr =
+      Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+  State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+                                         VPSlotTracker &SlotTracker) const {
+  O << Indent;
+  printAsOperand(O, SlotTracker);
+  O << " = reverse-interleave-ptr";
+  printFlags(O);
+  printOperands(O, SlotTracker);
+}
+#endif
+
 void VPBlendRecipe::execute(VPTransformState &State) {
   assert(isNormalized() && "Expected blend to be normalized!");
   // We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (auto *I = dyn_cast<Instruction>(ResAddr))
     State.setDebugLocFrom(I->getDebugLoc());
 
-  // If the group is reverse, adjust the index to refer to the last vector lane
-  // instead of the first. We adjust the index from the first vector lane,
-  // rather than directly getting the pointer for lane VF - 1, because the
-  // pointer operand of the interleaved access is supposed to be uniform.
-  if (Group->isReverse()) {
-    Value *RuntimeVF =
-        getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
-    Value *Index =
-        State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
-    Index = State.Builder.CreateMul(Index,
-                                    State.Builder.getInt32(Group->getFactor()));
-    Index = State.Builder.CreateNeg(Index);
-
-    bool InBounds = false;
-    if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
-      InBounds = Gep->isInBounds();
-    ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
-  }
-
   State.setDebugLocFrom(getDebugLoc());
   Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
       Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
                       : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
     }
+    // If the group is reverse, adjust the index to refer to the last vector
+    // lane instead of the first. We adjust the index from the first vector
+    // lane, rather than directly getting the pointer for lane VF - 1, because
+    // the pointer operand of the interleaved access is supposed to be uniform.
+    if (IG->isReverse()) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(
+          getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+      auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+          Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+          GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+                                   : GEPNoWrapFlags::none(),
+          InsertPos->getDebugLoc());
+      ReversePtr->insertBefore(InsertPos);
+      Addr = ReversePtr;
+    }
     auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
                                         InsertPos->getMask(), NeedsMaskForGaps,
                                         InsertPos->getDebugLoc());
     VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
     VPInterleaveSC,
     VPReductionEVLSC,
     VPReductionSC,
+    VPReverseInterleavePtrSC,
     VPMulAccumulateReductionSC,
     VPExtendedReductionSC,
     VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
 ; CHECK-NEXT:    [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]