From 3857530a491e9309cde5572a575d69311b939386 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Tue, 10 Jun 2025 12:52:29 +0000
Subject: [PATCH 1/3] [LV] Use getFixedValue instead of getKnownMinValue when
 appropriate

There are many places in VPlan and LoopVectorize where we use
getKnownMinValue to discover the number of elements in a vector. Where
we expect the vector to have a fixed length, I have used the stronger
getFixedValue call. I believe this is clearer and adds extra protection
in the form of an assert in getFixedValue that the vector is not
scalable.

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also
took the liberty of simplifying the code.

In theory I believe this patch should be NFC, but I'm reluctant to add
that to the title in case we're just missing tests. I built and ran the
LLVM test suite when targeting neoverse-v1 and it seemed ok.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 34 +++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  7 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +++++-----
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8177b76ad5bdf..f5324e85f0048 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3107,12 +3107,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
       // that we will create. This cost is likely to be zero. The phi node
       // cost, if any, should be scaled by the block probability because it
       // models a copy at the end of each predicated block.
-      ScalarizationCost += VF.getKnownMinValue() *
-          TTI.getCFInstrCost(Instruction::PHI, CostKind);
+      ScalarizationCost +=
+          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
       // The cost of the non-predicated instruction.
-      ScalarizationCost += VF.getKnownMinValue() *
-          TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+      ScalarizationCost +=
+          VF.getFixedValue() *
+          TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
       // The cost of insertelement and extractelement instructions needed for
       // scalarization.
@@ -4280,7 +4281,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
         return NumLegalParts <= VF.getKnownMinValue();
       }
       // Two or more elements that share a register - are vectorized.
-      return NumLegalParts < VF.getKnownMinValue();
+      return NumLegalParts < VF.getFixedValue();
     };
 
     // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4565,8 +4566,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
       assert(!isa<SCEVCouldNotCompute>(TC) &&
              "Trip count SCEV must be computable");
       RemainingIterations = SE.getURemExpr(
-          TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-      MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+          TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+      MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
       if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                               SE.getConstant(TCType, MaxTripCount))) {
         MaxTripCount =
@@ -4577,7 +4578,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
     }
     if (SE.isKnownPredicate(
             CmpInst::ICMP_UGT,
-            SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+            SE.getConstant(TCType, NextVF.Width.getFixedValue()),
             RemainingIterations))
       continue;
   }
@@ -5248,14 +5249,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5271,7 +5272,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
     auto *VecI1Ty =
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
-        VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+        VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/false, /*Extract=*/true, CostKind);
     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -5317,7 +5318,6 @@ InstructionCost
 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                 ElementCount VF) {
   assert(Legal->isUniformMemOp(*I, VF));
-
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
@@ -5332,6 +5332,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This is actually
+  // meaningless, given what we actually want is the last lane and is likely
+  // to be more expensive.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5614,7 +5618,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
   for (Type *VectorTy : getContainedTypes(RetTy)) {
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/true, /*Extract=*/false, CostKind);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1838562f26b82..b5fa98d341756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
 
   bool IsSingleScalar = vputils::isSingleScalar(Def);
 
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa6b13c217bd1..7894ccd9b4c5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -872,7 +872,7 @@ void VPInstruction::execute(VPTransformState &State) {
          isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2792,8 +2792,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2846,7 +2845,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                UI->getOpcode(), ResultTy, CostKind,
                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
               Op2Info, Operands, UI, &Ctx.TLI) *
-           (isSingleScalar() ? 1 : VF.getKnownMinValue());
+           (isSingleScalar() ? 1 : VF.getFixedValue());
   }
 }
 
@@ -3399,7 +3398,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       Value *ResBlockInMask = State.get(BlockInMask);
       Value *ShuffledMask = State.Builder.CreateShuffleVector(
           ResBlockInMask,
-          createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+          createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
           "interleaved.mask");
       return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
@@ -3411,8 +3410,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
@@ -3475,13 +3474,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
         continue;
 
       auto StrideMask =
-          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+          createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
       Value *StridedVec =
           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
       // If this member has different type, cast the result type.
       if (Member->getType() != ScalarTy) {
-        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
         StridedVec =
             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3817,7 +3815,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;

From 3957b6c62adfb8fa1333643a00eadfd0dfa503a4 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Wed, 11 Jun 2025 12:40:36 +0000
Subject: [PATCH 2/3] Address review comments

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp  | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5324e85f0048..8f8a1ce9d7ad3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5332,10 +5332,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
-  // TODO: We have tests that request the cost of extracting element
-  // VF.getKnownMinValue() - 1 from a scalable vector. This is actually
-  // meaningless, given what we actually want is the last lane and is likely
-  // to be more expensive.
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7894ccd9b4c5a..c4c6141cb62c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3462,6 +3462,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
       return;
     }
 
+    assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
     // For each member in the group, shuffle out the appropriate data from the
     // wide loads.
 

From 4050dc98377f9676895c821b83a3ce90c032a80f Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Thu, 12 Jun 2025 12:46:08 +0000
Subject: [PATCH 3/3] Re-add blank line

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f8a1ce9d7ad3..86bea6afdb738 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5318,6 +5318,7 @@ InstructionCost
 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                 ElementCount VF) {
   assert(Legal->isUniformMemOp(*I, VF));
+
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
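
As background, here is a minimal standalone sketch of the ElementCount API
distinction the commit message relies on. It is illustrative only and not part
of the patch series; it assumes LLVM's llvm/Support/TypeSize.h and uses
made-up variable and function names.

  #include "llvm/Support/TypeSize.h"
  using namespace llvm;

  void elementCountSketch() {
    ElementCount FixedVF = ElementCount::getFixed(4);       // a <4 x i32>-style VF
    ElementCount ScalableVF = ElementCount::getScalable(4); // a <vscale x 4 x i32>-style VF

    // getKnownMinValue() returns 4 for both. For the scalable VF that is only
    // the minimum lane count; the real count is 4 * vscale, unknown until
    // runtime, so code that treats it as the full lane count under-counts.
    unsigned MinF = FixedVF.getKnownMinValue();    // 4
    unsigned MinS = ScalableVF.getKnownMinValue(); // 4

    // getFixedValue() returns the same 4 for the fixed VF, but additionally
    // asserts !isScalable(), so a mistaken query on a scalable VF fails
    // loudly in an asserts build instead of silently miscounting lanes.
    unsigned Lanes = FixedVF.getFixedValue(); // 4
    // ScalableVF.getFixedValue();            // would trigger the assert
    (void)MinF; (void)MinS; (void)Lanes;
  }

This is also why the TODO added in getUniformMemOpCost notes that costing an
extract of lane VF.getKnownMinValue() - 1 from a scalable vector does not
model the generated code: the real last lane has index vscale * 4 - 1 in the
sketch above, which is unknown at compile time and likely more expensive to
extract.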