diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..eb0d17bdd44b4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1443,7 +1443,7 @@ class TargetTransformInfo {
   /// Index = -1 to indicate that there is no information about the index value.
   LLVM_ABI InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                                     VectorType *VecTy,
-                                                    unsigned Index, TTI::TargetCostKind CostKind) const;
+                                                    int Index, TTI::TargetCostKind CostKind) const;

   /// \return The expected cost of control-flow related instructions such as
   /// Phi, Ret, Br, Switch.
@@ -1465,6 +1465,13 @@ class TargetTransformInfo {
       OperandValueInfo Op2Info = {OK_AnyValue, OP_None},
       const Instruction *I = nullptr) const;

+  enum : int {
+    UnknownIndex = -1,
+    LastIndex = -2,
+  };
+
+  static inline bool isKnownVectorIndex(int Index) { return Index >= 0; }
+
   /// \return The expected cost of vector Insert and Extract.
   /// Use -1 to indicate that there is no information on the index value.
   /// This is used when the instruction is not available; a typical use
@@ -1472,7 +1479,7 @@ class TargetTransformInfo {
   /// vectorizer passes.
   LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
-                                              unsigned Index = -1,
+                                              int Index = UnknownIndex,
                                               const Value *Op0 = nullptr,
                                               const Value *Op1 = nullptr) const;

@@ -1486,7 +1493,7 @@ class TargetTransformInfo {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   LLVM_ABI InstructionCost getVectorInstrCost(
-      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
       Value *Scalar,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;

@@ -1498,7 +1505,7 @@ class TargetTransformInfo {
   /// exists (e.g., from basic blocks during transformation).
   LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                               TTI::TargetCostKind CostKind,
-                                              unsigned Index = -1) const;
+                                              int Index = UnknownIndex) const;

   /// \return The expected cost of aggregate inserts and extracts. This is
   /// used when the instruction is not available; a typical use case is to
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..e8037a2e208ab 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -758,7 +758,7 @@ class TargetTransformInfoImplBase {

   virtual InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                                    VectorType *VecTy,
-                                                   unsigned Index, TTI::TargetCostKind CostKind) const {
+                                                   int Index, TTI::TargetCostKind CostKind) const {
     return 1;
   }

@@ -781,7 +781,7 @@ class TargetTransformInfoImplBase {

   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                              TTI::TargetCostKind CostKind,
-                                             unsigned Index, const Value *Op0,
+                                             int Index, const Value *Op0,
                                              const Value *Op1) const {
     return 1;
   }

@@ -791,7 +791,7 @@ class TargetTransformInfoImplBase {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   virtual InstructionCost getVectorInstrCost(
-      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
       Value *Scalar,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
     return 1;
@@ -799,7 +799,7 @@ class TargetTransformInfoImplBase {

   virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                              TTI::TargetCostKind CostKind,
-                                             unsigned Index) const {
+                                             int Index) const {
     return 1;
   }

@@ -1522,7 +1522,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       auto *IE = dyn_cast<InsertElementInst>(U);
       if (!IE)
         return TTI::TCC_Basic; // FIXME
-      unsigned Idx = -1;
+      int Idx = TargetTransformInfo::UnknownIndex;
       if (auto *CI = dyn_cast<ConstantInt>(Operands[2]))
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
@@ -1641,7 +1641,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       auto *EEI = dyn_cast<ExtractElementInst>(U);
       if (!EEI)
         return TTI::TCC_Basic; // FIXME
-      unsigned Idx = -1;
+      int Idx = TargetTransformInfo::UnknownIndex;
       if (auto *CI = dyn_cast<ConstantInt>(Operands[1]))
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..e9f2698ccbf8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1341,7 +1341,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                            VectorType *VecTy,
-                                           unsigned Index,
+                                           int Index,
                                            TTI::TargetCostKind CostKind) const override {
     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                        CostKind, Index, nullptr, nullptr) +
@@ -1409,8 +1409,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }

   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override {
     return getRegUsageForType(Val->getScalarType());
   }

@@ -1420,8 +1420,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, Value *Scalar,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     Value *Scalar,
                                      ArrayRef<std::tuple<Value *, User *, int>>
                                          ScalarUserAndIdx) const override {
     return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
@@ -1430,7 +1430,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
-                                     unsigned Index) const override {
+                                     int Index) const override {
     Value *Op0 = nullptr;
     Value *Op1 = nullptr;
     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2d053e55bdfa9..86846009fa60a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1058,7 +1058,7 @@ InstructionCost TargetTransformInfo::getCastInstrCost(
 }

 InstructionCost TargetTransformInfo::getExtractWithExtendCost(
-    unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index,
+    unsigned Opcode, Type *Dst, VectorType *VecTy, int Index,
     TTI::TargetCostKind CostKind) const {
   InstructionCost Cost =
       TTIImpl->getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
@@ -1088,7 +1088,7 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
 }

 InstructionCost TargetTransformInfo::getVectorInstrCost(
-    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
     const Value *Op0, const Value *Op1) const {
   assert((Opcode == Instruction::InsertElement ||
           Opcode == Instruction::ExtractElement) &&
@@ -1100,7 +1100,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(

 InstructionCost TargetTransformInfo::getVectorInstrCost(
-    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   assert((Opcode == Instruction::InsertElement ||
@@ -1115,7 +1115,7 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
 InstructionCost
 TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
                                         TTI::TargetCostKind CostKind,
-                                        unsigned Index) const {
+                                        int Index) const {
   // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
   // This is mentioned in the interface description and respected by all
   // callers, but never asserted upon.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..94711dd63d0d1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3642,7 +3642,7 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,

 InstructionCost
 AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
-                                         VectorType *VecTy, unsigned Index,
+                                         VectorType *VecTy, int Index,
                                          TTI::TargetCostKind CostKind) const {
   // Make sure we were given a valid extend opcode.
@@ -3711,12 +3711,24 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
 }

 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
-    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
     bool HasRealUse, const Instruction *I, Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   assert(Val->isVectorTy() && "This must be a vector type");

-  if (Index != -1U) {
+  if (Index == TargetTransformInfo::LastIndex) {
+    if (isa<ScalableVectorType>(Val)) {
+      // This typically requires both while and lastb instructions in order
+      // to extract the last element. If this is in a loop the while
+      // instruction can at least be hoisted out, although it will consume a
+      // predicate register. The cost should be more expensive than the base
+      // extract cost, which is 2 for most CPUs.
+      return CostKind == TTI::TCK_CodeSize ? 2 : 3;
+    }
+    Index = cast<FixedVectorType>(Val)->getNumElements() - 1;
+  }
+
+  if (TargetTransformInfo::isKnownVectorIndex(Index)) {
     // Legalize the type.
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3884,8 +3896,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(

 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    TTI::TargetCostKind CostKind,
-                                                   unsigned Index,
-                                                   const Value *Op0,
+                                                   int Index, const Value *Op0,
                                                    const Value *Op1) const {
   bool HasRealUse =
       Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
@@ -3893,7 +3904,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 }

 InstructionCost AArch64TTIImpl::getVectorInstrCost(
-    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
@@ -3903,7 +3914,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
-                                                   unsigned Index) const {
+                                                   int Index) const {
   return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
                                   true /* HasRealUse */, &I);
 }
@@ -4052,10 +4063,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     // loading the vector from constant pool or in some cases, may also result
     // in scalarization. For now, we are approximating this with the
     // scalarization cost.
-    auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
-                                              CostKind, -1, nullptr, nullptr);
-    auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
-                                         CostKind, -1, nullptr, nullptr);
+    auto ExtractCost =
+        2 * getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+                               TargetTransformInfo::UnknownIndex, nullptr,
+                               nullptr);
+    auto InsertCost = getVectorInstrCost(
+        Instruction::InsertElement, Ty, CostKind,
+        TargetTransformInfo::UnknownIndex, nullptr, nullptr);
     unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
     return ExtractCost + InsertCost +
            NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
@@ -4153,9 +4167,11 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
       // On AArch64, without SVE, vector divisions are expanded
       // into scalar divisions of each pair of elements.
       Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
-                                 -1, nullptr, nullptr);
-      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
-                                 nullptr, nullptr);
+                                 TargetTransformInfo::UnknownIndex, nullptr,
+                                 nullptr);
+      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+                                 TargetTransformInfo::UnknownIndex, nullptr,
+                                 nullptr);
     }

     // TODO: if one of the arguments is scalar, then it's not necessary to
@@ -4186,11 +4202,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
       return LT.first;
     return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
            (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
-            getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
-                               nullptr, nullptr) *
+            getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
+                               TargetTransformInfo::UnknownIndex, nullptr,
+                               nullptr) *
                2 +
-            getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
-                               nullptr, nullptr));
+            getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
+                               TargetTransformInfo::UnknownIndex, nullptr,
+                               nullptr));
   case ISD::ADD:
   case ISD::XOR:
   case ISD::OR:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..96dc151eec783 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -73,7 +73,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCostHelper(
-      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
       bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;

@@ -197,15 +197,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {

   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                            VectorType *VecTy,
-                                           unsigned Index,
+                                           int Index,
                                            TTI::TargetCostKind CostKind) const override;

   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) const override;

   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;

   /// \param ScalarUserAndIdx encodes the information about extracts from a
@@ -213,14 +213,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, Value *Scalar,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     Value *Scalar,
                                      ArrayRef<std::tuple<Value *, User *, int>>
                                          ScalarUserAndIdx) const override;

   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
-                                     unsigned Index) const override;
+                                     int Index) const override;

   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..3eb0b02f47d32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -834,7 +834,7 @@ GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,

 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                                TTI::TargetCostKind CostKind,
-                                               unsigned Index, const Value *Op0,
+                                               int Index, const Value *Op0,
                                                const Value *Op1) const {
   switch (Opcode) {
   case Instruction::ExtractElement:
@@ -853,7 +853,7 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     // operations, and we don't have to copy into a different register class.

     // Dynamic indexing isn't free and is best avoided.
-    return Index == ~0u ? 2 : 0;
+    return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
   }
   default:
     return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631a..7726fa31949da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -169,8 +169,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;

   bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index 3093227279a31..3bd4a20390e32 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -110,8 +110,7 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,

 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                                 TTI::TargetCostKind CostKind,
-                                                unsigned Index,
-                                                const Value *Op0,
+                                                int Index, const Value *Op0,
                                                 const Value *Op1) const {
   switch (Opcode) {
   case Instruction::ExtractElement:
@@ -128,7 +127,7 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     // operations, and we don't have to copy into a different register class.

     // Dynamic indexing isn't free and is best avoided.
-    return Index == ~0u ? 2 : 0;
+    return TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 2;
   }
   default:
     return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 3deae69bfc8c9..2bcc47a01eb05 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -63,8 +63,8 @@ class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;
 };
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 6c3a1ae7e1775..e0d89ea5d5325 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -901,7 +901,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,

 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                                TTI::TargetCostKind CostKind,
-                                               unsigned Index, const Value *Op0,
+                                               int Index, const Value *Op0,
                                                const Value *Op1) const {
   // Penalize inserting into an D-subregister. We end up with a three times
   // lower estimated throughput on swift.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 20a2c59511087..36b988215c5d3 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -253,8 +253,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;

   InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index a4cc472fdbf29..f5619d8931fe1 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -316,11 +316,10 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,

 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    TTI::TargetCostKind CostKind,
-                                                   unsigned Index,
-                                                   const Value *Op0,
+                                                   int Index, const Value *Op0,
                                                    const Value *Op1) const {
-  Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
-                                   : Val;
+  Type *ElemTy =
+      Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() : Val;
   if (Opcode == Instruction::InsertElement) {
     // Need two rotations for non-zero index.
     unsigned Cost = (Index != 0) ? 2 : 0;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d7509c3bb1d2f..676dd2cabb045 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -154,8 +154,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;

   InstructionCost
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f9e77f2abdca2..e2eb108167c8e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -674,7 +674,7 @@ InstructionCost PPCTTIImpl::getCmpSelInstrCost(

 InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                TTI::TargetCostKind CostKind,
-                                               unsigned Index, const Value *Op0,
+                                               int Index, const Value *Op0,
                                                const Value *Op1) const {
   assert(Val->isVectorTy() && "This must be a vector type");

@@ -702,7 +702,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   // Computing on 1 bit values requires extra mask or compare operations.
   unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
   // Computing on non const index requires extra mask or compare operations.
-  unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
+  unsigned MaskCostForIdx =
+      TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;
   if (ST->hasP9Altivec()) {
     // P10 has vxform insert which can handle non const index. The
     // MaskCostForIdx is for masking the index.
@@ -711,13 +712,13 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     if (ISD == ISD::INSERT_VECTOR_ELT) {
       if (ST->hasP10Vector())
         return CostFactor + MaskCostForIdx;
-      if (Index != -1U)
+      if (TargetTransformInfo::isKnownVectorIndex(Index))
         return 2 * CostFactor;
     } else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
       // It's an extract. Maybe we can do a cheap move-from VSR.
       unsigned EltSize = Val->getScalarSizeInBits();
       // P9 has both mfvsrd and mfvsrld for 64 bit integer.
-      if (EltSize == 64 && Index != -1U)
+      if (EltSize == 64 && TargetTransformInfo::isKnownVectorIndex(Index))
         return 1;
       if (EltSize == 32) {
         unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
@@ -734,7 +735,8 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       // (invariant, easily schedulable).
       return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
     }
-  } else if (ST->hasDirectMove() && Index != -1U) {
+  } else if (ST->hasDirectMove() &&
+             TargetTransformInfo::isKnownVectorIndex(Index)) {
     // Assume permute has standard cost.
     // Assume move-to/move-from VSR have 2x standard cost.
     if (ISD == ISD::INSERT_VECTOR_ELT)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 361b2ff223ea0..42c6ffa746fcb 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,8 +129,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;
   InstructionCost getMemoryOpCost(
       unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bee47527cf428..2f7647e2501f8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2211,8 +2211,7 @@ InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,

 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                  TTI::TargetCostKind CostKind,
-                                                 unsigned Index,
-                                                 const Value *Op0,
+                                                 int Index, const Value *Op0,
                                                  const Value *Op1) const {
   assert(Val->isVectorTy() && "This must be a vector type");

@@ -2227,7 +2226,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   if (!LT.second.isVector()) {
     auto *FixedVecTy = cast<FixedVectorType>(Val);
     // If Index is a known constant, cost is zero.
-    if (Index != -1U)
+    if (TargetTransformInfo::isKnownVectorIndex(Index))
       return 0;
     // Extract/InsertElement with non-constant index is very costly when
     // scalarized; estimate cost of loads/stores sequence via the stack:
@@ -2280,7 +2279,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   // When insertelement we should add the index with 1 as the input of vslideup.
   unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

-  if (Index != -1U) {
+  if (TargetTransformInfo::isKnownVectorIndex(Index)) {
     // The type may be split. For fixed-width vectors we can normalize the
     // index to the new type.
     if (LT.second.isFixedLengthVector()) {
@@ -2309,14 +2308,15 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   // When the vector needs to split into multiple register groups and the index
   // exceeds single vector register group, we need to insert/extract the element
   // via stack.
-  if (LT.first > 1 &&
-      ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
-                          LT.second.isScalableVector()))) {
+  if (LT.first > 1 && (!TargetTransformInfo::isKnownVectorIndex(Index) ||
+                       (Index >= LT.second.getVectorMinNumElements() &&
+                        LT.second.isScalableVector()))) {
     Type *ScalarType = Val->getScalarType();
     Align VecAlign = DL.getPrefTypeAlign(Val);
     Align SclAlign = DL.getPrefTypeAlign(ScalarType);
     // Extra addi for unknown index.
-    InstructionCost IdxCost = Index == -1U ? 1 : 0;
+    InstructionCost IdxCost =
+        TargetTransformInfo::isKnownVectorIndex(Index) ? 0 : 1;

     // Store all split vectors into stack and load the target element.
     if (Opcode == Instruction::ExtractElement)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..131fe30325216 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -244,8 +244,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {

   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;

   InstructionCost getArithmeticInstrCost(
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 68ba7498d586b..6f9d720896c25 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1194,8 +1194,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(

 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    TTI::TargetCostKind CostKind,
-                                                   unsigned Index,
-                                                   const Value *Op0,
+                                                   int Index, const Value *Op0,
                                                    const Value *Op1) const {
   if (Opcode == Instruction::InsertElement) {
     // Vector Element Load.
@@ -1205,8 +1204,11 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     // vlvgp will insert two grs into a vector register, so count half the
     // number of instructions as an estimate when we don't have the full
     // picture (as in getScalarizationOverhead()).
-    if (Val->isIntOrIntVectorTy(64))
+    if (Val->isIntOrIntVectorTy(64)) {
+      if (!TargetTransformInfo::isKnownVectorIndex(Index))
+        return 0;
       return ((Index % 2 == 0) ? 1 : 0);
+    }
   }

   if (Opcode == Instruction::ExtractElement) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b4bc41974b70b..3e1462338deee 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -122,8 +122,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;
   bool isFoldableLoad(const LoadInst *Ld,
                       const Instruction *&FoldedValue) const;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 978e08bb89551..61ffc47e52cfa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -183,13 +183,13 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
 }

 InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
-    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, int Index,
     const Value *Op0, const Value *Op1) const {
   InstructionCost Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getVectorInstrCost(
       Opcode, Val, CostKind, Index, Op0, Op1);

   // SIMD128's insert/extract currently only take constant indices.
-  if (Index == -1u)
+  if (!TargetTransformInfo::isKnownVectorIndex(Index))
     return Cost + 25 * TargetTransformInfo::TCC_Expensive;

   return Cost;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 6b6d060076a80..38d97699f288f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -80,8 +80,8 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;
   InstructionCost getPartialReductionCost(
       unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb23..fc7aab6b41b34 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4767,7 +4767,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,

 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                TTI::TargetCostKind CostKind,
-                                               unsigned Index, const Value *Op0,
+                                               int Index, const Value *Op0,
                                                const Value *Op1) const {
   static const CostTblEntry SLMCostTbl[] = {
     { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
@@ -4782,8 +4782,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,

   // Non-immediate extraction/insertion can be handled as a sequence of
   // aliased loads+stores via the stack.
-  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
-                       Opcode == Instruction::InsertElement)) {
+  if (!TargetTransformInfo::isKnownVectorIndex(Index) &&
+      (Opcode == Instruction::ExtractElement ||
+       Opcode == Instruction::InsertElement)) {
     // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
@@ -4807,8 +4808,9 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   }

-  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
-                       Opcode == Instruction::InsertElement)) {
+  if (TargetTransformInfo::isKnownVectorIndex(Index) &&
+      (Opcode == Instruction::ExtractElement ||
+       Opcode == Instruction::InsertElement)) {
     // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
     if (Opcode == Instruction::ExtractElement &&
         ScalarType->getScalarSizeInBits() == 1 &&
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80f..58fe7292a1f3d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -165,8 +165,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
                                  const Instruction *I = nullptr) const override;
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, const Value *Op0,
+                                     TTI::TargetCostKind CostKind, int Index,
+                                     const Value *Op0,
                                      const Value *Op1) const override;
   InstructionCost getScalarizationOverhead(
       VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa313243a57da..09369fbdce390 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5342,17 +5342,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);

   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
-  // TODO: We have existing tests that request the cost of extracting element
-  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
-  // the actual generated code, which involves extracting the last element of
-  // a scalable vector where the lane to extract is unknown at compile time.
-  return TTI.getAddressComputationCost(ValTy) +
-         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
-                             CostKind) +
-         (IsLoopInvariantStoreValue
-              ? 0
-              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       CostKind, VF.getKnownMinValue() - 1));
+  InstructionCost Cost =
+      TTI.getAddressComputationCost(ValTy) +
+      TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) +
+      (IsLoopInvariantStoreValue
+           ? 0
+           : TTI.getVectorInstrCost(
+                 Instruction::ExtractElement, VectorTy, CostKind,
+                 VF.isScalable() ? TargetTransformInfo::LastIndex
+                                 : VF.getKnownMinValue() - 1));
+  return Cost;
 }

 InstructionCost
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccce0e07e4d0a..4707a54744174 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -791,6 +791,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }

   switch (getOpcode()) {
+  case VPInstruction::ExtractLastElement: {
+    // Add on the cost of extracting the element.
+    auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+    return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                      Ctx.CostKind,
+                                      TargetTransformInfo::LastIndex);
+  }
   case Instruction::ExtractElement: {
     // Add on the cost of extracting the element.
     auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 2c0fb797d1d10..bcac0d434ecee 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -917,32 +917,23 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFNONE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
 ; TFNONE-NEXT:  [[ENTRY:.*]]:
 ; TFNONE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; TFNONE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 2
-; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; TFNONE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; TFNONE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; TFNONE:       [[VECTOR_PH]]:
-; TFNONE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; TFNONE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
 ; TFNONE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; TFNONE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; TFNONE-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; TFNONE:       [[VECTOR_BODY]]:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP7:%.*]] = load double, ptr [[P2]], align 8
-; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP7]], i64 0
-; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
-; TFNONE-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
-; TFNONE-NEXT:    [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
-; TFNONE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; TFNONE-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; TFNONE-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 1
-; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i32 [[TMP13]]
+; TFNONE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; TFNONE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
+; TFNONE-NEXT:    [[TMP2:%.*]] = call <2 x double> @exp_fixed(<2 x double> [[BROADCAST_SPLAT]])
+; TFNONE-NEXT:    [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer
+; TFNONE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00)
+; TFNONE-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1
 ; TFNONE-NEXT:    store double [[TMP14]], ptr [[P]], align 8
-; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; TFNONE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TFNONE:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
index e7fdfbcf76caa..2816d94d96c3a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll
@@ -8,7 +8,10 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
 ; CHECK-LABEL: define void @vf_will_not_generate_any_vector_insts(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP0]])
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 100, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
 ; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
@@ -17,23 +20,27 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT2]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 100, [[TMP6]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[DST]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[BROADCAST_SPLAT5]], <2 x ptr> [[BROADCAST_SPLAT3]], i32 4, <2 x i1> splat (i1 true)), !alias.scope [[META3]], !noalias [[META0]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[SRC]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 1 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 1 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> [[BROADCAST_SPLAT3]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 1 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 100, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ]
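
Usage note (reviewer commentary, not part of the patch): the sketch below shows how a cost-model client is expected to use the new index sentinels. Only `getVectorInstrCost`, `UnknownIndex`, `LastIndex`, and `isKnownVectorIndex` come from this change; the function name and the `TTI`/`VecTy`/`Kind` parameters are hypothetical placeholders.

```cpp
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Illustrative sketch, assuming the caller supplies a TargetTransformInfo,
// a vector type, and a cost kind.
InstructionCost exampleExtractCosts(const TargetTransformInfo &TTI,
                                    VectorType *VecTy,
                                    TargetTransformInfo::TargetCostKind Kind) {
  // A lane known at compile time is still passed as a non-negative index;
  // isKnownVectorIndex(Index) is true exactly for this case.
  InstructionCost Known = TTI.getVectorInstrCost(Instruction::ExtractElement,
                                                 VecTy, Kind, /*Index=*/0);

  // Lane unknown at compile time: UnknownIndex replaces the old -1/~0u idiom.
  InstructionCost Unknown =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Kind,
                             TargetTransformInfo::UnknownIndex);

  // Last lane of a possibly scalable vector, where NumElements - 1 is not a
  // compile-time constant; AArch64 models this as a while+lastb sequence.
  InstructionCost Last =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Kind,
                             TargetTransformInfo::LastIndex);

  return Known + Unknown + Last;
}
```

Making `Index` a signed `int` keeps every existing non-negative constant lane unchanged while freeing the negative range for sentinels, and `isKnownVectorIndex` centralizes the scattered `Index != -1U` / `Index == ~0u` comparisons the old unsigned convention required.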