diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index eabe70027d5a1..a337ab7410f73 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12197,7 +12197,11 @@ bool BoUpSLP::isTreeNotExtendable() const {
     TreeEntry &E = *VectorizableTree[Idx];
     if (!E.isGather())
       continue;
-    if (E.hasState() && E.getOpcode() != Instruction::Load)
+    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
+        (!E.hasState() &&
+         all_of(E.Scalars, IsaPred<ExtractElementInst, LoadInst>)) ||
+        (isa<ExtractElementInst>(E.Scalars.front()) &&
+         getSameOpcode(ArrayRef(E.Scalars).drop_front(), *TLI).valid()))
       return false;
     if (isSplat(E.Scalars) || allConstant(E.Scalars))
       continue;
@@ -19405,6 +19409,9 @@ class HorizontalReduction {
   /// Checks if the optimization of original scalar identity operations on
   /// matched horizontal reductions is enabled and allowed.
   bool IsSupportedHorRdxIdentityOp = false;
+  /// Contains vector values for reduction including their scale factor and
+  /// signedness.
+  SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
 
   static bool isCmpSelMinMax(Instruction *I) {
     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
@@ -19455,19 +19462,23 @@ class HorizontalReduction {
   /// Creates reduction operation with the current opcode.
   static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                          Value *RHS, const Twine &Name, bool UseSelect) {
+    Type *OpTy = LHS->getType();
+    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
     switch (Kind) {
     case RecurKind::Or: {
-      if (UseSelect &&
-          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
-        return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
+      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+        return Builder.CreateSelect(
+            LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
+            RHS, Name);
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     }
     case RecurKind::And: {
-      if (UseSelect &&
-          LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
-        return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
+      if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
+        return Builder.CreateSelect(
+            LHS, RHS,
+            ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
@@ -20342,12 +20353,11 @@ class HorizontalReduction {
                                        SameValuesCounter, TrackedToOrig);
       }
 
-      Value *ReducedSubTree;
       Type *ScalarTy = VL.front()->getType();
       if (isa<FixedVectorType>(ScalarTy)) {
         assert(SLPReVec && "FixedVectorType is not expected.");
         unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-        ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+        Value *ReducedSubTree = PoisonValue::get(getWidenedType(
             VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
         for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
           // Do reduction for each lane.
@@ -20365,30 +20375,33 @@ class HorizontalReduction {
 
           SmallVector<int> Mask =
               createStrideMask(I, ScalarTyNumElements, VL.size());
           Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-          ReducedSubTree = Builder.CreateInsertElement(
-              ReducedSubTree,
-              emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+          Value *Val =
+              createSingleOp(Builder, *TTI, Lane,
+                             OptReusedScalars && SameScaleFactor
+                                 ? SameValuesCounter.front().second
+                                 : 1,
+                             Lane->getType()->getScalarType() !=
+                                     VL.front()->getType()->getScalarType()
+                                 ? V.isSignedMinBitwidthRootNode()
+                                 : true,
+                             RdxRootInst->getType());
+          ReducedSubTree =
+              Builder.CreateInsertElement(ReducedSubTree, Val, I);
         }
+        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
       } else {
-        ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
-                                       RdxRootInst->getType());
+        Type *VecTy = VectorizedRoot->getType();
+        Type *RedScalarTy = VecTy->getScalarType();
+        VectorValuesAndScales.emplace_back(
+            VectorizedRoot,
+            OptReusedScalars && SameScaleFactor
+                ? SameValuesCounter.front().second
+                : 1,
+            RedScalarTy != ScalarTy->getScalarType()
+                ? V.isSignedMinBitwidthRootNode()
+                : true);
       }
-      if (ReducedSubTree->getType() != VL.front()->getType()) {
-        assert(ReducedSubTree->getType() != VL.front()->getType() &&
-               "Expected different reduction type.");
-        ReducedSubTree =
-            Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
-                                  V.isSignedMinBitwidthRootNode());
-      }
-
-      // Improved analysis for add/fadd/xor reductions with same scale factor
-      // for all operands of reductions. We can emit scalar ops for them
-      // instead.
-      if (OptReusedScalars && SameScaleFactor)
-        ReducedSubTree = emitScaleForReusedOps(
-            ReducedSubTree, Builder, SameValuesCounter.front().second);
-      VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
       // Count vectorized reduced values to exclude them from final reduction.
       for (Value *RdxVal : VL) {
         Value *OrigV = TrackedToOrig.at(RdxVal);
@@ -20417,6 +20430,10 @@ class HorizontalReduction {
         continue;
       }
     }
+    if (!VectorValuesAndScales.empty())
+      VectorizedTree = GetNewVectorizedTree(
+          VectorizedTree,
+          emitReduction(Builder, *TTI, ReductionRoot->getType()));
     if (VectorizedTree) {
       // Reorder operands of bool logical op in the natural order to avoid
       // possible problem with poison propagation. If not possible to reorder
@@ -20551,6 +20568,22 @@ class HorizontalReduction {
   }
 
 private:
+  /// Creates the reduction from the given \p Vec vector value with the given
+  /// scale \p Scale and signedness \p IsSigned.
+  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+                        Value *Vec, unsigned Scale, bool IsSigned,
+                        Type *DestTy) {
+    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    if (Rdx->getType() != DestTy->getScalarType())
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
+    // Improved analysis for add/fadd/xor reductions with same scale
+    // factor for all operands of reductions. We can emit scalar ops for
+    // them instead.
+    if (Scale > 1)
+      Rdx = emitScaleForReusedOps(Rdx, Builder, Scale);
+    return Rdx;
+  }
+
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
@@ -20593,6 +20626,12 @@ class HorizontalReduction {
       }
       return Cost;
     };
+    // Require reduction cost if:
+    // 1. This type is not a full register type and no other vectors with the
+    //    same type in the storage (first vector with small type).
+    // 2. The storage does not have any vector with full vector use (first
+    //    vector with full register use).
+    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -20616,7 +20655,7 @@ class HorizontalReduction {
           VectorCost += TTI->getScalarizationOverhead(
               VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
               /*Extract*/ false, TTI::TCK_RecipThroughput);
-        } else {
+        } else if (DoesRequireReductionOp) {
           Type *RedTy = VectorTy->getElementType();
           auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
               std::make_pair(RedTy, true));
@@ -20628,6 +20667,20 @@ class HorizontalReduction {
                 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
                 FMF, CostKind);
           }
+        } else {
+          Type *RedTy = VectorTy->getElementType();
+          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+              std::make_pair(RedTy, true));
+          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
+          VectorCost +=
+              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+          if (RType != RedTy) {
+            unsigned Opcode = Instruction::Trunc;
+            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
+              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+            VectorCost += TTI->getCastInstrCost(
+                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
+          }
         }
       }
       ScalarCost = EvaluateScalarCost([&]() {
@@ -20644,8 +20697,27 @@ class HorizontalReduction {
     case RecurKind::UMax:
     case RecurKind::UMin: {
       Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-      if (!AllConsts)
-        VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+      if (!AllConsts) {
+        if (DoesRequireReductionOp) {
+          VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
+        } else {
+          // Check if the previous reduction already exists and account it as
+          // series of operations + single reduction.
+          Type *RedTy = VectorTy->getElementType();
+          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+              std::make_pair(RedTy, true));
+          VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
+          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
+          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
+          if (RType != RedTy) {
+            unsigned Opcode = Instruction::Trunc;
+            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
+              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+            VectorCost += TTI->getCastInstrCost(
+                Opcode, VectorTy, RVecTy, TTI::CastContextHint::None, CostKind);
+          }
+        }
+      }
       ScalarCost = EvaluateScalarCost([&]() {
         IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
         return TTI->getIntrinsicInstrCost(ICA, CostKind);
@@ -20662,6 +20734,160 @@ class HorizontalReduction {
     return VectorCost - ScalarCost;
   }
 
+  /// Splits the values, stored in VectorValuesAndScales, into registers/free
+  /// sub-registers, combines them with the given reduction operation as a
+  /// vector operation and then performs single (small enough) reduction.
+  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
+                       Type *DestTy) {
+    Value *ReducedSubTree = nullptr;
+    // Creates reduction and combines with the previous reduction.
+    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
+      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
+      if (ReducedSubTree)
+        ReducedSubTree = createOp(Builder, RdxKind, ReducedSubTree, Rdx,
+                                  "op.rdx", ReductionOps);
+      else
+        ReducedSubTree = Rdx;
+    };
+    if (VectorValuesAndScales.size() == 1) {
+      const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
+      CreateSingleOp(Vec, Scale, IsSigned);
+      return ReducedSubTree;
+    }
+    // Scales Vec using given Cnt scale factor and then performs vector combine
+    // with previous value of VecOp.
+    Value *VecRes = nullptr;
+    bool VecResSignedness = false;
+    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
+      Type *ScalarTy = Vec->getType()->getScalarType();
+      // Scale Vec using given Cnt scale factor.
+      if (Cnt > 1) {
+        ElementCount EC = cast<VectorType>(Vec->getType())->getElementCount();
+        switch (RdxKind) {
+        case RecurKind::Add: {
+          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+            unsigned VF = getNumElements(Vec->getType());
+            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
+                              << ". (HorRdx)\n");
+            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
+            for (unsigned I : seq<unsigned>(Cnt))
+              std::iota(std::next(Mask.begin(), VF * I),
+                        std::next(Mask.begin(), VF * (I + 1)), 0);
+            ++NumVectorInstructions;
+            Vec = Builder.CreateShuffleVector(Vec, Mask);
+            break;
+          }
+          // res = mul vv, n
+          if (ScalarTy != DestTy->getScalarType())
+            Vec = Builder.CreateIntCast(
+                Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
+                IsSigned);
+          Value *Scale = ConstantVector::getSplat(
+              EC, ConstantInt::get(DestTy->getScalarType(), Cnt));
+          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
+                            << ". (HorRdx)\n");
+          ++NumVectorInstructions;
+          Vec = Builder.CreateMul(Vec, Scale);
+          break;
+        }
+        case RecurKind::Xor: {
+          // res = n % 2 ? 0 : vv
+          LLVM_DEBUG(dbgs()
+                     << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
+          if (Cnt % 2 == 0)
+            Vec = Constant::getNullValue(Vec->getType());
+          break;
+        }
+        case RecurKind::FAdd: {
+          // res = fmul v, n
+          Value *Scale =
+              ConstantVector::getSplat(EC, ConstantFP::get(ScalarTy, Cnt));
+          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
+                            << ". (HorRdx)\n");
+          ++NumVectorInstructions;
+          Vec = Builder.CreateFMul(Vec, Scale);
+          break;
+        }
+        case RecurKind::And:
+        case RecurKind::Or:
+        case RecurKind::SMax:
+        case RecurKind::SMin:
+        case RecurKind::UMax:
+        case RecurKind::UMin:
+        case RecurKind::FMax:
+        case RecurKind::FMin:
+        case RecurKind::FMaximum:
+        case RecurKind::FMinimum:
+          // res = vv
+          break;
+        case RecurKind::Mul:
+        case RecurKind::FMul:
+        case RecurKind::FMulAdd:
+        case RecurKind::IAnyOf:
+        case RecurKind::FAnyOf:
+        case RecurKind::IFindLastIV:
+        case RecurKind::FFindLastIV:
+        case RecurKind::None:
+          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
+        }
+      }
+      // Combine Vec with the previous VecOp.
+      if (!VecRes) {
+        VecRes = Vec;
+        VecResSignedness = IsSigned;
+      } else {
+        ++NumVectorInstructions;
+        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
+          // Handle ctpop.
+          unsigned VecResVF = getNumElements(VecRes->getType());
+          unsigned VecVF = getNumElements(Vec->getType());
+          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
+          std::iota(Mask.begin(), Mask.end(), 0);
+          // Ensure that VecRes is always larger than Vec
+          if (VecResVF < VecVF) {
+            std::swap(VecRes, Vec);
+            std::swap(VecResVF, VecVF);
+          }
+          if (VecResVF != VecVF) {
+            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
+            std::iota(Mask.begin(), std::next(Mask.begin(), VecVF), 0);
+            Vec = Builder.CreateShuffleVector(Vec, ResizeMask);
+          }
+          VecRes = Builder.CreateShuffleVector(VecRes, Vec, Mask, "rdx.op");
+          return;
+        }
+        if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
+          VecRes = Builder.CreateIntCast(
+              VecRes, getWidenedType(DestTy, getNumElements(VecRes->getType())),
+              VecResSignedness);
+        if (ScalarTy != DestTy->getScalarType())
+          Vec = Builder.CreateIntCast(
+              Vec, getWidenedType(DestTy, getNumElements(Vec->getType())),
+              IsSigned);
+        unsigned VecResVF = getNumElements(VecRes->getType());
+        unsigned VecVF = getNumElements(Vec->getType());
+        // Ensure that VecRes is always larger than Vec
+        if (VecResVF < VecVF) {
+          std::swap(VecRes, Vec);
+          std::swap(VecResVF, VecVF);
+        }
+        // extract + op + insert
+        Value *Op = VecRes;
+        if (VecResVF != VecVF)
+          Op = createExtractVector(Builder, VecRes, VecVF, /*Index=*/0);
+        Op = createOp(Builder, RdxKind, Op, Vec, "rdx.op", ReductionOps);
+        if (VecResVF != VecVF)
+          Op = createInsertVector(Builder, VecRes, Op, /*Index=*/0);
+        VecRes = Op;
+      }
+    };
+    for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
+      CreateVecOp(Vec, Scale, IsSigned);
+    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
+
+    return ReducedSubTree;
+  }
+
   /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                        const TargetTransformInfo *TTI, Type *DestTy) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
index ffb8f44363249..a1d3f250b8a83 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
@@ -19,9 +19,8 @@ define void @foo(ptr %0) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <4 x i1> [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_OP]])
 ; CHECK-NEXT:    br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 00a4417ba7aff..677d52bf3b4c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -81,10 +81,9 @@ define half @reduce_fast_half8(<8 x half> %vec8) {
 ; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
 ; NOFP16-NEXT:  [[ENTRY:.*:]]
 ; NOFP16-NEXT:    [[TMP0:%.*]] = shufflevector <8 x 
half> [[VEC8]], <8 x half> poison, <4 x i32> -; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) ; NOFP16-NEXT: [[TMP2:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> -; NOFP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]]) -; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[TMP1]], [[TMP3]] +; NOFP16-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x half> [[TMP0]], [[TMP2]] +; NOFP16-NEXT: [[OP_RDX3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[RDX_OP]]) ; NOFP16-NEXT: ret half [[OP_RDX3]] ; ; FULLFP16-LABEL: define half @reduce_fast_half8( diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll index b5bfdf284ca62..b2246e4f9c6c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -57,10 +57,9 @@ define half @reduction_half16(<16 x half> %vec16) { ; VI-LABEL: @reduction_half16( ; VI-NEXT: entry: ; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> -; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[TMP0]]) ; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> -; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[TMP2]]) -; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]] +; VI-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x half> [[TMP0]], [[TMP2]] +; VI-NEXT: [[OP_RDX:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[RDX_OP]]) ; VI-NEXT: ret half [[OP_RDX]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll index 0a4a9c74e4c0d..78b5acad0df9a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/horizontal-list.ll @@ -18,7 +18,7 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-14' +; YAML-NEXT: - Cost: '-15' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '1' ; YAML-NEXT: ... @@ -28,7 +28,7 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; YAML-NEXT: - Cost: '-4' +; YAML-NEXT: - Cost: '-6' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '1' ; YAML-NEXT:... 
@@ -45,11 +45,13 @@ define float @test(ptr %x) { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = call fast <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> [[TMP0]], i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <8 x float> [[TMP5]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[RDX_OP4:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v16f32(<16 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[RDX_OP5:%.*]] = fadd fast <4 x float> [[RDX_OP4]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> [[RDX_OP5]], i64 0) +; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP8]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] ; CHECK-NEXT: ret float [[OP_RDX3]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 85131758853b3..5b0f4a69de4c3 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -341,13 +341,12 @@ define void @reduce_or_2() { ; ZVFHMIN-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer ; ZVFHMIN-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 ; ZVFHMIN-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer -; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) -; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]] +; ZVFHMIN-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] +; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) ; ZVFHMIN-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] -; ZVFHMIN: 8: +; ZVFHMIN: 7: ; ZVFHMIN-NEXT: ret void -; ZVFHMIN: 9: +; ZVFHMIN: 8: ; ZVFHMIN-NEXT: ret void ; ; ZVL128-LABEL: @reduce_or_2( @@ -356,13 +355,12 @@ define void @reduce_or_2() { ; ZVL128-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer ; ZVL128-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 ; ZVL128-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer -; ZVL128-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; ZVL128-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) -; ZVL128-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]] +; ZVL128-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] +; ZVL128-NEXT: [[OP_RDX:%.*]] = call i1 
@llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) ; ZVL128-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] -; ZVL128: 8: +; ZVL128: 7: ; ZVL128-NEXT: ret void -; ZVL128: 9: +; ZVL128: 8: ; ZVL128-NEXT: ret void ; ; ZVL256-LABEL: @reduce_or_2( @@ -371,13 +369,12 @@ define void @reduce_or_2() { ; ZVL256-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer ; ZVL256-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 ; ZVL256-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer -; ZVL256-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; ZVL256-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) -; ZVL256-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]] +; ZVL256-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] +; ZVL256-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) ; ZVL256-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] -; ZVL256: 8: +; ZVL256: 7: ; ZVL256-NEXT: ret void -; ZVL256: 9: +; ZVL256: 8: ; ZVL256-NEXT: ret void ; ; ZVL512-LABEL: @reduce_or_2( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll index cb44d05423007..f2992cf044cd5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX512 ; // PR42652 ; unsigned long bitmask_16xi8(const char *src) { @@ -15,39 +15,110 @@ ; } define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { -; CHECK-LABEL: @bitmask_16xi8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; CHECK-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> 
-; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; CHECK-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; CHECK-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; CHECK-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; CHECK-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]]) -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_13]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]] -; CHECK-NEXT: ret i64 [[OP_RDX4]] +; SSE-LABEL: @bitmask_16xi8( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; SSE-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; SSE-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] +; SSE-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) +; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]] +; SSE-NEXT: 
[[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]] +; SSE-NEXT: [[OP_RDX6:%.*]] = or i64 [[OP_RDX]], [[OP_RDX5]] +; SSE-NEXT: [[OP_RDX7:%.*]] = or i64 [[OP_RDX6]], [[OR]] +; SSE-NEXT: ret i64 [[OP_RDX7]] +; +; AVX-LABEL: @bitmask_16xi8( +; AVX-NEXT: entry: +; AVX-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; AVX-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; AVX-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; AVX-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] +; AVX-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) +; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] +; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] +; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]] +; AVX-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]] +; AVX-NEXT: ret i64 [[OP_RDX4]] +; +; AVX512-LABEL: @bitmask_16xi8( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 +; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 +; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> +; AVX512-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 +; AVX512-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 +; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer +; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 +; AVX512-NEXT: 
[[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 +; AVX512-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 +; AVX512-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 +; AVX512-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 +; AVX512-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 +; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 +; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 +; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 +; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 +; AVX512-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP3]], i64 0) +; AVX512-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] +; AVX512-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP3]], <4 x i64> [[RDX_OP]], i64 0) +; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] +; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] +; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]] +; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]] +; AVX512-NEXT: ret i64 [[OP_RDX4]] ; entry: %0 = load i8, ptr %src, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll index fb25ff975adc2..9fecef28d3f6a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll @@ -14,9 +14,8 @@ define void @test() { ; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i64> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[LOOP]] ] ; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i64> [[TMP6]], splat (i64 4) ; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]]) -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX16:%.*]] = add i64 [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RDX_OP:%.*]] = add <8 x i64> [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[OP_RDX16:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[RDX_OP]]) ; CHECK-NEXT: [[OP_RDX25]] = add i64 [[OP_RDX16]], [[TMP3]] ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index c976525b6720e..1c62e57edfc46 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -18,7 +18,7 @@ define i64 @foo(i32 %tmp7) { ; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP10]] ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll index 71390b643f43d..d884a1af8aab7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -19,9 +19,10 @@ define i32 @test() { ; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; CHECK-NEXT: [[INC_3_3_I_1:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP15]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> [[TMP8]], i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = or <8 x i32> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP8]], <8 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP17]]) ; CHECK-NEXT: ret i32 [[OP_RDX]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 2bcabfad3d09b..2484a2d2193fc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -16,9 +16,9 @@ define float @baz() { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], 2.000000e+00 ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; CHECK-NEXT: store float [[OP_RDX]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX]] @@ -32,8 +32,8 @@ define float @baz() { ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 1 ; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], splat (float 2.000000e+00) ; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 @@ -605,9 +605,10 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float 
@llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] +; CHECK-NEXT: [[RDX_OP2:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v24f32(<24 x float> [[TMP0]], i64 0) +; CHECK-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call fast <24 x float> @llvm.vector.insert.v24f32.v4f32(<24 x float> [[TMP0]], <4 x float> [[RDX_OP3]], i64 0) +; CHECK-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] ; CHECK-NEXT: ret float [[OP_RDX3]] @@ -622,9 +623,10 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] +; THRESHOLD-NEXT: [[RDX_OP2:%.*]] = call fast <4 x float> @llvm.vector.extract.v4f32.v24f32(<24 x float> [[TMP0]], i64 0) +; THRESHOLD-NEXT: [[RDX_OP3:%.*]] = fadd fast <4 x float> [[RDX_OP2]], [[TMP2]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast <24 x float> @llvm.vector.insert.v24f32.v4f32(<24 x float> [[TMP0]], <4 x float> [[RDX_OP3]], i64 0) +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP5]]) ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] ; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] ; THRESHOLD-NEXT: ret float [[OP_RDX3]] @@ -728,9 +730,9 @@ define float @extra_args(ptr nocapture readonly %x, i32 %a, i32 %b) { ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 ; CHECK-NEXT: ret float [[OP_RDX1]] ; @@ -739,9 +741,9 @@ define float @extra_args(ptr nocapture readonly %x, i32 %a, i32 %b) { ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd 
fast float [[TMP1]], [[TMP2]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -782,10 +784,10 @@ define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float 1.300000e+01, [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP3]] ; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( @@ -793,10 +795,10 @@ define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01 ; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float 1.300000e+01, [[TMP2]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP3]] ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: @@ -839,9 +841,9 @@ define float @extra_args_no_replace(ptr nocapture readonly %x, i32 %a, i32 %b, i ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]] ; CHECK-NEXT: ret float [[OP_RDX2]] @@ -852,9 +854,9 @@ define float @extra_args_no_replace(ptr nocapture readonly %x, i32 %a, i32 %b, i ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float 
@llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) ; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]] ; THRESHOLD-NEXT: ret float [[OP_RDX2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index 7626eea85f219..ca662b838938f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -984,22 +984,16 @@ define i32 @maxi8_wrong_parent(i32) { ; SSE4-NEXT: ret i32 [[OP_RDX7]] ; ; AVX-LABEL: @maxi8_wrong_parent( -; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; AVX-NEXT: br label [[PP:%.*]] ; AVX: pp: ; AVX-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) -; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP5]] -; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP5]] -; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP6]], [[TMP2]] -; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP6]], i32 [[TMP2]] -; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[OP_RDX1]], [[OP_RDX3]] -; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[OP_RDX1]], i32 [[OP_RDX3]] -; AVX-NEXT: [[OP_RDX6:%.*]] = icmp sgt i32 [[OP_RDX5]], [[TMP3]] -; AVX-NEXT: [[OP_RDX7:%.*]] = select i1 [[OP_RDX6]], i32 [[OP_RDX5]], i32 [[TMP3]] +; AVX-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; AVX-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 0) +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 2) +; AVX-NEXT: [[RDX_OP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[TMP6]] +; AVX-NEXT: [[RDX_OP1:%.*]] = select <4 x i1> [[RDX_OP]], <4 x i32> [[TMP4]], <4 x i32> [[TMP6]] +; AVX-NEXT: [[OP_RDX7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_OP1]]) ; AVX-NEXT: ret i32 [[OP_RDX7]] ; ; THRESH-LABEL: @maxi8_wrong_parent( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll index 598ff9a5178c1..ccb7e9b514cf1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -103,39 +103,15 @@ define i64 @test_3() #0 { ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 3, [[BB1]] ], [ 3, [[BB2:%.*]] ] -; CHECK-NEXT: [[VAL4:%.*]] = phi i32 [ 3, [[BB1]] ], [ 
3, [[BB2]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ splat (i32 3), [[BB1]] ], [ poison, [[BB2:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <28 x i32> +; CHECK-NEXT: [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <32 x i32> poison, i32 [[VAL4]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP2]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX8:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX9:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX10:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX11:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX12:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX13:%.*]] = mul i32 [[VAL4]], [[VAL4]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = mul i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = mul i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[OP_RDX16:%.*]] = mul i32 [[OP_RDX4]], [[OP_RDX5]] -; CHECK-NEXT: [[OP_RDX17:%.*]] = mul i32 [[OP_RDX6]], [[OP_RDX7]] -; CHECK-NEXT: [[OP_RDX18:%.*]] = mul i32 [[OP_RDX8]], [[OP_RDX9]] -; CHECK-NEXT: [[OP_RDX19:%.*]] = mul i32 [[OP_RDX10]], [[OP_RDX11]] -; CHECK-NEXT: [[OP_RDX20:%.*]] = mul i32 [[OP_RDX12]], [[OP_RDX13]] -; CHECK-NEXT: [[OP_RDX21:%.*]] = mul i32 [[OP_RDX14]], [[OP_RDX15]] -; CHECK-NEXT: [[OP_RDX22:%.*]] = mul i32 [[OP_RDX16]], [[OP_RDX17]] -; CHECK-NEXT: [[OP_RDX23:%.*]] = mul i32 [[OP_RDX18]], [[OP_RDX19]] -; CHECK-NEXT: [[OP_RDX24:%.*]] = mul i32 [[OP_RDX20]], [[VAL]] -; CHECK-NEXT: [[OP_RDX25:%.*]] = mul i32 [[OP_RDX21]], [[OP_RDX22]] -; CHECK-NEXT: [[OP_RDX26:%.*]] = mul i32 [[OP_RDX23]], [[OP_RDX24]] -; CHECK-NEXT: [[OP_RDX27:%.*]] = mul i32 [[OP_RDX25]], [[OP_RDX26]] +; CHECK-NEXT: [[TMP5:%.*]] = call <28 x i32> @llvm.vector.extract.v28i32.v32i32(<32 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = mul <28 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v28i32(<32 x i32> [[TMP1]], <28 x i32> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[OP_RDX27:%.*]] = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> [[TMP6]]) ; CHECK-NEXT: [[VAL64:%.*]] = add i32 3, [[OP_RDX27]] ; CHECK-NEXT: [[VAL65:%.*]] = sext i32 [[VAL64]] to i64 ; CHECK-NEXT: ret i64 [[VAL65]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll index e012cc60960b3..79c6c6b3f046f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll @@ -8,12 +8,12 @@ define i8 @test() { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8 -; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i8 
[[TMP4]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[OP_RDX]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX3]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = or i8 [[OP_RDX3]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer) +; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX5]], [[TMP4]] ; CHECK-NEXT: ret i8 [[OP_RDX4]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll index c8d34a804e04a..232e458504188 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll @@ -14,7 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> , <4 x i1> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP6]], 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 0, [[TMP6]] ; CHECK-NEXT: store i64 [[OP_RDX]], ptr null, align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll index d1617c9a382d1..b25bf07067830 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-extracted-and-externally-used.ll @@ -8,23 +8,23 @@ define void @test(i32 %arg) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ] -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB1]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP6:%.*]], %[[BB1]] ] ; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX4:%.*]], %[[BB1]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[ADD17:%.*]] = add i32 [[PHI]], 0 -; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[PHI]], 0 -; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[PHI2]], 0 -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[PHI]], 0 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[PHI2]], 0 +; CHECK-NEXT: [[ADD23:%.*]] = add i32 [[PHI]], 0 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[PHI2]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[TMP0]], ; CHECK-NEXT: [[TMP5]] = extractelement <2 x i32> [[TMP4]], i32 1 ; CHECK-NEXT: [[TMP6]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD17]] +; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP7]], [[ADD]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i32 [[ADD4]], [[ADD6]] -; CHECK-NEXT: 
[[OP_RDX2:%.*]] = xor i32 [[ADD19]], [[TMP6]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i32 [[ADD23]], [[TMP6]] ; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i32 [[OP_RDX]], [[OP_RDX1]] ; CHECK-NEXT: [[OP_RDX4]] = xor i32 [[OP_RDX3]], [[OP_RDX2]] ; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i32 [[TMP5]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll index 6b0b22b90510c..5baa5f3cdcdae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-vectorized-later.ll @@ -4,9 +4,10 @@ define i16 @test() { ; CHECK-LABEL: define i16 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> zeroinitializer) -; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i16 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> zeroinitializer, i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i16> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> zeroinitializer, <4 x i16> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[OP_RDX:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP1]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = or i16 [[OP_RDX]], 0 ; CHECK-NEXT: ret i16 [[OP_RDX1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll index a8ba9e059dc2e..1cf837df719ec 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll @@ -4,19 +4,15 @@ define i32 @foo() { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> zeroinitializer, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> zeroinitializer, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 0, [[TMP5]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0 -; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX1]], [[OP_RDX2]] -; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX3]], [[TMP2]] -; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX4]], [[OP_RDX5]] +; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]] ; CHECK-NEXT: ret i32 [[OP_RDX6]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll index dab37a7b05294..48b2174cac688 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll @@ -21,10 +21,10 @@ define void @test(i1 %arg, ptr %p) { ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr [[P]], i64 0, i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = load 
<4 x i32>, ptr [[I]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP1]], 0 +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 0, [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[I1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP3]], 0 +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 0, [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[OP_RDX3]], 2 ; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 0, [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[OP_RDX2]], 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll index a552a24eb7b26..583b992ff392a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll @@ -9,8 +9,8 @@ define void @_Z2azv(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[DOTSROA_CAST_4:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", ptr [[P:%.*]], i64 4, i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[DOTSROA_CAST_4]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], 0 -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 0, [[TMP2]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 false, i32 0, i32 [[OP_RDX1]] ; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], 0 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll index 43ce36337f4df..f8a6c4dab3d51 100644 --- a/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/partial-register-extract.ll @@ -23,10 +23,11 @@ define i32 @test(i32 %v, ptr %p) { ; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[I8_I_I]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[I9_I_I]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze <16 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP10]]) ; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP2]] -; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[TMP10]], i64 0) +; CHECK-NEXT: [[RDX_OP:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> splat (i1 true), <4 x i1> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = call <16 x i1> @llvm.vector.insert.v16i1.v4i1(<16 x i1> [[TMP10]], <4 x i1> [[RDX_OP]], i64 0) +; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP13]]) ; CHECK-NEXT: [[AND252_US_I_24_I_I:%.*]] = select i1 [[OP_RDX]], i32 0, i32 0 ; CHECK-NEXT: br label %[[INC]] ; CHECK: [[INC]]: diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll index be9318e467174..2da65114eae04 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll @@ -7,9 +7,8 @@ define i32 @test() { ; CHECK-NEXT: bb: ; CHECK-NEXT: 
[[TMP0:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[RDX_OP:%.*]] = add <4 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX_OP]]) ; CHECK-NEXT: ret i32 [[OP_RDX]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll index 91c0c7ab42e77..05a14a8968626 100644 --- a/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll @@ -13,7 +13,7 @@ define void @D134605() { ; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = mul i16 [[TMP2]], 2 -; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP3]], poison +; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 poison, [[TMP3]] ; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX]], 2 ; CHECK-NEXT: [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120) ; CHECK-NEXT: ret void
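
The updated CHECK lines above share a single pattern: where the previous output combined two separate @llvm.vector.reduce.* calls with an extra scalar op (see reduced-value-vectorized-later.ll, partial-register-extract.ll, and reduction-modified-values.ll), the new output folds the narrower reduction operand into the low lanes of the wider vector via @llvm.vector.extract / @llvm.vector.insert, applies the reduction operation element-wise, and keeps one @llvm.vector.reduce.* call. The IR below is a minimal hand-written sketch of that shape, not taken from any of the tests; the function names and arguments are illustrative only.

; Before: two reductions of different widths, joined by a scalar 'or'.
define i16 @sketch_before(<8 x i16> %a, <4 x i16> %b) {
  %r8 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  %r4 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %b)
  %or = or i16 %r8, %r4
  ret i16 %or
}

; After: 'or' the narrow vector into the low lanes of the wide one and
; emit a single reduction over the wide type. Because 'or' is associative
; and commutative, the result matches @sketch_before.
define i16 @sketch_after(<8 x i16> %a, <4 x i16> %b) {
  %lo = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %a, i64 0)
  %rdx.op = or <4 x i16> %lo, %b
  %wide = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> %a, <4 x i16> %rdx.op, i64 0)
  %r = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %wide)
  ret i16 %r
}

declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16>, i64 immarg)
declare <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16>, <4 x i16>, i64 immarg)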