diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index a35bc7402d1a8..7bc8c4deae1a3 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -366,16 +366,20 @@ class MemoryDepChecker { struct DepDistanceStrideAndSizeInfo { const SCEV *Dist; - uint64_t StrideA; - uint64_t StrideB; + uint64_t MaxStride; + std::optional<uint64_t> CommonStride; + bool ShouldRetryWithRuntimeCheck; uint64_t TypeByteSize; bool AIsWrite; bool BIsWrite; - DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA, - uint64_t StrideB, uint64_t TypeByteSize, - bool AIsWrite, bool BIsWrite) - : Dist(Dist), StrideA(StrideA), StrideB(StrideB), + DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t MaxStride, + std::optional<uint64_t> CommonStride, + bool ShouldRetryWithRuntimeCheck, + uint64_t TypeByteSize, bool AIsWrite, + bool BIsWrite) + : Dist(Dist), MaxStride(MaxStride), CommonStride(CommonStride), + ShouldRetryWithRuntimeCheck(ShouldRetryWithRuntimeCheck), TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {} }; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 71582d5d86549..6e3e1e8900c70 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1799,8 +1799,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { /// } static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, const SCEV &MaxBTC, const SCEV &Dist, - uint64_t MaxStride, - uint64_t TypeByteSize) { + uint64_t MaxStride) { // If we can prove that // (**) |Dist| > MaxBTC * Step @@ -1819,8 +1818,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, // will be executed only if LoopCount >= VF, proving distance >= LoopCount // also guarantees that distance >= VF. // - const uint64_t ByteStride = MaxStride * TypeByteSize; - const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride); + const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride); const SCEV *Product = SE.getMulExpr(&MaxBTC, Step); const SCEV *CastedDist = &Dist; @@ -1864,9 +1862,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, if (Distance % TypeByteSize) return false; - uint64_t ScaledDist = Distance / TypeByteSize; - - // No dependence if the scaled distance is not multiple of the stride. + // No dependence if the distance is not a multiple of the stride. // E.g. // for (i = 0; i < 1024 ; i += 4) // A[i+2] = A[i] + 1; @@ -1882,7 +1878,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, // Two accesses in memory (scaled distance is 4, stride is 3): // | A[0] | | | A[3] | | | A[6] | | | // | | | | | A[4] | | | A[7] | | - return ScaledDist % Stride; + return Distance % Stride; } std::variantisZero() && AStoreSz != BStoreSz) + return MemoryDepChecker::Dependence::Unknown; + + // We can't get a uint64_t for the AllocSize if either of the store sizes + // are scalable. + if (AStoreSz.isScalable() || BStoreSz.isScalable()) + return MemoryDepChecker::Dependence::Unknown; + + // The TypeByteSize is used to scale Distance and VF. In these contexts, the + // only size that matters is the size of the Sink.
+ uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy).value()), + TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value()); + + // We scale the strides by the alloc-type-sizes, so we can check that the + // common distance is equal when ASz != BSz. + int64_t StrideAScaled = *StrideAPtr * ASz; + int64_t StrideBScaled = *StrideBPtr * TypeByteSize; + // At least Src or Sink are loop invariant and the other is strided or // invariant. We can generate a runtime check to disambiguate the accesses. - if (!StrideAPtrInt || !StrideBPtrInt) + if (!StrideAScaled || !StrideBScaled) return MemoryDepChecker::Dependence::Unknown; // Both Src and Sink have a constant stride, check if they are in the same // direction. - if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) { + if ((StrideAScaled > 0) != (StrideBScaled > 0)) { LLVM_DEBUG( dbgs() << "Pointer access with strides in different directions\n"); return MemoryDepChecker::Dependence::Unknown; } - uint64_t TypeByteSize = DL.getTypeAllocSize(ATy); - bool HasSameSize = - DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy); - if (!HasSameSize) - TypeByteSize = 0; - return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt), - std::abs(StrideBPtrInt), TypeByteSize, + StrideAScaled = std::abs(StrideAScaled); + StrideBScaled = std::abs(StrideBScaled); + + // MaxStride is the max of the scaled strides, as expected. + uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled); + + // CommonStride is set if both scaled strides are equal. + std::optional<uint64_t> CommonStride; + if (StrideAScaled == StrideBScaled) + CommonStride = StrideAScaled; + + // TODO: Historically, we don't retry with runtime checks unless the unscaled + // strides are the same, but this doesn't make sense. Fix this once the + // condition for runtime checks in isDependent is fixed. + bool ShouldRetryWithRuntimeCheck = + std::abs(*StrideAPtr) == std::abs(*StrideBPtr); + + return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, + ShouldRetryWithRuntimeCheck, TypeByteSize, AIsWrite, BIsWrite); } @@ -2011,32 +2046,28 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (std::holds_alternative<Dependence::DepType>(Res)) return std::get<Dependence::DepType>(Res); - auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] = + auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck, + TypeByteSize, AIsWrite, BIsWrite] = std::get<DepDistanceStrideAndSizeInfo>(Res); - bool HasSameSize = TypeByteSize > 0; - std::optional<uint64_t> CommonStride = - StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt; if (isa<SCEVCouldNotCompute>(Dist)) { - // TODO: Relax requirement that there is a common stride to retry with - // non-constant distance dependencies. - FoundNonConstantDistanceDependence |= CommonStride.has_value(); + // TODO: Relax requirement that there is a common unscaled stride to retry + // with non-constant distance dependencies. + FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n"); return Dependence::Unknown; } ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getDataLayout(); - uint64_t MaxStride = std::max(StrideA, StrideB); // If the distance between the accesses is larger than their maximum absolute // stride multiplied by the symbolic maximum backedge taken count (which is an // upper bound of the number of iterations), the accesses are independent, i.e. // they are far enough apart that accesses won't access the same location // across all loop iterations.
- if (HasSameSize && isSafeDependenceDistance( - DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), - *Dist, MaxStride, TypeByteSize)) + if (isSafeDependenceDistance( + DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride)) return Dependence::NoDep; const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist); @@ -2047,7 +2078,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // If the distance between accesses and their strides are known constants, // check whether the accesses interlace each other. - if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize && + if (Distance > 0 && CommonStride && CommonStride > 1 && areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) { LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); return Dependence::NoDep; @@ -2061,15 +2092,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Negative distances are not plausible dependencies. if (SE.isKnownNonPositive(Dist)) { - if (SE.isKnownNonNegative(Dist)) { - if (HasSameSize) { - // Write to the same location with the same size. - return Dependence::Forward; - } - LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " - "different type sizes\n"); - return Dependence::Unknown; - } + if (SE.isKnownNonNegative(Dist)) + // Write to the same location. + return Dependence::Forward; bool IsTrueDataDependence = (AIsWrite && !BIsWrite); // Check if the first access writes to a location that is read in a later @@ -2084,13 +2109,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, if (!ConstDist) { // TODO: FoundNonConstantDistanceDependence is used as a necessary // condition to consider retrying with runtime checks. Historically, we - // did not set it when strides were different but there is no inherent - // reason to. - FoundNonConstantDistanceDependence |= CommonStride.has_value(); + // did not set it when unscaled strides were different but there is no + // inherent reason to. + FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; return Dependence::Unknown; } - if (!HasSameSize || - couldPreventStoreLoadForward( + if (couldPreventStoreLoadForward( ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) { LLVM_DEBUG( dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); @@ -2105,27 +2129,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue(); // Below we only handle strictly positive distances. if (MinDistance <= 0) { - FoundNonConstantDistanceDependence |= CommonStride.has_value(); + FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; return Dependence::Unknown; } - if (!ConstDist) { + if (!ConstDist) // Previously this case would be treated as Unknown, possibly setting // FoundNonConstantDistanceDependence to force re-trying with runtime // checks. Until the TODO below is addressed, set it here to preserve // original behavior w.r.t. re-trying with runtime checks. // TODO: FoundNonConstantDistanceDependence is used as a necessary // condition to consider retrying with runtime checks. Historically, we - // did not set it when strides were different but there is no inherent - // reason to.
- FoundNonConstantDistanceDependence |= CommonStride.has_value(); - } - - if (!HasSameSize) { - LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " - "different type sizes\n"); - return Dependence::Unknown; - } + // did not set it when unscaled strides were different but there is no + // inherent reason to. + FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck; if (!CommonStride) return Dependence::Unknown; @@ -2140,8 +2157,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // It's not vectorizable if the distance is smaller than the minimum distance // needed for a vectorized/unrolled version. Vectorizing one iteration in - // front needs TypeByteSize * Stride. Vectorizing the last iteration needs - // TypeByteSize (No need to plus the last gap distance). + // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize + // (No need to plus the last gap distance). // // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. // foo(int *A) { @@ -2168,8 +2185,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // We know that Dist is positive, but it may not be constant. Use the signed // minimum for computations below, as this ensures we compute the closest // possible dependence distance. - uint64_t MinDistanceNeeded = - TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize; + uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize; if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { if (!ConstDist) { // For non-constant distances, we checked the lower bound of the @@ -2225,7 +2241,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits // since there is a backwards dependency. - uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride); + uint64_t MaxVF = MinDepDistBytes / *CommonStride; LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance << " with max VF = " << MaxVF << '\n'); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll index 0bdcc35790148..cf1098f33fd65 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll @@ -129,16 +129,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) { ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence' ; CHECK-NEXT: loop: ; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> -; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 -; CHECK-EMPTY: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 -> -; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 -; CHECK-EMPTY: ; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: ; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 -> ; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll index adfd19923e921..7837c20f003e2 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll @@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) { ; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 -> ; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1 ; CHECK-EMPTY: -; CHECK-NEXT: Forward: -; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 -> -; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: diff --git a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll index 08e0bae7f05ba..ffd4619fd88dc 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll @@ -3,26 +3,13 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; TODO: No runtime checks should be needed, as the distance between accesses -; is large enough to need runtime checks. define void @test_distance_positive_independent_via_trip_count(ptr %A) { ; CHECK-LABEL: 'test_distance_positive_independent_via_trip_count' ; CHECK-NEXT: loop: -; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Memory dependences are safe ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: -; CHECK-NEXT: Check 0: -; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): -; CHECK-NEXT: %gep.A.400 = getelementptr inbounds i32, ptr %A.400, i64 %iv -; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): -; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv ; CHECK-NEXT: Grouped accesses: -; CHECK-NEXT: Group [[GRP1]]: -; CHECK-NEXT: (Low: (400 + %A) High: (804 + %A)) -; CHECK-NEXT: Member: {(400 + %A),+,4}<%loop> -; CHECK-NEXT: Group [[GRP2]]: -; CHECK-NEXT: (Low: %A High: (101 + %A)) -; CHECK-NEXT: Member: {%A,+,1}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: @@ -41,7 +28,7 @@ loop: %ext = zext i8 %l to i32 store i32 %ext, ptr %gep.A.400, align 4 %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv, 100 + %ec = icmp eq i64 %iv.next, 100 br i1 %ec, label %exit, label %loop exit: @@ -57,16 +44,16 @@ define void @test_distance_positive_backwards(ptr %A) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: -; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]): +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): ; CHECK-NEXT: %gep.A.400 = getelementptr inbounds i32, ptr %A.1, i64 %iv -; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]): +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): ; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv ; CHECK-NEXT: Grouped accesses: -; CHECK-NEXT: Group [[GRP3]]: -; CHECK-NEXT: (Low: (1 + %A) High: (405 + %A)) +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: (1 + %A) High: (401 + %A)) ; CHECK-NEXT: Member: {(1 + %A),+,4}<%loop> -; CHECK-NEXT: Group [[GRP4]]: -; CHECK-NEXT: (Low: %A High: (101 + %A)) +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %A High: (100 + %A)) ; CHECK-NEXT: Member: {%A,+,1}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. @@ -86,7 +73,7 @@ loop: %ext = zext i8 %l to i32 store i32 %ext, ptr %gep.A.400, align 4 %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv, 100 + %ec = icmp eq i64 %iv.next, 100 br i1 %ec, label %exit, label %loop exit: @@ -100,16 +87,16 @@ define void @test_distance_positive_via_assume(ptr %A, i64 %off) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: -; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]): ; CHECK-NEXT: %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv -; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]): ; CHECK-NEXT: %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv ; CHECK-NEXT: Grouped accesses: -; CHECK-NEXT: Group [[GRP5]]: -; CHECK-NEXT: (Low: (%off + %A) High: (404 + %off + %A)) +; CHECK-NEXT: Group [[GRP3]]: +; CHECK-NEXT: (Low: (%off + %A) High: (400 + %off + %A)) ; CHECK-NEXT: Member: {(%off + %A),+,4}<%loop> -; CHECK-NEXT: Group [[GRP6]]: -; CHECK-NEXT: (Low: %A High: (101 + %A)) +; CHECK-NEXT: Group [[GRP4]]: +; CHECK-NEXT: (Low: %A High: (100 + %A)) ; CHECK-NEXT: Member: {%A,+,1}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
@@ -131,7 +118,7 @@ loop: %ext = zext i8 %l to i32 store i32 %ext, ptr %gep.A.400, align 4 %iv.next = add nuw nsw i64 %iv, 1 - %ec = icmp eq i64 %iv, 100 + %ec = icmp eq i64 %iv.next, 100 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll index 79d7ab84b3a0f..e3dd48114e3f8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll @@ -96,17 +96,42 @@ exit: define void @pr58722_store_interleave_group(ptr %src, ptr %dst) { ; CHECK-LABEL: @pr58722_store_interleave_group( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP1]] +; CHECK-NEXT: store i32 [[TMP0]], ptr [[TMP2]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP0]] to i24 +; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP1]] to i24 +; CHECK-NEXT: store i24 [[TMP6]], ptr [[TMP4]], align 4 +; CHECK-NEXT: store i24 [[TMP7]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 5000 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_IV]], align 4 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[GEP_IV]], i64 1 ; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i32 [[IV]] to i24 ; CHECK-NEXT: store i24 [[TRUNC_IV]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 10000 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ;
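Note on the stride scaling introduced in getDependenceDistanceStrideAndSize above: the patch folds each access's alloc size into its stride, so MaxStride and CommonStride are expressed in bytes and two accesses with different store sizes can still share a common byte stride. Below is a minimal standalone sketch of that derivation, assuming element strides from a getPtrStride-style query and alloc sizes taken from the DataLayout; the names ScaledStrideInfo and scaleStrides are hypothetical and this is not LAA's actual API.

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <optional>

// Simplified stand-in for the fields the patch adds to
// DepDistanceStrideAndSizeInfo.
struct ScaledStrideInfo {
  uint64_t MaxStride = 0;                   // max of the byte-scaled strides
  std::optional<uint64_t> CommonStride;     // set only if the byte strides match
  bool ShouldRetryWithRuntimeCheck = false; // still based on unscaled strides
};

// StrideA/StrideB are signed strides in elements; ASz/BSz are the alloc sizes
// of the accessed types in bytes. Returns std::nullopt for the cases the patch
// classifies as Dependence::Unknown (a loop-invariant access, or strides in
// different directions).
std::optional<ScaledStrideInfo> scaleStrides(int64_t StrideA, int64_t StrideB,
                                             uint64_t ASz, uint64_t BSz) {
  int64_t StrideAScaled = StrideA * static_cast<int64_t>(ASz);
  int64_t StrideBScaled = StrideB * static_cast<int64_t>(BSz);

  if (!StrideAScaled || !StrideBScaled)
    return std::nullopt; // at least one access is loop invariant
  if ((StrideAScaled > 0) != (StrideBScaled > 0))
    return std::nullopt; // strides point in different directions

  uint64_t A = std::abs(StrideAScaled), B = std::abs(StrideBScaled);
  ScaledStrideInfo Info;
  Info.MaxStride = std::max(A, B);
  if (A == B)
    Info.CommonStride = A;
  // Mirrors the TODO in the patch: only retry with runtime checks when the
  // unscaled strides match.
  Info.ShouldRetryWithRuntimeCheck = std::abs(StrideA) == std::abs(StrideB);
  return Info;
}

For example, an i16 access with element stride 2 and an i32 access with element stride 1 both scale to a byte stride of 4, so CommonStride is set even though the unscaled strides differ; only ShouldRetryWithRuntimeCheck still looks at the unscaled values.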
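Because CommonStride is now already in bytes, the minimum-distance check in isDependent becomes CommonStride * (MinNumIter - 1) + TypeByteSize, and the maximum safe VF becomes MinDepDistBytes / CommonStride, with no extra TypeByteSize factor. Here is a small worked example with assumed numbers (i32 accesses, element stride 2, constant dependence distance of 24 bytes, i.e. a loop like for (i = 0; i < n; i += 2) A[i + 6] = A[i]); the program is illustrative only and does not call into LAA.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t TypeByteSize = 4; // i32
  uint64_t CommonStride = 8; // element stride 2 * 4 bytes
  uint64_t MinDistance = 24; // 6 elements * 4 bytes
  uint64_t MinNumIter = 2;   // smallest vector/unroll factor considered

  // Byte-scaled form of the check in the patch: one vectorized iteration in
  // front needs CommonStride bytes, the last iteration needs TypeByteSize.
  uint64_t MinDistanceNeeded = CommonStride * (MinNumIter - 1) + TypeByteSize;
  if (MinDistanceNeeded <= MinDistance) {
    // Backward dependence that is still vectorizable up to this factor.
    uint64_t MaxVF = MinDistance / CommonStride; // 24 / 8 = 3
    std::printf("min distance needed %llu, max safe VF %llu\n",
                (unsigned long long)MinDistanceNeeded,
                (unsigned long long)MaxVF);
  }
  return 0;
}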