
Commit 1949536

[VPlan] Also visit VPBBs outside loop region when unrolling by VF.
Make sure all VPBBs outside the top-level loop region, as well as those directly inside the region, are visited; any of these blocks may contain VPReplicateRecipes that need unrolling. This ensures VPReplicateRecipes that have been hoisted out of the loop, but cannot yet be converted to single-scalar recipes, are still unrolled by VF.
1 parent d93aff4 commit 1949536

2 files changed (+82, -2 lines)

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 10 additions & 2 deletions
@@ -493,8 +493,16 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
 void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
   Type *IdxTy = IntegerType::get(
       Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);
-  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
-           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+
+  // Visit all VPBBs outside the loop region and directly inside the top-level
+  // loop region.
+  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getEntry()));
+  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
+      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
+  auto VPBBsToUnroll =
+      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
+  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (!RepR || RepR->isSingleScalar())
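
The chained traversal above relies on llvm::concat from llvm/ADT/STLExtras.h, which exposes several ranges as one iterable range, so the recipe-unrolling loop body stays unchanged while the set of blocks it walks grows. Below is a minimal standalone sketch of that utility, not part of the patch; the container names and the printf driver are illustrative only, and it assumes the LLVM support headers are available:

// Standalone sketch: iterate two containers back-to-back with llvm::concat,
// the same utility the new code uses to visit blocks outside and directly
// inside the vector loop region in a single loop.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<int, 4> OutsideRegion = {1, 2};
  llvm::SmallVector<int, 4> InsideRegion = {3, 4};
  // concat<int> yields the elements of both ranges in order: 1 2 3 4.
  for (int V : llvm::concat<int>(OutsideRegion, InsideRegion))
    std::printf("%d ", V);
  std::printf("\n");
  return 0;
}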
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @licm_replicate_call(double %x, ptr %dst) {
+; CHECK-LABEL: define void @licm_replicate_call(
+; CHECK-SAME: double [[X:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
+; CHECK-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[VEC_IND]] to <2 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = uitofp <2 x i32> [[STEP_ADD]] to <2 x double>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i32 2
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
+; CHECK-NEXT: [[IV_AS_FP:%.*]] = uitofp i32 [[IV_TRUNC]] to double
+; CHECK-NEXT: [[P:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[P]], [[IV_AS_FP]]
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store double [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 128
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i32
+  %iv.as.fp = uitofp i32 %iv.trunc to double
+  %p = tail call double @llvm.pow.f64(double %x, double 3.000000e+00)
+  %mul = fmul double %p, %iv.as.fp
+  %gep.dst = getelementptr inbounds double, ptr %dst, i64 %iv
+  store double %mul, ptr %gep.dst, align 8
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare double @llvm.pow.f64(double, double)
