diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c3d1416ed518d..1e14eb6f4c3d9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6611,6 +6611,30 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // better than going through the stack, as the default expansion does.
     SDLoc DL(Op);
     MVT VT = Op.getSimpleValueType();
+    MVT ContainerVT = VT;
+    if (VT.isFixedLengthVector())
+      ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+    // Recursively split concat_vectors with more than 2 operands:
+    //
+    //   concat_vector op1, op2, op3, op4
+    // ->
+    //   concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+    //
+    // This reduces the length of the chain of vslideups and allows us to
+    // perform the vslideups at a smaller LMUL, limited to MF2.
+    if (Op.getNumOperands() > 2 &&
+        ContainerVT.bitsGE(getLMUL1VT(ContainerVT))) {
+      MVT HalfVT = VT.getHalfNumVectorElementsVT();
+      assert(isPowerOf2_32(Op.getNumOperands()));
+      size_t HalfNumOps = Op.getNumOperands() / 2;
+      SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                               Op->ops().take_front(HalfNumOps));
+      SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                               Op->ops().drop_front(HalfNumOps));
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+    }
+
     unsigned NumOpElts =
         Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
     SDValue Vec = DAG.getUNDEF(VT);
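Note: the sketch below models the divide-and-conquer shape of the recursive split above in plain, standalone C++. It is illustrative only, not the SelectionDAG API; concat2, concatN, and the use of std::vector<int> are assumptions of the sketch, and the power-of-2 operand count mirrors the assert in the patch. Each concat2 call stands in for one vslideup: the number of joins stays at N-1, but the joins near the leaves of the recursion operate on half-width values (a smaller LMUL on RISC-V), and the dependence chain shrinks from depth N-1 to log2(N). Only the final join runs at the destination's full width, which is what the rewritten vsetivli/vslideup sequences in the regenerated tests below show.

  // Sketch only: models the recursive CONCAT_VECTORS split, not LLVM's API.
  #include <cstddef>
  #include <iostream>
  #include <vector>

  // Stands in for one vslideup: append Hi after Lo.
  static std::vector<int> concat2(const std::vector<int> &Lo,
                                  const std::vector<int> &Hi) {
    std::vector<int> R(Lo);
    R.insert(R.end(), Hi.begin(), Hi.end());
    return R;
  }

  // concat(op1..opN) -> concat2(concat(op1..opN/2), concat(opN/2+1..opN)).
  static std::vector<int> concatN(const std::vector<std::vector<int>> &Ops) {
    if (Ops.size() == 1)
      return Ops.front();
    size_t Half = Ops.size() / 2; // assumes a power-of-2 operand count
    std::vector<std::vector<int>> Lo(Ops.begin(), Ops.begin() + Half);
    std::vector<std::vector<int>> Hi(Ops.begin() + Half, Ops.end());
    return concat2(concatN(Lo), concatN(Hi));
  }

  int main() {
    // Four 2-element operands, like the concat_4xv2i32 test below.
    std::vector<std::vector<int>> Ops = {{0, 1}, {2, 3}, {4, 5}, {6, 7}};
    for (int V : concatN(Ops))
      std::cout << V << ' '; // prints: 0 1 2 3 4 5 6 7
    std::cout << '\n';
  }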
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 87d95d7596d4f..139579b3d2a36 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -161,72 +161,71 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 4
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 6
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 8
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 10
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 12
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 14
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v8, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index c64216180c2af..ed434deea1a83 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 76aa2b913c652..e15e6452163b1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index e5bef20fd9e24..98e6b8f2dd760 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -19,14 +19,11 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
 ; CHECK-LABEL: concat_4xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v11
-; CHECK-NEXT:    vmv1r.v v14, v9
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 2
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 6
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -37,24 +34,18 @@
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
 ; CHECK-LABEL: concat_8xv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v15
-; CHECK-NEXT:    vmv1r.v v18, v13
-; CHECK-NEXT:    vmv1r.v v20, v11
-; CHECK-NEXT:    vmv1r.v v22, v9
-; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v22, 1
-; CHECK-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v14, v15, 1
+; CHECK-NEXT:    vslideup.vi v12, v13, 1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v12, v14, 2
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 1
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v20, 3
-; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v18, 5
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 7
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
@@ -80,15 +71,14 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: concat_4xv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v12, v11
-; CHECK-NEXT:    vmv1r.v v16, v10
-; CHECK-NEXT:    vmv1r.v v20, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v20, 4
-; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 8
+; CHECK-NEXT:    vmv1r.v v14, v11
+; CHECK-NEXT:    vmv1r.v v12, v10
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v12, v14, 4
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 12
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -99,26 +89,18 @@
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
 ; CHECK-LABEL: concat_8xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v16, v15
-; CHECK-NEXT:    vmv1r.v v20, v14
-; CHECK-NEXT:    vmv1r.v v24, v13
-; CHECK-NEXT:    vmv1r.v v28, v11
-; CHECK-NEXT:    vmv1r.v v4, v10
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v0, 2
-; CHECK-NEXT:    vsetivli zero, 6, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v4, 4
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v28, 6
-; CHECK-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v24, 10
-; CHECK-NEXT:    vsetivli zero, 14, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v20, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v14, v15, 2
+; CHECK-NEXT:    vslideup.vi v12, v13, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v12, v14, 4
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 14
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -152,29 +134,27 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ; VLA-LABEL: concat_4xv8i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vmv2r.v v16, v14
-; VLA-NEXT:    vmv2r.v v24, v12
-; VLA-NEXT:    vmv2r.v v0, v10
-; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v0, 8
-; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v24, 16
+; VLA-NEXT:    vmv2r.v v20, v14
+; VLA-NEXT:    vmv2r.v v16, v12
+; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v20, 8
+; VLA-NEXT:    vslideup.vi v8, v12, 8
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 24
+; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v16, v14
-; VLS-NEXT:    vmv2r.v v24, v12
-; VLS-NEXT:    vmv2r.v v0, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v0, 8
-; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v24, 16
+; VLS-NEXT:    vmv2r.v v20, v14
+; VLS-NEXT:    vmv2r.v v16, v12
+; VLS-NEXT:    vmv2r.v v12, v10
+; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v20, 8
+; VLS-NEXT:    vslideup.vi v8, v12, 8
 ; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -185,123 +165,49 @@
 define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
 ; VLA-LABEL: concat_8xv4i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    addi sp, sp, -16
-; VLA-NEXT:    .cfi_def_cfa_offset 16
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 5
-; VLA-NEXT:    sub sp, sp, a0
-; VLA-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; VLA-NEXT:    vmv1r.v v16, v15
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 3
-; VLA-NEXT:    mv a1, a0
-; VLA-NEXT:    slli a0, a0, 1
-; VLA-NEXT:    add a0, a0, a1
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLA-NEXT:    vmv1r.v v16, v14
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 4
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLA-NEXT:    vmv1r.v v16, v13
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 3
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v18, v15
+; VLA-NEXT:    vmv1r.v v20, v14
+; VLA-NEXT:    vmv1r.v v22, v13
 ; VLA-NEXT:    vmv1r.v v16, v12
-; VLA-NEXT:    addi a0, sp, 16
-; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLA-NEXT:    vmv1r.v v0, v11
-; VLA-NEXT:    vmv1r.v v24, v10
-; VLA-NEXT:    vmv1r.v v16, v9
-; VLA-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 4
-; VLA-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v24, 8
-; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; VLA-NEXT:    vslideup.vi v8, v0, 12
-; VLA-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
-; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLA-NEXT:    vslideup.vi v8, v16, 16
-; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 3
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLA-NEXT:    vslideup.vi v8, v16, 20
-; VLA-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 4
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLA-NEXT:    vslideup.vi v8, v16, 24
+; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v12, v10
+; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v20, v18, 4
+; VLA-NEXT:    vslideup.vi v16, v22, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v20, 8
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v8, v10, 4
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 3
-; VLA-NEXT:    mv a1, a0
-; VLA-NEXT:    slli a0, a0, 1
-; VLA-NEXT:    add a0, a0, a1
-; VLA-NEXT:    add a0, sp, a0
-; VLA-NEXT:    addi a0, a0, 16
-; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLA-NEXT:    vslideup.vi v8, v16, 28
-; VLA-NEXT:    csrr a0, vlenb
-; VLA-NEXT:    slli a0, a0, 5
-; VLA-NEXT:    add sp, sp, a0
-; VLA-NEXT:    addi sp, sp, 16
+; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    addi sp, sp, -16
-; VLS-NEXT:    .cfi_def_cfa_offset 16
-; VLS-NEXT:    addi sp, sp, -512
-; VLS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; VLS-NEXT:    vmv1r.v v16, v15
-; VLS-NEXT:    addi a0, sp, 400
-; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLS-NEXT:    vmv1r.v v16, v14
-; VLS-NEXT:    addi a0, sp, 272
-; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLS-NEXT:    vmv1r.v v16, v13
-; VLS-NEXT:    addi a0, sp, 144
-; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v18, v15
+; VLS-NEXT:    vmv1r.v v20, v14
+; VLS-NEXT:    vmv1r.v v22, v13
 ; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    addi a0, sp, 16
-; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; VLS-NEXT:    vmv1r.v v0, v11
-; VLS-NEXT:    vmv1r.v v24, v10
-; VLS-NEXT:    vmv1r.v v16, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 4
-; VLS-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v24, 8
-; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; VLS-NEXT:    vslideup.vi v8, v0, 12
-; VLS-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
-; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLS-NEXT:    vslideup.vi v8, v16, 16
-; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; VLS-NEXT:    addi a0, sp, 144
-; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLS-NEXT:    vslideup.vi v8, v16, 20
-; VLS-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
-; VLS-NEXT:    addi a0, sp, 272
-; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    vmv1r.v v14, v11
+; VLS-NEXT:    vmv1r.v v12, v10
+; VLS-NEXT:    vmv1r.v v10, v9
+; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT:    vslideup.vi v20, v18, 4
+; VLS-NEXT:    vslideup.vi v16, v22, 4
+; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v20, 8
+; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT:    vslideup.vi v12, v14, 4
+; VLS-NEXT:    vslideup.vi v8, v10, 4
+; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 8
 ; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    addi a0, sp, 400
-; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; VLS-NEXT:    vslideup.vi v8, v16, 28
-; VLS-NEXT:    addi sp, sp, 512
-; VLS-NEXT:    addi sp, sp, 16
+; VLS-NEXT:    vslideup.vi v8, v16, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index 4ec2e59672ad6..657d52354aa39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -27,13 +27,14 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a0, 8
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -72,20 +73,18 @@ define void @widen_4xv4i16(ptr %x, ptr %z) {
 define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
 ; CHECK-NO-MISALIGN:       # %bb.0:
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
-; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT:    ret
 ;
@@ -187,18 +186,17 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    addi a2, a0, 2
-; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a2, a0, 6
-; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    addi a2, a0, 2
 ; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vle16.v v11, (a2)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NEXT:    vslideup.vi v8, v11, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -258,17 +256,16 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v10, (a0)
-; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    vle16.v v12, (a0)
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v14, (a0)
-; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 4
-; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v12, 8
+; RV32-NEXT:    add a4, a0, a4
+; RV32-NEXT:    vle16.v v10, (a4)
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    vle16.v v9, (a2)
+; RV32-NEXT:    vle16.v v11, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vslideup.vi v10, v9, 4
+; RV32-NEXT:    vslideup.vi v8, v11, 4
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v14, 12
+; RV32-NEXT:    vslideup.vi v8, v10, 8
 ; RV32-NEXT:    vse16.v v8, (a1)
 ; RV32-NEXT:    ret
 ;
@@ -277,17 +274,16 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v10, (a0)
-; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    vle16.v v12, (a0)
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v14, (a0)
-; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 4
-; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    add a3, a0, a3
+; RV64-NEXT:    vle16.v v10, (a3)
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    vle16.v v9, (a2)
+; RV64-NEXT:    vle16.v v11, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV64-NEXT:    vslideup.vi v10, v9, 4
+; RV64-NEXT:    vslideup.vi v8, v11, 4
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v14, 12
+; RV64-NEXT:    vslideup.vi v8, v10, 8
 ; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
 ;
@@ -296,17 +292,16 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE64F-NEXT:    vle16.v v8, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v10, (a0)
-; ZVE64F-NEXT:    add a0, a0, a3
-; ZVE64F-NEXT:    vle16.v v12, (a0)
-; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v14, (a0)
-; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
-; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    add a3, a0, a3
+; ZVE64F-NEXT:    vle16.v v10, (a3)
+; ZVE64F-NEXT:    add a2, a3, a2
+; ZVE64F-NEXT:    vle16.v v9, (a2)
+; ZVE64F-NEXT:    vle16.v v11, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
+; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
 ; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
 ; ZVE64F-NEXT:    vse16.v v8, (a1)
 ; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index eb7894ede0464..b3bda5973eb8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -441,57 +441,50 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu a0, 24(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -609,57 +602,50 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu a0, 24(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -787,60 +773,53 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu a0, 24(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-V-NEXT:    li a0, -1
 ; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vmin.vx v8, v10, a0
+; CHECK-V-NEXT:    vmin.vx v8, v8, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
@@ -1404,90 +1383,125 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 1
+; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu s3, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 32(a0)
+; CHECK-V-NEXT:    lhu s5, 40(a0)
+; CHECK-V-NEXT:    lhu s6, 48(a0)
+; CHECK-V-NEXT:    lhu a0, 56(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -1682,90 +1696,125 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 1
+; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu s3, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 32(a0)
+; CHECK-V-NEXT:    lhu s5, 40(a0)
+; CHECK-V-NEXT:    lhu s6, 48(a0)
+; CHECK-V-NEXT:    lhu a0, 56(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -1982,94 +2031,129 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 1
+; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
-; CHECK-V-NEXT:    lhu s0, 56(a0)
-; CHECK-V-NEXT:    lhu s1, 48(a0)
-; CHECK-V-NEXT:    lhu s2, 40(a0)
-; CHECK-V-NEXT:    lhu s3, 32(a0)
-; CHECK-V-NEXT:    lhu s4, 24(a0)
-; CHECK-V-NEXT:    lhu s5, 16(a0)
-; CHECK-V-NEXT:    lhu s6, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu s3, 24(a0)
+; CHECK-V-NEXT:    lhu s4, 32(a0)
+; CHECK-V-NEXT:    lhu s5, 40(a0)
+; CHECK-V-NEXT:    lhu s6, 48(a0)
+; CHECK-V-NEXT:    lhu a0, 56(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s6
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 4
 ; CHECK-V-NEXT:    lui a0, 16
 ; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v10, a0
+; CHECK-V-NEXT:    vmin.vx v8, v8, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3723,57 +3807,50 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu a0, 24(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -3889,57 +3966,50 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 24(a0)
-; CHECK-V-NEXT:    lhu s1, 16(a0)
-; CHECK-V-NEXT:    lhu s2, 0(a0)
-; CHECK-V-NEXT:    lhu a0, 8(a0)
+; CHECK-V-NEXT:    lhu s0, 0(a0)
+; CHECK-V-NEXT:    lhu s1, 8(a0)
+; CHECK-V-NEXT:    lhu s2, 16(a0)
+; CHECK-V-NEXT:    lhu a0, 24(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size
Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb @@ -4066,60 +4136,53 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 ; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; CHECK-V-NEXT: lhu s0, 24(a0) -; CHECK-V-NEXT: lhu s1, 16(a0) -; CHECK-V-NEXT: lhu s2, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu a0, 24(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e64, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi 
v8, v10, 1 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e64, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 ; CHECK-V-NEXT: add a0, sp, a0 ; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 +; CHECK-V-NEXT: vslideup.vi v8, v10, 2 ; CHECK-V-NEXT: li a0, -1 ; CHECK-V-NEXT: srli a0, a0, 32 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 @@ -4671,90 +4734,125 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; 
CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; 
CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclip.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -4947,90 +5045,125 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size 
Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, 
mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-V-NEXT: vmv.s.x v10, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v10, v8, 4 ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload @@ -5246,94 +5379,129 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: .cfi_offset s5, -56 ; CHECK-V-NEXT: .cfi_offset s6, -64 ; CHECK-V-NEXT: csrr a1, vlenb -; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: slli a1, a1, 2 ; CHECK-V-NEXT: sub sp, sp, a1 -; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb -; CHECK-V-NEXT: lhu s0, 56(a0) -; CHECK-V-NEXT: lhu s1, 48(a0) -; CHECK-V-NEXT: lhu s2, 40(a0) -; CHECK-V-NEXT: lhu s3, 32(a0) -; CHECK-V-NEXT: lhu s4, 24(a0) -; CHECK-V-NEXT: lhu s5, 16(a0) -; CHECK-V-NEXT: lhu s6, 0(a0) -; CHECK-V-NEXT: lhu a0, 8(a0) +; CHECK-V-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb +; CHECK-V-NEXT: lhu s0, 0(a0) +; CHECK-V-NEXT: lhu s1, 8(a0) +; CHECK-V-NEXT: lhu s2, 16(a0) +; CHECK-V-NEXT: lhu s3, 24(a0) +; CHECK-V-NEXT: lhu s4, 32(a0) +; CHECK-V-NEXT: lhu s5, 40(a0) +; CHECK-V-NEXT: lhu s6, 48(a0) +; CHECK-V-NEXT: lhu a0, 56(a0) ; CHECK-V-NEXT: fmv.w.x fa0, a0 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: 
fcvt.l.s a0, fa0, rtz ; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s6 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v8, v10, 1 -; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s5 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 2 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma ; CHECK-V-NEXT: fmv.w.x fa0, s4 +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 3 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s3 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s2 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 4 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s2 +; CHECK-V-NEXT: vs1r.v v8, (a0) # 
Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 5 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: fmv.w.x fa0, s1 ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-V-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-V-NEXT: fmv.w.x fa0, s0 ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 6 -; CHECK-V-NEXT: vs2r.v v10, (a0) # Unknown-size Folded Spill -; CHECK-V-NEXT: fmv.w.x fa0, s0 +; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 ; CHECK-V-NEXT: addi a0, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 1 +; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 +; CHECK-V-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; CHECK-V-NEXT: vslideup.vi v8, v9, 2 +; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-V-NEXT: csrr a0, vlenb +; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: add a0, sp, a0 +; CHECK-V-NEXT: addi a0, a0, 16 ; CHECK-V-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload -; CHECK-V-NEXT: vslideup.vi v10, v8, 7 +; CHECK-V-NEXT: vslideup.vi v8, v10, 4 ; CHECK-V-NEXT: lui a0, 16 ; CHECK-V-NEXT: addi a0, a0, -1 -; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 +; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 ; CHECK-V-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; CHECK-V-NEXT: ld s0, 64(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index f3ae03af7c786..0b236f6d3ff38 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2136,17 +2136,18 @@ define @mgather_baseidx_nxv32i8(ptr %base, ; RV64-NEXT: vluxei64.v v13, (a0), v24, v0.t ; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v0, v16, a1 -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vsext.vf8 v16, v10 -; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t +; RV64-NEXT: vslidedown.vx v8, v16, a1 ; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vslidedown.vx v0, v8, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; RV64-NEXT: vsext.vf8 v16, v11 ; 
RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, ptr %base, %idxs diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll index c27488b18a017..d13d67fd0a882 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll @@ -9,39 +9,38 @@ define <4 x float> @foo(ptr %0) nounwind { ; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: lhu s0, 6(a0) -; CHECK-NEXT: lhu s1, 4(a0) -; CHECK-NEXT: lhu s2, 0(a0) -; CHECK-NEXT: lhu a0, 2(a0) +; CHECK-NEXT: lhu s0, 0(a0) +; CHECK-NEXT: lhu s1, 2(a0) +; CHECK-NEXT: lhu s2, 4(a0) +; CHECK-NEXT: lhu a0, 6(a0) ; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 8(sp) +; CHECK-NEXT: fsw fa0, 4(sp) ; CHECK-NEXT: fmv.w.x fa0, s2 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 0(sp) +; CHECK-NEXT: fsw fa0, 12(sp) ; CHECK-NEXT: fmv.w.x fa0, s1 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 12(sp) +; CHECK-NEXT: fsw fa0, 8(sp) ; CHECK-NEXT: fmv.w.x fa0, s0 ; CHECK-NEXT: call __extendhfsf2 -; CHECK-NEXT: fsw fa0, 4(sp) -; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: fsw fa0, 0(sp) +; CHECK-NEXT: addi a0, sp, 4 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: mv a0, sp ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: addi a0, sp, 12 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: addi a0, sp, 8 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vslideup.vi v8, v9, 2 ; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload