From f9ef5600fe2defa4e0c6a19303562de92051f613 Mon Sep 17 00:00:00 2001 From: Guozhi Wei Date: Tue, 1 Apr 2025 18:55:43 +0000 Subject: [PATCH 1/2] [InlineSpiller] Check rematerialization before folding operand The current implementation tries to fold the operand before rematerialization because it can reduce register usage by one. But if there is a physical register available we can still rematerialize it without causing high register pressure. This patch does this check to find the better choice. --- llvm/include/llvm/CodeGen/Spiller.h | 6 +- llvm/lib/CodeGen/InlineSpiller.cpp | 38 +++- llvm/lib/CodeGen/RegAllocGreedy.cpp | 5 +- llvm/test/CodeGen/X86/avx-cmp.ll | 3 +- llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll | 3 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 3 +- .../test/CodeGen/X86/fptosi-sat-vector-128.ll | 21 +- .../test/CodeGen/X86/fptoui-sat-vector-128.ll | 188 +++++++++++------- llvm/test/CodeGen/X86/mmx-fold-zero.ll | 6 +- .../vector-interleaved-load-i8-stride-7.ll | 9 +- 10 files changed, 185 insertions(+), 97 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Spiller.h b/llvm/include/llvm/CodeGen/Spiller.h index 3132cefeb6c68..84fc872a07606 100644 --- a/llvm/include/llvm/CodeGen/Spiller.h +++ b/llvm/include/llvm/CodeGen/Spiller.h @@ -23,6 +23,7 @@ class LiveIntervals; class LiveStacks; class MachineDominatorTree; class MachineBlockFrequencyInfo; +class AllocationOrder; /// Spiller interface. /// @@ -35,7 +36,7 @@ class Spiller { virtual ~Spiller() = 0; /// spill - Spill the LRE.getParent() live interval. - virtual void spill(LiveRangeEdit &LRE) = 0; + virtual void spill(LiveRangeEdit &LRE, AllocationOrder *Order = nullptr) = 0; /// Return the registers that were spilled. virtual ArrayRef getSpilledRegs() = 0; @@ -58,7 +59,8 @@ class Spiller { /// of deferring though VirtRegMap. 
Spiller *createInlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI); + VirtRegAuxInfo &VRAI, + LiveRegMatrix *Matrix = nullptr); } // end namespace llvm diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 920873c739f46..f384740be2e33 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "AllocationOrder.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" +#include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -149,12 +151,14 @@ class InlineSpiller : public Spiller { MachineRegisterInfo &MRI; const TargetInstrInfo &TII; const TargetRegisterInfo &TRI; + LiveRegMatrix *Matrix = nullptr; // Variables that are valid during spill(), but used by multiple methods. LiveRangeEdit *Edit = nullptr; LiveInterval *StackInt = nullptr; int StackSlot; Register Original; + AllocationOrder *Order = nullptr; // All registers to spill to StackSlot, including the main register. 
SmallVector RegsToSpill; @@ -184,13 +188,13 @@ class InlineSpiller : public Spiller { public: InlineSpiller(const Spiller::RequiredAnalyses &Analyses, MachineFunction &MF, - VirtRegMap &VRM, VirtRegAuxInfo &VRAI) + VirtRegMap &VRM, VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix) : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), HSpiller(Analyses, MF, VRM), - VRAI(VRAI) {} + TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix), + HSpiller(Analyses, MF, VRM), VRAI(VRAI) {} - void spill(LiveRangeEdit &) override; + void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override; ArrayRef getSpilledRegs() override { return RegsToSpill; } ArrayRef getReplacedRegs() override { return RegsReplaced; } void postOptimization() override; @@ -207,6 +211,7 @@ class InlineSpiller : public Spiller { void markValueUsed(LiveInterval*, VNInfo*); bool canGuaranteeAssignmentAfterRemat(Register VReg, MachineInstr &MI); + bool hasPhysRegAvailable(const MachineInstr &MI); bool reMaterializeFor(LiveInterval &, MachineInstr &MI); void reMaterializeAll(); @@ -229,8 +234,8 @@ void Spiller::anchor() {} Spiller * llvm::createInlineSpiller(const InlineSpiller::RequiredAnalyses &Analyses, MachineFunction &MF, VirtRegMap &VRM, - VirtRegAuxInfo &VRAI) { - return new InlineSpiller(Analyses, MF, VRM, VRAI); + VirtRegAuxInfo &VRAI, LiveRegMatrix *Matrix) { + return new InlineSpiller(Analyses, MF, VRM, VRAI, Matrix); } //===----------------------------------------------------------------------===// @@ -615,6 +620,23 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg, return true; } +/// hasPhysRegAvailable - Check if there is an available physical register for +/// rematerialization. 
+bool InlineSpiller::hasPhysRegAvailable(const MachineInstr &MI) { + if (!Order || !Matrix) + return false; + + SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true); + SlotIndex PrevIdx = UseIdx.getPrevSlot(); + + for (MCPhysReg PhysReg : *Order) { + if (!Matrix->checkInterference(PrevIdx, UseIdx, PhysReg)) + return true; + } + + return false; +} + /// reMaterializeFor - Attempt to rematerialize before MI instead of reloading. bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { // Analyze instruction @@ -661,6 +683,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { // Before rematerializing into a register for a single instruction, try to // fold a load into the instruction. That avoids allocating a new register. if (RM.OrigMI->canFoldAsLoad() && + (RM.OrigMI->mayLoad() || !hasPhysRegAvailable(MI)) && foldMemoryOperand(Ops, RM.OrigMI)) { Edit->markRematerialized(RM.ParentVNI); ++NumFoldedLoads; @@ -1282,9 +1305,10 @@ void InlineSpiller::spillAll() { Edit->eraseVirtReg(Reg); } -void InlineSpiller::spill(LiveRangeEdit &edit) { +void InlineSpiller::spill(LiveRangeEdit &edit, AllocationOrder *order) { ++NumSpilledRanges; Edit = &edit; + Order = order; assert(!edit.getReg().isStack() && "Trying to spill a stack slot."); // Share a stack slot among all descendants of Original. 
Original = VRM.getOriginal(edit.getReg()); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index a5cd9fc7a5360..f86a3ea8dc55e 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2664,7 +2664,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg, NamedRegionTimer T("spill", "Spiller", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats); - spiller().spill(LRE); + spiller().spill(LRE, &Order); ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); // Tell LiveDebugVariables about the new ranges. Ranges not being covered by @@ -2909,7 +2909,8 @@ bool RAGreedy::run(MachineFunction &mf) { VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI); SpillerInstance.reset( - createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI)); + createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI, + Matrix)); VRAI->calculateSpillWeightsAndHints(); diff --git a/llvm/test/CodeGen/X86/avx-cmp.ll b/llvm/test/CodeGen/X86/avx-cmp.ll index d31107bfeb7bb..3ced4f71bad8c 100644 --- a/llvm/test/CodeGen/X86/avx-cmp.ll +++ b/llvm/test/CodeGen/X86/avx-cmp.ll @@ -43,7 +43,8 @@ define void @render(double %a0) nounwind { ; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero -; CHECK-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB2_4 ; CHECK-NEXT: jnp .LBB2_2 ; CHECK-NEXT: .LBB2_4: # %if.then diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 95a7a10d50f59..3243d950740ca 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -111,7 +111,8 @@ define <4 x i32> 
@eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; AVX512-NEXT: callq use.v4.i32@PLT ; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: vpcmpltud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 -; AVX512-NEXT: vmovdqa32 {{.*#+}} xmm0 {%k1} {z} = [4294967295,4294967295,4294967295,4294967295] +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: addq $24, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 67c9e7cc22236..59a61722927de 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -195,7 +195,8 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; CHECK-SSE-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll index 536a1ae3b918d..7f6d64c21724a 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -567,7 +567,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: 
ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -581,7 +582,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -593,7 +595,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -609,7 +612,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -621,7 +625,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -634,7 +639,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: 
cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax @@ -646,7 +652,8 @@ define <8 x i1> @test_signed_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovbl %ebp, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmoval %ebx, %eax ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %ebx, %eax diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll index 4305886168abe..ffbdd66529f5c 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -280,11 +280,12 @@ define <4 x i128> @test_unsigned_v4i128_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movq %rax, %r12 ; CHECK-NEXT: movq %rdx, %r13 -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r14, %r13 ; CHECK-NEXT: cmovbq %r14, %r12 -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %rbp, %r12 ; CHECK-NEXT: cmovaq %rbp, %r13 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -293,19 +294,21 @@ define <4 x i128> @test_unsigned_v4i128_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movq %rax, %rbp ; CHECK-NEXT: movq %rdx, 
%r14 -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: movl $0, %eax ; CHECK-NEXT: cmovbq %rax, %r14 ; CHECK-NEXT: cmovbq %rax, %rbp -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %rbp ; CHECK-NEXT: cmovaq %rax, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: movl $0, %ecx ; CHECK-NEXT: cmovbq %rcx, %rdx ; CHECK-NEXT: cmovbq %rcx, %rax @@ -506,7 +509,8 @@ define <2 x i128> @test_unsigned_v2i128_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %r12, %rdx ; CHECK-NEXT: cmovbq %r12, %rax ; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -562,7 +566,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -574,7 +579,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; 
CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -584,7 +590,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -598,7 +605,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -608,7 +616,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -619,7 +628,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; 
CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -629,7 +639,8 @@ define <8 x i1> @test_unsigned_v8i1_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -673,7 +684,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -682,7 +694,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -691,7 +704,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; 
CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -702,7 +716,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -713,7 +728,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -726,7 +742,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %r14d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %r14d ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %r14d @@ -735,7 +752,8 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -777,7 +795,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -789,7 +808,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -799,7 +819,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -813,7 +834,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -823,7 
+845,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -834,7 +857,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -844,7 +868,8 @@ define <8 x i16> @test_unsigned_v8i16_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -886,7 +911,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -897,7 +923,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; 
CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -907,7 +934,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -921,7 +949,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -931,7 +960,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -943,7 +973,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: 
callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -953,7 +984,8 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbl %ebx, %eax ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmoval %ebp, %eax @@ -1007,7 +1039,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1026,7 +1059,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1043,7 +1077,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; 
CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1062,7 +1097,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1079,7 +1115,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1098,7 +1135,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1115,7 +1153,8 @@ define <8 x i64> @test_unsigned_v8i64_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: sarq $63, %rdx ; CHECK-NEXT: andq %rax, %rdx ; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-NEXT: cmovbq %rbx, %rdx ; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: cmovaq %r14, %rdx @@ -1167,12 +1206,13 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r12, %rdx ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: cmovaq %r13, %rdx @@ -1182,12 +1222,13 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r12, %rdx ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: cmovaq %r13, %rdx @@ -1197,12 +1238,13 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # 
xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r12, %rdx ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: cmovaq %r13, %rdx @@ -1213,12 +1255,13 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movq %rdx, %rbp -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r12, %rbp ; CHECK-NEXT: cmovbq %r12, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %r13, %rax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: cmovaq %r13, %rbp @@ -1229,12 +1272,13 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq %rdx, %r15 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = 
mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: cmovbq %r12, %r15 ; CHECK-NEXT: cmovbq %r12, %r14 -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: cmovaq %r13, %r14 ; CHECK-NEXT: cmovaq %r13, %r15 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1244,13 +1288,14 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __fixunssfti@PLT ; CHECK-NEXT: movq %rax, %r12 ; CHECK-NEXT: movq %rdx, %r13 -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: movl $0, %eax ; CHECK-NEXT: cmovbq %rax, %r13 ; CHECK-NEXT: cmovbq %rax, %r12 -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movq $-1, %rax ; CHECK-NEXT: cmovaq %rax, %r12 ; CHECK-NEXT: cmovaq %rax, %r13 @@ -1258,13 +1303,14 @@ define <8 x i128> @test_unsigned_v8i128_v8f16(<8 x half> %f) nounwind { ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq __fixunssfti@PLT -; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-NEXT: movl $0, %ecx ; CHECK-NEXT: cmovbq %rcx, %rdx ; CHECK-NEXT: cmovbq %rcx, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; 
CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rax ; CHECK-NEXT: cmovaq %rcx, %rdx diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll index a6e1275875dbc..a3ef81daef337 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll @@ -34,7 +34,8 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X86-NEXT: paddw %mm2, %mm0 ; X86-NEXT: paddw %mm6, %mm0 ; X86-NEXT: pmuludq %mm3, %mm0 -; X86-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 +; X86-NEXT: pxor %mm3, %mm3 +; X86-NEXT: paddw %mm3, %mm0 ; X86-NEXT: paddw %mm1, %mm0 ; X86-NEXT: pmuludq %mm7, %mm0 ; X86-NEXT: pmuludq (%esp), %mm0 # 8-byte Folded Reload @@ -72,7 +73,8 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X64-NEXT: paddw %mm2, %mm0 ; X64-NEXT: paddw %mm6, %mm0 ; X64-NEXT: pmuludq %mm3, %mm0 -; X64-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 +; X64-NEXT: pxor %mm3, %mm3 +; X64-NEXT: paddw %mm3, %mm0 ; X64-NEXT: paddw %mm1, %mm0 ; X64-NEXT: pmuludq %mm7, %mm0 ; X64-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 5ab09194c5b83..86c932a5bb1f9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -9870,7 +9870,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm4 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm0[7] ; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vpshufb 
%xmm8, %xmm14, %xmm4 @@ -9905,7 +9906,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: # xmm5 = mem[0,0] ; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm9 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] +; AVX-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm9[7] ; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] ; AVX-NEXT: # xmm9 = mem[0,0] ; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10 @@ -9939,7 +9941,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] +; AVX-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 From 1c676bde17147b0f8ab57aed6cf1b47c0243d7d4 Mon Sep 17 00:00:00 2001 From: Guozhi Wei Date: Tue, 8 Apr 2025 22:26:23 +0000 Subject: [PATCH 2/2] Reformat. 
--- llvm/lib/CodeGen/RegAllocGreedy.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index f86a3ea8dc55e..fc5551eca8f62 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2908,9 +2908,8 @@ bool RAGreedy::run(MachineFunction &mf) { PriorityAdvisor = PriorityProvider->getAdvisor(*MF, *this, *Indexes); VRAI = std::make_unique(*MF, *LIS, *VRM, *Loops, *MBFI); - SpillerInstance.reset( - createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, *VRM, *VRAI, - Matrix)); + SpillerInstance.reset(createInlineSpiller({*LIS, *LSS, *DomTree, *MBFI}, *MF, + *VRM, *VRAI, Matrix)); VRAI->calculateSpillWeightsAndHints();