diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a956074e50d86..faea2adf52089 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20819,7 +20819,8 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + const X86Subtarget &Subtarget, + const SDNodeFlags Flags = SDNodeFlags()) { // Requires SSE2. if (!Subtarget.hasSSE2()) return SDValue(); @@ -20865,7 +20866,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, // e.g. Masks, zext_in_reg, etc. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known = DAG.computeKnownBits(In); - if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) { + if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) || + (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) { PackOpcode = X86ISD::PACKUS; return In; } @@ -20884,7 +20886,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, return SDValue(); unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits; - if (MinSignBits < NumSignBits) { + if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) { PackOpcode = X86ISD::PACKSS; return In; } @@ -20906,10 +20908,9 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, /// This function lowers a vector truncation of 'extended sign-bits' or /// 'extended zero-bits' values. /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. -static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, - const SDLoc &DL, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue LowerTruncateVecPackWithSignBits( + MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, + SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) { MVT SrcVT = In.getSimpleValueType(); MVT DstSVT = DstVT.getVectorElementType(); MVT SrcSVT = SrcVT.getVectorElementType(); @@ -20931,8 +20932,8 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, } unsigned PackOpcode; - if (SDValue Src = - matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget)) + if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, + Subtarget, Flags)) return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget); return SDValue(); @@ -21102,8 +21103,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS. if (!Subtarget.hasAVX512() || (InVT.is512BitVector() && VT.is256BitVector())) - if (SDValue SignPack = - LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG)) + if (SDValue SignPack = LowerTruncateVecPackWithSignBits( + VT, In, DL, Subtarget, DAG, Op->getFlags())) return SignPack; // Pre-AVX512 see if we can make use of PACKSS/PACKUS. @@ -21120,8 +21121,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to // concat from subvectors to use VPTRUNC etc. 
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) - if (SDValue SignPack = - LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG)) + if (SDValue SignPack = LowerTruncateVecPackWithSignBits( + VT, In, DL, Subtarget, DAG, Op->getFlags())) return SignPack; // vpmovqb/w/d, vpmovdb/w, vpmovwb @@ -33578,10 +33579,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // See if there are sufficient leading bits to perform a PACKUS/PACKSS. unsigned PackOpcode; - if (SDValue Src = - matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) { - if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src, - dl, DAG, Subtarget)) { + if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, + Subtarget, N->getFlags())) { + if (SDValue Res = + truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) { Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl); Results.push_back(Res); return; diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll index 32c7e820c967b..2b8eedfbbdc9c 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll @@ -107,50 +107,28 @@ entry: } define <8 x i16> @trunc8i64_8i16_nsw(<8 x i64> %a) { -; SSE2-SSSE3-LABEL: trunc8i64_8i16_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm2 -; SSE2-SSSE3-NEXT: psrad $16, %xmm2 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm0 -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i64_8i16_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i64_8i16_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc8i64_8i16_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc8i64_8i16_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -179,23 +157,15 @@ define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) { ; ; SSE41-LABEL: trunc8i64_8i16_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc8i64_8i16_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -204,9 +174,6 @@ define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) { ; ; AVX2-LABEL: trunc8i64_8i16_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -225,59 +192,34 @@ entry: } define void @trunc8i64_8i8_nsw(<8 x i64> %a) { -; SSE2-SSSE3-LABEL: trunc8i64_8i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i64_8i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i64_8i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc8i64_8i8_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: 
vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc8i64_8i8_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -296,12 +238,7 @@ entry: define void @trunc8i64_8i8_nuw(<8 x i64> %a) { ; SSE2-SSSE3-LABEL: trunc8i64_8i8_nuw: ; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 @@ -310,12 +247,7 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) { ; ; SSE41-LABEL: trunc8i64_8i8_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm2, %xmm0 ; SSE41-NEXT: packuswb %xmm0, %xmm0 @@ -324,11 +256,8 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i8_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -339,9 +268,6 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) { ; ; AVX2-LABEL: trunc8i64_8i8_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -363,44 +289,22 @@ entry: } define <8 x i16> @trunc8i32_8i16_nsw(<8 x i32> %a) { -; SSE2-LABEL: trunc8i32_8i16_nsw: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i32_8i16_nsw: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa 
{{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i32_8i16_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i32_8i16_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc8i32_8i16_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc8i32_8i16_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -456,15 +360,11 @@ define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) { ; ; SSE41-LABEL: trunc8i32_8i16_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc8i32_8i16_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper @@ -472,9 +372,8 @@ define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) { ; ; AVX2-LABEL: trunc8i32_8i16_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -511,33 +410,18 @@ entry: } define void @trunc8i32_8i8_nsw(<8 x i32> %a) { -; SSE2-SSSE3-LABEL: trunc8i32_8i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSE2-SSSE3-NEXT: movq %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i32_8i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i32_8i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: 
trunc8i32_8i8_nsw: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -545,10 +429,8 @@ define void @trunc8i32_8i8_nsw(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i8_nsw: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -589,9 +471,6 @@ entry: define void @trunc8i32_8i8_nuw(<8 x i32> %a) { ; SSE2-SSSE3-LABEL: trunc8i32_8i8_nuw: ; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: movq %xmm0, (%rax) @@ -599,9 +478,6 @@ define void @trunc8i32_8i8_nuw(<8 x i32> %a) { ; ; SSE41-LABEL: trunc8i32_8i8_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: movq %xmm0, (%rax) @@ -610,10 +486,8 @@ define void @trunc8i32_8i8_nuw(<8 x i32> %a) { ; AVX1-LABEL: trunc8i32_8i8_nuw: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -621,10 +495,8 @@ define void @trunc8i32_8i8_nuw(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i8_nuw: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -663,57 +535,20 @@ entry: } define void @trunc16i32_16i16_nsw(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i16_nsw: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i16_nsw: -; SSSE3: # 
%bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i16_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc16i32_16i16_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm2, (%rax) +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i32_16i16_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -721,10 +556,7 @@ define void @trunc16i32_16i16_nsw(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i16_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -773,12 +605,7 @@ define void @trunc16i32_16i16_nuw(<16 x i32> %a) { ; ; SSE41-LABEL: trunc16i32_16i16_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: movdqu %xmm2, (%rax) ; SSE41-NEXT: movdqu %xmm0, (%rax) @@ -786,11 +613,8 @@ define void @trunc16i32_16i16_nuw(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i16_nuw: ; AVX1: # %bb.0: # 
%entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) @@ -800,9 +624,6 @@ define void @trunc16i32_16i16_nuw(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i16_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) @@ -823,12 +644,7 @@ entry: define void @trunc16i32_16i8_nsw(<16 x i32> %a) { ; SSE2-SSSE3-LABEL: trunc16i32_16i8_nsw: ; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) @@ -836,12 +652,7 @@ define void @trunc16i32_16i8_nsw(<16 x i32> %a) { ; ; SSE41-LABEL: trunc16i32_16i8_nsw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: movdqu %xmm0, (%rax) @@ -849,11 +660,8 @@ define void @trunc16i32_16i8_nsw(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i8_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -863,9 +671,6 @@ define void @trunc16i32_16i8_nsw(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i8_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -888,12 +693,7 @@ entry: define void @trunc16i32_16i8_nuw(<16 x i32> %a) { ; SSE2-SSSE3-LABEL: trunc16i32_16i8_nuw: ; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: packuswb 
%xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) @@ -901,12 +701,7 @@ define void @trunc16i32_16i8_nuw(<16 x i32> %a) { ; ; SSE41-LABEL: trunc16i32_16i8_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packuswb %xmm2, %xmm0 ; SSE41-NEXT: movdqu %xmm0, (%rax) @@ -914,11 +709,8 @@ define void @trunc16i32_16i8_nuw(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i8_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -928,9 +720,6 @@ define void @trunc16i32_16i8_nuw(<16 x i32> %a) { ; ; AVX2-LABEL: trunc16i32_16i8_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -951,38 +740,24 @@ entry: } define void @trunc16i16_16i8_nsw(<16 x i16> %a) { -; SSE2-SSSE3-LABEL: trunc16i16_16i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i16_16i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc16i16_16i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i16_16i8_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc16i16_16i8_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1021,27 +796,14 @@ entry: } define void @trunc16i16_16i8_nuw(<16 x i16> %a) { -; SSE2-SSSE3-LABEL: trunc16i16_16i8_nuw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; 
SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i16_16i8_nuw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc16i16_16i8_nuw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc16i16_16i8_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) @@ -1050,7 +812,6 @@ define void @trunc16i16_16i8_nuw(<16 x i16> %a) { ; ; AVX2-LABEL: trunc16i16_16i8_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqu %xmm0, (%rax) @@ -1091,41 +852,20 @@ entry: } define void @trunc32i16_32i8_nsw(<32 x i16> %a) { -; SSE2-SSSE3-LABEL: trunc32i16_32i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc32i16_32i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc32i16_32i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm2, (%rax) +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc32i16_32i8_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1133,10 +873,7 @@ define void @trunc32i16_32i8_nsw(<32 x i16> %a) { ; ; AVX2-LABEL: trunc32i16_32i8_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1180,41 +917,20 @@ entry: } define void @trunc32i16_32i8_nuw(<32 x i16> %a) { -; SSE2-SSSE3-LABEL: trunc32i16_32i8_nuw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc32i16_32i8_nuw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: trunc32i16_32i8_nuw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqu %xmm2, (%rax) +; SSE-NEXT: movdqu %xmm0, (%rax) +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc32i16_32i8_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -1222,10 +938,7 @@ define void @trunc32i16_32i8_nuw(<32 x i16> %a) { ; ; AVX2-LABEL: trunc32i16_32i8_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1369,54 +1082,30 @@ entry: } define <8 x i16> @trunc2x4i64_8i16_nsw(<4 x i64> %a, <4 x i64> %b) { -; SSE2-SSSE3-LABEL: trunc2x4i64_8i16_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm0 -; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm2 -; SSE2-SSSE3-NEXT: psrad $16, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x4i64_8i16_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; SSE41-NEXT: 
packusdw %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x4i64_8i16_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc2x4i64_8i16_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc2x4i64_8i16_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1476,40 +1165,27 @@ define <8 x i16> @trunc2x4i64_8i16_nuw(<4 x i64> %a, <4 x i64> %b) { ; ; SSE41-LABEL: trunc2x4i64_8i16_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc2x4i64_8i16_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc2x4i64_8i16_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] -; 
AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1643,72 +1319,20 @@ entry: } define <8 x i16> @trunc2x4i32_8i16_nsw(<4 x i32> %a, <4 x i32> %b) { -; SSE2-LABEL: trunc2x4i32_8i16_nsw: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc2x4i32_8i16_nsw: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x4i32_8i16_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x4i32_8i16_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: trunc2x4i32_8i16_nsw: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc2x4i32_8i16_nsw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i32_8i16_nsw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i32_8i16_nsw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i32_8i16_nsw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x4i32_8i16_nsw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = trunc nsw <4 x i32> %a to <4 x i16> %1 = trunc nsw <4 x i32> %b to <4 x i16> @@ -1736,53 +1360,18 @@ define <8 x i16> @trunc2x4i32_8i16_nuw(<4 x i32> %a, <4 x i32> 
%b) { ; ; SSE41-LABEL: trunc2x4i32_8i16_nuw: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: trunc2x4i32_8i16_nuw: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc2x4i32_8i16_nuw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i32_8i16_nuw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i32_8i16_nuw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i32_8i16_nuw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x4i32_8i16_nuw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = trunc nuw <4 x i32> %a to <4 x i16> %1 = trunc nuw <4 x i32> %b to <4 x i16> @@ -1791,48 +1380,25 @@ entry: } define <32 x i8> @trunc2x16i16_32i8_nsw(<16 x i16> %a, <16 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x16i16_32i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: packuswb %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x16i16_32i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc2x16i16_32i8_nsw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: 
vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc2x16i16_32i8_nsw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1875,37 +1441,17 @@ entry: } define <32 x i8> @trunc2x16i16_32i8_nuw(<16 x i16> %a, <16 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nuw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x16i16_32i8_nuw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: packuswb %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: trunc2x16i16_32i8_nuw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc2x16i16_32i8_nuw: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1913,9 +1459,6 @@ define <32 x i8> @trunc2x16i16_32i8_nuw(<16 x i16> %a, <16 x i16> %b) { ; ; AVX2-LABEL: trunc2x16i16_32i8_nuw: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq @@ -1959,70 +1502,20 @@ entry: } define <16 x i8> @trunc2x8i16_16i8_nsw(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nsw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x8i16_16i8_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = 
[255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc2x8i16_16i8_nsw: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc2x8i16_16i8_nsw: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc2x8i16_16i8_nsw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x8i16_16i8_nsw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; SSE-LABEL: trunc2x8i16_16i8_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: retq ; -; AVX512BW-LABEL: trunc2x8i16_16i8_nsw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: trunc2x8i16_16i8_nsw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; AVX512BWVL-LABEL: trunc2x8i16_16i8_nsw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x8i16_16i8_nsw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = trunc nsw <8 x i16> %a to <8 x i8> %1 = trunc nsw <8 x i16> %b to <8 x i8> @@ -2031,70 +1524,20 @@ entry: } define <16 x i8> @trunc2x8i16_16i8_nuw(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nuw: -; SSE2-SSSE3: # %bb.0: # %entry -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x8i16_16i8_nuw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc2x8i16_16i8_nuw: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc2x8i16_16i8_nuw: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: 
vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc2x8i16_16i8_nuw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x8i16_16i8_nuw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; SSE-LABEL: trunc2x8i16_16i8_nuw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: retq ; -; AVX512BW-LABEL: trunc2x8i16_16i8_nuw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX-LABEL: trunc2x8i16_16i8_nuw: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; AVX512BWVL-LABEL: trunc2x8i16_16i8_nuw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x8i16_16i8_nuw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %0 = trunc nuw <8 x i16> %a to <8 x i8> %1 = trunc nuw <8 x i16> %b to <8 x i8> @@ -2103,54 +1546,23 @@ entry: } define i64 @trunc8i16_i64_nsw(<8 x i16> %inval) { -; SSE2-LABEL: trunc8i16_i64_nsw: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i16_i64_nsw: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i16_i64_nsw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i16_i64_nsw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq ; ; AVX-LABEL: trunc8i16_i64_nsw: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc8i16_i64_nsw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i16_i64_nsw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i16_i64_nsw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: 
trunc8i16_i64_nsw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc8i16_i64_nsw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq entry: %0 = trunc nsw <8 x i16> %inval to <8 x i8> %1 = bitcast <8 x i8> %0 to i64 @@ -2158,54 +1570,23 @@ entry: } define i64 @trunc8i16_i64_nuw(<8 x i16> %inval) { -; SSE2-LABEL: trunc8i16_i64_nuw: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i16_i64_nuw: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i16_i64_nuw: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq +; SSE-LABEL: trunc8i16_i64_nuw: +; SSE: # %bb.0: # %entry +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq ; ; AVX-LABEL: trunc8i16_i64_nuw: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc8i16_i64_nuw: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i16_i64_nuw: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i16_i64_nuw: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc8i16_i64_nuw: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc8i16_i64_nuw: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq entry: %0 = trunc nuw <8 x i16> %inval to <8 x i8> %1 = bitcast <8 x i8> %0 to i64