[X86] Use NSW/NUW flags on ISD::TRUNCATE nodes to improve X86 PACKSS/PACKUS lowering #123956
Conversation
…PACKUS lowering

If the NSW/NUW flags are present, then we can assume the source value is within bounds and saturation will not occur with the PACKSS/PACKUS instructions.

Fixes llvm#87485
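As a side illustration (not part of the patch; the function names are made up for the example), a minimal scalar model of the per-lane PACKSS/PACKUS semantics this reasoning relies on: with nsw/nuw the source already fits the destination range, so the saturating clamp can never fire and the pack is an exact truncation.

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of one PACKSSDW lane: i32 -> i16 with signed saturation.
int16_t packss_lane(int32_t X) {
  return static_cast<int16_t>(std::clamp<int32_t>(X, INT16_MIN, INT16_MAX));
}

// Scalar model of one PACKUSDW lane: i32 -> i16 with unsigned saturation.
uint16_t packus_lane(int32_t X) {
  return static_cast<uint16_t>(std::clamp<int32_t>(X, 0, UINT16_MAX));
}

// 'trunc nsw' promises X is already in [INT16_MIN, INT16_MAX], so
// packss_lane(X) == (int16_t)X and the clamp is a no-op; likewise
// 'trunc nuw' promises X is in [0, UINT16_MAX], so packus_lane(X) == (uint16_t)X.
```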
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the NSW/NUW flags are present, then we can assume the source value is within bounds and saturation will not occur with the PACKSS/PACKUS instructions.

Fixes #87485

Patch is 66.13 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123956.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a956074e50d86f..faea2adf520892 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20819,7 +20819,8 @@ static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
SDValue In, const SDLoc &DL,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ const SDNodeFlags Flags = SDNodeFlags()) {
// Requires SSE2.
if (!Subtarget.hasSSE2())
return SDValue();
@@ -20865,7 +20866,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// e.g. Masks, zext_in_reg, etc.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known = DAG.computeKnownBits(In);
- if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
+ if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
+ (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
PackOpcode = X86ISD::PACKUS;
return In;
}
@@ -20884,7 +20886,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
return SDValue();
unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
- if (MinSignBits < NumSignBits) {
+ if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
PackOpcode = X86ISD::PACKSS;
return In;
}
@@ -20906,10 +20908,9 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
/// This function lowers a vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
-static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
- const SDLoc &DL,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerTruncateVecPackWithSignBits(
+ MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
MVT SrcVT = In.getSimpleValueType();
MVT DstSVT = DstVT.getVectorElementType();
MVT SrcSVT = SrcVT.getVectorElementType();
@@ -20931,8 +20932,8 @@ static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
}
unsigned PackOpcode;
- if (SDValue Src =
- matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
+ if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
+ Subtarget, Flags))
return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
return SDValue();
@@ -21102,8 +21103,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
if (!Subtarget.hasAVX512() ||
(InVT.is512BitVector() && VT.is256BitVector()))
- if (SDValue SignPack =
- LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
+ if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
+ VT, In, DL, Subtarget, DAG, Op->getFlags()))
return SignPack;
// Pre-AVX512 see if we can make use of PACKSS/PACKUS.
@@ -21120,8 +21121,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
// concat from subvectors to use VPTRUNC etc.
if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
- if (SDValue SignPack =
- LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
+ if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
+ VT, In, DL, Subtarget, DAG, Op->getFlags()))
return SignPack;
// vpmovqb/w/d, vpmovdb/w, vpmovwb
@@ -33578,10 +33579,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// See if there are sufficient leading bits to perform a PACKUS/PACKSS.
unsigned PackOpcode;
- if (SDValue Src =
- matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
- if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
- dl, DAG, Subtarget)) {
+ if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
+ Subtarget, N->getFlags())) {
+ if (SDValue Res =
+ truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
Results.push_back(Res);
return;
diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
index 32c7e820c967b0..2b8eedfbbdc9c2 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
@@ -107,50 +107,28 @@ entry:
}
define <8 x i16> @trunc8i64_8i16_nsw(<8 x i64> %a) {
-; SSE2-SSSE3-LABEL: trunc8i64_8i16_nsw:
-; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE2-SSSE3-NEXT: pslld $16, %xmm2
-; SSE2-SSSE3-NEXT: psrad $16, %xmm2
-; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-SSSE3-NEXT: pslld $16, %xmm0
-; SSE2-SSSE3-NEXT: psrad $16, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i64_8i16_nsw:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i64_8i16_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16_nsw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i16_nsw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -179,23 +157,15 @@ define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) {
;
; SSE41-LABEL: trunc8i64_8i16_nuw:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16_nuw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -204,9 +174,6 @@ define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i16_nuw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -225,59 +192,34 @@ entry:
}
define void @trunc8i64_8i8_nsw(<8 x i64> %a) {
-; SSE2-SSSE3-LABEL: trunc8i64_8i8_nsw:
-; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i64_8i8_nsw:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i64_8i8_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rax)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i8_nsw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8_nsw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -296,12 +238,7 @@ entry:
define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
; SSE2-SSSE3-LABEL: trunc8i64_8i8_nuw:
; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
@@ -310,12 +247,7 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
;
; SSE41-LABEL: trunc8i64_8i8_nuw:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm0
@@ -324,11 +256,8 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
;
; AVX1-LABEL: trunc8i64_8i8_nuw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -339,9 +268,6 @@ define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i8_nuw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
@@ -363,44 +289,22 @@ entry:
}
define <8 x i16> @trunc8i32_8i16_nsw(<8 x i32> %a) {
-; SSE2-LABEL: trunc8i32_8i16_nsw:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i32_8i16_nsw:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i16_nsw:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i32_8i16_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_nsw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_nsw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -456,15 +360,11 @@ define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) {
;
; SSE41-LABEL: trunc8i32_8i16_nuw:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_nuw:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
@@ -472,9 +372,8 @@ define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) {
;
; AVX2-LABEL: trunc8i32_8i16_nuw:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -511,33 +410,18 @@ entry:
}
define void @trunc8i32_8i8_nsw(<8 x i32> %a) {
-; SSE2-SSSE3-LABEL: trunc8i32_8i8_nsw:
-; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i8_nsw:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc8i32_8i8_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: movq %xmm0, (%rax)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i8_nsw:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -545,10 +429,8 @@ define void @trunc8i32_8i8_nsw(<8 x i32> %a) {
; AVX2-LABEL: trunc8i32_8i8_nsw:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -589,9 +471,6 @@ entry:
define void @trunc8i32_8i8_nuw(<8 x i32> %a) {
; SSE2-SSSE3-LABEL: trunc8i32_8i8_nuw:
; SSE2-SSSE3:...
[truncated]
@@ -20865,7 +20866,8 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
   // e.g. Masks, zext_in_reg, etc.
   // Pre-SSE41 we can only use PACKUSWB.
   KnownBits Known = DAG.computeKnownBits(In);
-  if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
+  if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
+      (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
Can we take NSW/NUW into account in countMinLeadingZeros/ComputeNumSignBits so that other cases may benefit from it?
It'd be tricky, as it's the TRUNC user node that implies the minimum leading sign/zero count on the source node, not the source node itself. So we'd have to check the users of a node and retroactively adjust the analysis.

I'll do some experiments, but I'm not confident.
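A hypothetical sketch of that point (not from the PR): KnownBits only carries facts derivable from the value itself, so a bound implied by a `trunc nuw` user is invisible to computeKnownBits of the source unless the analysis also walks the source's users.

```cpp
#include "llvm/Support/KnownBits.h"

// Hypothetical illustration: an opaque 32-bit source has no proven leading
// zeros on its own, even when its only user is a 'trunc nuw ... to i16' that
// implies the top 16 bits must be zero - that fact is attached to the
// truncate node, not to the source value.
unsigned provenLeadingZeros() {
  llvm::KnownBits Src(32);            // nothing known about the source itself
  return Src.countMinLeadingZeros();  // 0 - the nuw-implied zeros are not here
}
```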
First thing I've noticed: as soon as the ISD::TRUNCATE node disappears we lose this extra analysis on the upper bits of the source node, which is acceptable for the x86 PACK nodes, but might not be for other nodes that were relying on access to that information.
Makes sense to me. Thanks for the investigation!
LGTM.