diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1c790f3813b7a..5675d8c20e68c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ObjCARCUtil.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -29766,6 +29765,113 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, return SDValue(); } +// Given a vector of values, find a permutation such that every adjacent even- +// odd pair has the same value. ~0 is reserved as a special wildcard value, +// which can be paired with any value. Returns true if a permutation is found. +// If the output Permutation is not empty, permutation indices start at its +// previous size, so that this function can concatenate the results of +// multiple calls. +// UnpairedInputs contains values yet to be paired, mapping an unpaired value to +// its current neighbor's value and index. +// Do not use llvm::DenseMap because ~0 is a reserved key. +template <typename InputTy, typename PermutationTy, + typename MapTy = + SmallMapVector<typename InputTy::value_type, + std::pair<typename InputTy::value_type, + typename PermutationTy::value_type>, + 8>> +static bool PermuteAndPairVector( + const InputTy &Inputs, PermutationTy &Permutation, + MapTy UnpairedInputs = MapTy()) { + const typename InputTy::value_type Wildcard = ~0; + SmallVector<typename PermutationTy::value_type> WildcardPairs; + + size_t OutputOffset = Permutation.size(); + typename PermutationTy::value_type I = 0; + for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); + InputIt != InputEnd;) { + Permutation.push_back(OutputOffset + I); + Permutation.push_back(OutputOffset + I + 1); + + auto Even = *InputIt++; + assert(InputIt != InputEnd && "Expected even number of elements"); + auto Odd = *InputIt++; + + // If both are wildcards, note it for later use by unpairable values. + if (Even == Wildcard && Odd == Wildcard) { + WildcardPairs.push_back(I); + } + + // If both are equal, they are already in a good position. + if (Even != Odd) { + auto DoWork = [&](auto &This, auto ThisIndex, auto Other, + auto OtherIndex) { + if (This != Wildcard) { + // For a non-wildcard value, check if it can pair with an existing + // unpaired value from UnpairedInputs. If so, swap it with the + // unpaired value's neighbor; otherwise add the current value to the + // map. + if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace( + This, std::make_pair(Other, OtherIndex)); + !Inserted) { + auto [SwapValue, SwapIndex] = MapIt->second; + std::swap(Permutation[OutputOffset + SwapIndex], + Permutation[OutputOffset + ThisIndex]); + This = SwapValue; + UnpairedInputs.erase(MapIt); + + if (This == Other) { + if (This == Wildcard) { + // We freed up a wildcard pair by pairing two non-adjacent + // values, note it for later use by unpairable values. + WildcardPairs.push_back(I); + } else { + // The swapped element also forms a pair with Other, so it can + // be removed from the map. + assert(UnpairedInputs.count(This)); + UnpairedInputs.erase(This); + } + } else { + // Swapped in an unpaired value, update its info. + if (This != Wildcard) { + assert(UnpairedInputs.count(This)); + UnpairedInputs[This] = std::make_pair(Other, OtherIndex); + } + // If its neighbor is also in UnpairedInputs, update its info too.
+ if (auto OtherMapIt = UnpairedInputs.find(Other); + OtherMapIt != UnpairedInputs.end() && + OtherMapIt->second.second == ThisIndex) { + OtherMapIt->second.first = This; + } + } + } + } + }; + DoWork(Even, I, Odd, I + 1); + if (Even != Odd) { + DoWork(Odd, I + 1, Even, I); + } + } + I += 2; + } + + // Now check if each remaining pair of unpaired neighboring values can be + // swapped with a wildcard pair to form two paired values. + for (auto &[Unpaired, V] : UnpairedInputs) { + auto [Neighbor, NeighborIndex] = V; + if (Neighbor != Wildcard) { + assert(UnpairedInputs.count(Neighbor)); + if (WildcardPairs.size()) { + std::swap(Permutation[OutputOffset + WildcardPairs.back()], + Permutation[OutputOffset + NeighborIndex]); + WildcardPairs.pop_back(); + // Mark the neighbor as processed. + UnpairedInputs[Neighbor].first = Wildcard; + } else + return false; + } + } + return true; +} + static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -30044,6 +30150,145 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } } + // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant + // amounts can be shuffled such that every pair or quad of adjacent elements + // has the same value. This introduces an extra shuffle before and after the + // shift, and it is profitable if the operand is already a shuffle so that both + // can be merged or the extra shuffle is fast. + // (shift (shuffle X P1) S1) -> + // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be + // widened, and P2^-1 is the inverse shuffle of P2. + // This is not profitable on XOP or AVX512 because they have 8/16-bit vector + // variable shift instructions. + // GFNI is singled out because it normally implies AVX512, and there is no + // latency data for CPUs with GFNI and only SSE or AVX, but there are tests + // for such combinations anyway. + if (ConstantAmt && + (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && + R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() && + !Subtarget.hasXOP() && !Subtarget.hasGFNI()) { + constexpr size_t LaneBytes = 16; + const size_t NumLanes = VT.getVectorNumElements() / LaneBytes; + + SmallVector<int> Permutation; + SmallVector<uint8_t> ShiftAmt; + for (size_t I = 0; I < Amt.getNumOperands(); ++I) { + if (Amt.getOperand(I).isUndef()) + ShiftAmt.push_back(~0); + else { + auto A = Amt.getConstantOperandVal(I); + ShiftAmt.push_back(A > 8 ? 8 : A); + } + } + + // Check if we can find an in-lane shuffle to rearrange the shift amounts; + // if so, this transformation may be profitable. A cross-lane shuffle is + // almost never profitable because there is no general 1-instruction + // solution. + bool Profitable; + for (size_t I = 0; I < NumLanes; ++I) { + if (!(Profitable = PermuteAndPairVector( + ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation))) + break; + } + + // For AVX2, check if we can further rearrange shift amounts into adjacent + // quads, so that we can use VPS*LVD instead of VPMUL*W, which is 2 cycles + // faster. + bool IsAdjacentQuads = false; + if (Profitable && Subtarget.hasAVX2()) { + SmallVector<uint8_t> EveryOtherShiftAmt; + for (size_t I = 0; I < Permutation.size(); I += 2) { + uint8_t Shift1 = ShiftAmt[Permutation[I]]; + uint8_t Shift2 = ShiftAmt[Permutation[I + 1]]; + assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 || + Shift2 == (uint8_t) ~0); + EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ?
Shift1 : Shift2); + } + SmallVector<int> Permutation2; + for (size_t I = 0; I < NumLanes; ++I) { + if (!(IsAdjacentQuads = PermuteAndPairVector( + ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], + LaneBytes / 2), + Permutation2))) + break; + } + if (IsAdjacentQuads) { + SmallVector<int> CombinedPermutation; + for (int Index : Permutation2) { + CombinedPermutation.push_back(Permutation[Index * 2]); + CombinedPermutation.push_back(Permutation[Index * 2 + 1]); + } + std::swap(Permutation, CombinedPermutation); + } + } + + // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an + // amount of 0, making it unprofitable. + if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) && + any_of(ShiftAmt, [](uint8_t x) { return x == 0; })) + Profitable = false; + + bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE; + // If operand R is a shuffle, one of the two shuffles introduced by this + // transformation can be merged with it, and the extra shuffle costs 1 cycle. + // This is generally profitable because it eliminates one (or both) of the + // vector multiplications, which have to be scheduled at least 1 cycle apart. + // If operand R is not a shuffle, several cases are not profitable based on + // pipeline modeling, so we exclude them here. + if (!IsOperandShuffle) { + // A hack to detect AMD Zen-series CPUs. + if (Subtarget.hasSSE4A()) { + if (!IsAdjacentQuads) + Profitable = false; + // A hack to detect Zen+ and Zen 2, where VPSRLVD is 2 cycles slower + // than on Zen 3, so this transformation should not be used. + else if (!Subtarget.hasVAES()) + Profitable = false; + } else { + if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) || + (Subtarget.hasAVX2() && !IsAdjacentQuads)) + Profitable = false; + } + } + + // If the shuffle is the identity, do not insert it. This also prevents the + // transformation from being applied recursively. + if (llvm::equal(Permutation, llvm::seq(Permutation.size()))) + Profitable = false; + + // Found a permutation P that can rearrange the shift amounts into adjacent + // pairs or quads of equal values. Rewrite the shift S1(x) into + // P^-1(S2(P(x))). + if (Profitable) { + SDValue InnerShuffle = + DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation); + SmallVector<SDValue> NewShiftAmt; + for (int Index : Permutation) { + NewShiftAmt.push_back(Amt.getOperand(Index)); + } + // If using (V)PMULHUW, any undef pair is resolved to a shift by 8 so that + // it does not create extra instructions in case it is resolved to 0. + for (size_t I = 0; I < NewShiftAmt.size(); I += 2) { + SDValue &Even = NewShiftAmt[I]; + SDValue &Odd = NewShiftAmt[I + 1]; + assert(Even.isUndef() || Odd.isUndef() || + Even->getAsZExtVal() == Odd->getAsZExtVal()); + if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef()) + Even = DAG.getConstant(8, dl, VT.getScalarType()); + } + + SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt); + SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector); + SmallVector<int> InversePermutation(Permutation.size()); + for (size_t I = 0; I < Permutation.size(); ++I) { + InversePermutation[Permutation[I]] = I; + } + SDValue OuterShuffle = DAG.getVectorShuffle( + VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation); + return OuterShuffle; + } + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
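
To make the pairing requirement concrete: the transformation above only fires when the constant shift amounts admit a permutation in which every even/odd pair of adjacent elements is equal, with undef acting as a wildcard. The following is a small standalone sketch, not part of the patch; the helper name and the sample amounts are invented for illustration, with 0xFF standing in for the undef wildcard.

#include <cassert>
#include <cstdint>
#include <vector>

// Returns true if applying Perm to Amts makes every adjacent even/odd pair
// equal, where 0xFF is a wildcard that pairs with anything. This is the
// property the patch establishes before widening vXi8 shifts to vXi16/vXi32.
static bool IsPairedUnderPermutation(const std::vector<uint8_t> &Amts,
                                     const std::vector<int> &Perm) {
  assert(Amts.size() == Perm.size() && Amts.size() % 2 == 0);
  for (size_t I = 0; I < Perm.size(); I += 2) {
    uint8_t A = Amts[Perm[I]], B = Amts[Perm[I + 1]];
    if (A != 0xFF && B != 0xFF && A != B)
      return false;
  }
  return true;
}

int main() {
  // shl <4 x i8> x, <1, 2, 1, 2>: the identity order is not paired, but the
  // permutation {0, 2, 1, 3} groups equal amounts, so the shift can be done
  // on two widened i16 lanes (multiply by 2 and by 4) plus a byte mask.
  std::vector<uint8_t> Amts = {1, 2, 1, 2};
  assert(!IsPairedUnderPermutation(Amts, {0, 1, 2, 3}));
  assert(IsPairedUnderPermutation(Amts, {0, 2, 1, 3}));
  return 0;
}

Once such a permutation exists, each paired lane can be shifted as one 16-bit element (or a quad as one 32-bit element on AVX2), and the outer shuffle with the inverse permutation restores the original element order.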
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 2b392e69297f0..b14c839a6f1f1 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -351,32 +351,20 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2] -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9,1,2,7,4,12,11,3,8,0,14,6,5,13,10,15] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,512,2048,4096,256,16384,8192,512] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128] -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [16384,32768,8192,4096,256,1024,2048,32768] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,64,64,16,16,8,8,u,u,2,2,4,4,64,64] +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[1,2,7,4,12,11,3],zero,xmm2[0,14,6,5,13,10,15] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8: @@ -2184,39 +2172,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-LABEL: non_splat_minus_one_divisor_1: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: psllw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = 
xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,6,4,5,3,7,12,9,10,11,15,13,14,8] +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,512,256,256,512,512,32768,512] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: psraw $8, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $1, %xmm3 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: psraw $8, %xmm2 -; SSE41-NEXT: psllw $7, %xmm2 -; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,32768,256,256,32768,32768,512,32768] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,64,64,u,u,u,u,64,64,64,64,1,1,64,u] +; SSE41-NEXT: pxor %xmm1, %xmm2 +; SSE41-NEXT: psubb %xmm1, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,xmm2[9,10,11,8,13,14,12] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: non_splat_minus_one_divisor_1: @@ -2253,25 +2225,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; ; AVX2-LABEL: non_splat_minus_one_divisor_1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2] -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, 
%ymm1, %xmm2 -; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,8,2,6,4,5,3,7,12,9,10,11,15,13,0,1] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64,64,64,1,1,0,0,64,64,64,64,1,1,0,0] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,xmm1[9,10,11,8,13,0,12] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: non_splat_minus_one_divisor_1: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 7903781d63523..b82587a06b580 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2010,13 +2010,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: packuswb %xmm1, %xmm3 ; SSE41-NEXT: paddb %xmm0, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64] -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0] +; SSE41-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,13,3,11,5,9,7,10,6,12,4,14,2,1,15] +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,16,4,1,2,8,32,64] +; SSE41-NEXT: pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,14,13,3,11,5,9,7,1,6,8,4,10,2,12,15] ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 19bbf7dc0a0e1..dbf38ec73c6ee 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -262,22 +262,20 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw ; ; X86-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8] -; X86-SSE4-NEXT: psllw $8, %xmm1 -; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0] +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15] +; X86-SSE4-NEXT: pshufb %xmm1, %xmm0 +; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,4,2,8,1,4,2,8] +; X86-SSE4-NEXT: pshufb %xmm1, %xmm0 ; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; 
X86-SSE4-NEXT: por %xmm1, %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8] -; X64-SSE4-NEXT: psllw $8, %xmm1 -; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0] +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15] +; X64-SSE4-NEXT: pshufb %xmm1, %xmm0 +; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,2,8,1,4,2,8] +; X64-SSE4-NEXT: pshufb %xmm1, %xmm0 ; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE4-NEXT: por %xmm1, %xmm0 ; X64-SSE4-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: @@ -287,12 +285,11 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw ; ; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8] -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] +; X64-AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index 2ec9de0cb447f..b58ab80f475ed 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -2042,12 +2042,13 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256] -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5] +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [128,64,32,16,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v4i8: @@ -2105,17 +2106,29 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { } define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE-LABEL: constant_shift_v2i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: constant_shift_v2i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256] +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: constant_shift_v2i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,256,256,256,256,256,256,8192] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm1 = [32,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: ; AVX1: # %bb.0: @@ -2130,12 +2143,14 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256] -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,16,16,0,32,0,16,0,32,0,16,0,32,0,0,0] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index fe349e9ff995d..f6291ea4ae45c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1744,12 +1744,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256] -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5] +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v4i8: @@ -1819,13 +1817,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,256,256,256,256,256,256,8192] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v2i8: @@ -1840,12 +1836,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256] -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 902bf8a0e55ce..0e20d83d81759 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -1162,12 +1162,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1] -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [14,1,12,3,10,5,8,7,6,9,4,11,2,13,0,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,8,32,128,64,16,4,1] +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll 
b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index a44120b6d038c..21b6f301d58c3 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1437,11 +1437,11 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,12,4,10,6,8,7,9,5,11,3,13,1,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,16,64,128,32,8,2] +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v8i8: @@ -1526,11 +1526,11 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,12,4,5,6,7,8,9,10,11,3,13,1,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,256,256,256,256,8,2] +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: constant_shift_v4i8: @@ -1544,12 +1544,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5] +; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v4i8: @@ -1615,11 +1613,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v2i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u] +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,256,256,256,256,8] +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; 
AVX1-LABEL: constant_shift_v2i8: @@ -1633,12 +1631,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { ; ; AVX2-LABEL: constant_shift_v2i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [4,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15] +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: constant_shift_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-shift-widen.ll b/llvm/test/CodeGen/X86/vector-shift-widen.ll new file mode 100644 index 0000000000000..ec2441c04cfb9 --- /dev/null +++ b/llvm/test/CodeGen/X86/vector-shift-widen.ll @@ -0,0 +1,290 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,-avx,-avx2 | FileCheck %s --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ZNVER1 +; +; Check the permutation of a variable shift with i8 vector into a widened shift. +; + +; Transform only occurs on SSSE3 because operand is not a shuffle, and shift +; amounts cannot be rearranged to quads. Not checking the correctness of +; untransformed variants here as they are covered by other vector shift checks. 
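
Before the individual tests, a note on what the widened sequence computes: pairing two equal i8 shift amounts lets one 16-bit lane shift (the pmullw/pmulhuw or vpsllvd/vpsrlvd in the checks below) do the work of two byte shifts, and the per-byte pand mask afterwards clears the bits that crossed the byte boundary. The scalar model below is an illustration only, not code from the patch.

#include <cassert>
#include <cstdint>

// Two adjacent i8 lanes sharing a shift amount are shifted as one u16 lane;
// the per-byte mask plays the role of the pand in the checks, clearing bits
// that spilled across the byte boundary.
static void ShlPairViaU16(uint8_t &Lo, uint8_t &Hi, unsigned S) {
  uint16_t W = uint16_t(Lo) | uint16_t(uint16_t(Hi) << 8);
  W = uint16_t(W << S);                        // one widened shift covers both bytes
  Lo = uint8_t(W) & uint8_t(0xFFu << S);       // mask is a no-op for the low byte
  Hi = uint8_t(W >> 8) & uint8_t(0xFFu << S);  // clears Lo's bits that spilled into Hi
}

int main() {
  for (int A = 0; A < 256; ++A)
    for (int B = 0; B < 256; ++B)
      for (unsigned S = 0; S < 8; ++S) {
        uint8_t Lo = uint8_t(A), Hi = uint8_t(B);
        ShlPairViaU16(Lo, Hi, S);
        assert(Lo == uint8_t(A << S) && Hi == uint8_t(B << S));
      }
  return 0;
}
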
+define <16 x i8> @shl_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: shl_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [8,1,2,12,4,5,6,7,0,9,10,11,3,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,256,256,8,256,16,32] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; AVX-LABEL: shl_v16i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: shl_v16i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = shl <16 x i8> %a, + ret <16 x i8> %shift +} + +define <16 x i8> @lshr_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: lshr_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[2,1,4,3,6,5,8,7,10,9,12,11,14,13,0,15] +; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,2048,8192,16384,32768,8192,2048,4096] +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15] +; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: retq +; +; AVX-LABEL: lshr_v16i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: lshr_v16i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = lshr <16 x i8> %a, + ret <16 x i8> %shift +} + +define <16 x i8> @ashr_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: ashr_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,9,11,7,8,13,10,6,1,14,5,15] +; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,8192,512,8192,4096,1024,32768,2048] +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,14,11,7,8,5,10,6,1,9,13,15] +; SSSE3-NEXT: pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [32,64,16,16,1,4,2,16,8,1,u,16,32,8,64,4] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX-LABEL: ashr_v16i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: ashr_v16i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = ashr <16 x i8> %a, + ret <16 x i8> %shift +} + +; Shift amounts cannot be paired. +define <16 x i8> @not_shl_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: not_shl_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NOT: pshufb +; SSSE3-NOT: vpshufb +; SSSE3: retq +; +; AVX-LABEL: not_shl_v16i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: not_shl_v16i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = shl <16 x i8> %a, + ret <16 x i8> %shift +} + +; Right shift amounts containing zero and cannot form quads. +define <16 x i8> @not_lshr_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: not_lshr_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NOT: pshufb +; SSSE3-NOT: vpshufb +; SSSE3: retq +; +; AVX-LABEL: not_lshr_v16i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: not_lshr_v16i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = lshr <16 x i8> %a, + ret <16 x i8> %shift +} + +; Shift cannot form quads and operand is not shuffle, only transform on SSSE3. 
+define <32 x i8> @shl_v32i8(<32 x i8> %a) { +; SSSE3-LABEL: shl_v32i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # xmm2 = [0,2,1,3,6,5,4,7,8,9,12,11,10,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # xmm3 = [1,4,8,2,16,32,64,16] +; SSSE3-NEXT: pmullw %xmm3, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # xmm4 = [255,252,255,252,254,248,248,254,240,240,192,224,224,192,240,240] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pmullw %xmm3, %xmm1 +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: retq +; +; AVX-LABEL: shl_v32i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: shl_v32i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = shl <32 x i8> %a, + ret <32 x i8> %shift +} + +; For quads only testing on AVX2 as it has vps**vd. +define <32 x i8> @shl_v32i8_quad(<32 x i8> %a) { +; AVX2-LABEL: shl_v32i8_quad: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31] +; AVX2-NEXT: vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31] +; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = shl <32 x i8> %a, + ret <32 x i8> %shift +} + +define <32 x i8> @lshr_v32i8_quad(<32 x i8> %a) { +; AVX2-LABEL: lshr_v32i8_quad: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31] +; AVX2-NEXT: vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31] +; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq + %shift = lshr <32 x i8> %a, + ret <32 x i8> %shift +} + +; Disabling the transform for AMD Zen because it can schedule two vpmullw 2 +; cycles faster compared to Intel. +define <32 x i8> @ashr_v32i8_quad(<32 x i8> %a) { +; AVX2-LABEL: ashr_v32i8_quad: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31] +; AVX2-NEXT: vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31] +; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 # ymm1 = [128,32,8,2,8,128,2,32,32,128,8,2,2,128,8,32,64,16,4,1,64,16,4,1,1,4,16,64,1,4,16,64] +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; ZNVER1-LABEL: ashr_v32i8_quad: +; ZNVER1: # %bb.0: +; ZNVER1-NOT: pshufb +; ZNVER1-NOT: vpshufb +; ZNVER1: retq + %shift = ashr <32 x i8> %a, + ret <32 x i8> %shift +} + +; Shift amounts cannot be paired in lane. 
+define <32 x i8> @not_shl_v32i8(<32 x i8> %a) { +; SSSE3-LABEL: not_shl_v32i8: +; SSSE3: # %bb.0: +; SSSE3-NOT: pshufb +; SSSE3-NOT: vpshufb +; SSSE3: retq +; +; AVX-LABEL: not_shl_v32i8: +; AVX: # %bb.0: +; AVX-NOT: pshufb +; AVX-NOT: vpshufb +; AVX: retq +; +; AVX2-LABEL: not_shl_v32i8: +; AVX2: # %bb.0: +; AVX2-NOT: pshufb +; AVX2-NOT: vpshufb +; AVX2: retq + %shift = shl <32 x i8> %a, + ret <32 x i8> %shift +} + +; Always transform if operand is shuffle and shift amounts can be paired. +define <16 x i8> @lshr_shuffle_v16i8(<16 x i8> %a) { +; SSSE3-LABEL: lshr_shuffle_v16i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15] +; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048] +; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15] +; SSSE3-NEXT: pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; AVX-LABEL: lshr_shuffle_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15] +; AVX-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048] +; AVX-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15] +; AVX-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: lshr_shuffle_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15] +; AVX2-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048] +; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15] +; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; ZNVER1-LABEL: lshr_shuffle_v16i8: +; ZNVER1: # %bb.0: +; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15] +; ZNVER1-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048] +; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15] +; ZNVER1-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ZNVER1-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4] +; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; ZNVER1-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> + %shift = ashr <16 x i8> %shuffle, + ret <16 x i8> %shift +}
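
A note on the pxor/psubb tails in the ashr checks above: the arithmetic shift is produced from an unsigned widened shift followed by the standard sign fixup ashr(x, s) == (lshr(x, s) ^ m) - m with m = 0x80 >> s, which is where constant vectors such as [64,32,...] come from. The scalar check below is an illustration only, not code from the patch.

#include <cassert>
#include <cstdint>

// Arithmetic right shift of an i8 emulated with a logical shift plus the
// xor/sub sign fixup that appears as pxor+psubb in the ashr checks:
//   ashr(x, s) == (lshr(x, s) ^ m) - m   with m = 0x80 >> s
static int8_t AshrViaLshr(uint8_t X, unsigned S) {
  uint8_t M = uint8_t(0x80u >> S);
  uint8_t L = uint8_t(X >> S);
  return int8_t(uint8_t((L ^ M) - M));
}

int main() {
  // Exhaustively verify the identity over all i8 values and shift amounts.
  for (int X = -128; X < 128; ++X)
    for (unsigned S = 0; S < 8; ++S)
      assert(AshrViaLshr(uint8_t(X), S) == int8_t(int8_t(X) >> S));
  return 0;
}
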