diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 34ba46f5e6cfd..d3c923a76d074 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4719,6 +4719,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
       if (auto KindCost = Entry->Cost[CostKind])
         return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
+
+    // Without arg data, we need to compute the expanded costs of custom lowered
+    // intrinsics to prevent use of the (very low) default costs.
+    if (ICA.isTypeBasedOnly() &&
+        (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      InstructionCost Cost = 0;
+      Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      return Cost;
+    }
   }
 
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c98d872fb6467..a6674100654db 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9031,9 +9031,7 @@
   FastMathFlags FMF;
   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
     FMF = FPCI->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->args());
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
   auto IntrinsicCost =
     TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 153191b1eea08..3b526c4537243 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshl_v16i32() { ; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshl_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshl_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = 
call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshl_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshl_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshl_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshl_v16i32() { } define void @fshl_v32i16() { -; SSE2-LABEL: @fshl_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, 
i32 0, i64 13), align 2 -; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) -; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) -; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) -; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) -; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) -; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) -; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) -; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) -; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) -; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) -; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) -; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) -; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) -; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) -; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) -; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) -; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) -; 
SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) -; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) -; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) -; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) -; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) -; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) -; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) -; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) -; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) -; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) -; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) -; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) -; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) -; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) -; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) -; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2 -; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 -; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 -; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 -; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 -; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 -; SSE2-NEXT: store 
i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 -; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 -; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 -; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 -; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 -; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 -; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 -; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 -; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 -; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 -; SSE2-NEXT: ret void -; -; SSE4-LABEL: @fshl_v32i16( -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2 -; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) -; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE4-NEXT: ret void +; SSE-LABEL: @fshl_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x 
i16], ptr @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, 
ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) +; SSE-NEXT: [[R8:%.*]] 
= call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) +; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) +; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) +; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) +; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) +; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) +; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) +; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) +; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) +; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) +; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) +; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) +; SSE-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) +; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) +; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) +; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) +; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) +; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) +; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) +; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) +; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) +; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) +; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) +; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) +; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2 +; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 +; SSE-NEXT: 
store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @fshl_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 @@ -944,52 +955,16 @@ define void @fshl_v64i8() { } define void @fshl_v2i32() { -; SSE-LABEL: @fshl_v2i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32( -; AVX2-NEXT: 
[[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshl_v2i32( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 @@ -1011,44 +986,14 @@ define void @fshl_v2i32() { ; PR63980 define void @fshl_v2i32_uniformconst() { -; SSE-LABEL: @fshl_v2i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32_uniformconst( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = 
load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshl_v2i32_uniformconst( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32_uniformconst( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32_uniformconst( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32_uniformconst( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index 4d50ffad7f8b5..aae540b4b2454 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256 +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshr_v16i32() { ; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshr_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshr_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshr_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: 
[[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshr_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshr_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshr_v16i32() { } define void @fshr_v32i16() { -; SSE2-LABEL: @fshr_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; 
SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
-; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
-; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
-; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
-; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
-; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
-; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
-; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
-; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
-; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
-; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
-; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
-; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
-; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
-; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
-; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
-; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
-; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
-; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
-; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
-; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
-; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
-; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
-; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
-; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
-; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
-; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
-; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
-; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
-; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
-; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
-; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
-; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
-; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
-; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
-; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
-; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
-; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
-; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
-; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
-; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
-; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
-; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
-; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
-; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
-; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
-; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
-; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
-; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
-; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
-; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
-; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
-; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
-; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
-; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
-; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
-; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
-; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
-; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
-; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
-; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
-; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
-; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
-; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
-; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
-; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
-; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
-; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
-; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
-; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
-; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
-; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
-; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
-; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
-; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
-; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2
-; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
-; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
-; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
-; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
-; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
-; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2
-; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2
-; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
-; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2
-; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
-; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
-; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
-; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
-; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
-; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
-; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
-; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
-; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
-; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
-; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
-; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
-; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
-; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
-; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
-; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
-; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
-; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
-; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
-; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
-; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
-; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
-; SSE2-NEXT: ret void
-;
-; SSE4-LABEL: @fshr_v32i16(
-; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2
-; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
-; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
-; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
-; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
-; SSE4-NEXT: ret void
+; SSE-LABEL: @fshr_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
+; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
+; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
+; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
+; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
+; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
+; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
+; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
+; SSE-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
+; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
+; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
+; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
+; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
+; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
+; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
+; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
+; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
+; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
+; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
+; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
+; SSE-NEXT: [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
+; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
+; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
+; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
+; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
+; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
+; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
+; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
+; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
+; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
+; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
+; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
+; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2
+; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
;
; AVX-LABEL: @fshr_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
@@ -944,52 +955,16 @@ define void @fshr_v64i8() {
}
define void @fshr_v2i32() {
-; SSE-LABEL: @fshr_v2i32(
-; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
-; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
-; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
-; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
-; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4
-; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; SSE-NEXT: ret void
-;
-; AVX1-LABEL: @fshr_v2i32(
-; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
-; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
-; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
-; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
-; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4
-; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @fshr_v2i32(
-; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
-; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
-; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
-; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
-; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4
-; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; AVX2-NEXT: ret void
-;
-; AVX256-LABEL: @fshr_v2i32(
-; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
-; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
-; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
-; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @fshr_v2i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
-; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
-; AVX512-NEXT: ret void
+; CHECK-LABEL: @fshr_v2i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
+; CHECK-NEXT: ret void
;
; AVX512VBMI2-LABEL: @fshr_v2i32(
; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
@@ -1011,44 +986,14 @@ define void @fshr_v2i32() {
; PR63980
define void @fshr_v2i32_uniformconst() {
-; SSE-LABEL: @fshr_v2i32_uniformconst(
-; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
-; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
-; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4
-; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; SSE-NEXT: ret void
-;
-; AVX1-LABEL: @fshr_v2i32_uniformconst(
-; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
-; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
-; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4
-; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @fshr_v2i32_uniformconst(
-; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
-; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
-; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4
-; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
-; AVX2-NEXT: ret void
-;
-; AVX256-LABEL: @fshr_v2i32_uniformconst(
-; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
-; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
-; AVX256-NEXT: ret void
-;
-; AVX512-LABEL: @fshr_v2i32_uniformconst(
-; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
-; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
-; AVX512-NEXT: ret void
+; CHECK-LABEL: @fshr_v2i32_uniformconst(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1)
+; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1)
+; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4
+; CHECK-NEXT: ret void
;
; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst(
; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4