Skip to content

Commit 6292a14

Browse files
committed
[X86] Teach the DAGCombiner how to fold an OR of two shufflevector nodes.
This patch teaches the DAGCombiner how to fold a binary OR of two shufflevector nodes into a single shufflevector node when possible. The rules are: 1. fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1); 2. fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2). Because OR is commutative, the DAGCombiner can compute two possible shuffle masks (Mask1 and Mask2) for the resulting shuffle node. Before folding a DAG according to either rule 1 or rule 2, the DAGCombiner verifies that the resulting shuffle mask is legal for the target. The DAGCombiner first tries to fold according to rule 1; if that is not possible, it then tries to fold according to rule 2. If both Mask1 and Mask2 are illegal, we conservatively do not fold the OR instruction. llvm-svn: 203156
1 parent 39a0965 commit 6292a14

File tree

2 files changed

+321
-0
lines changed

2 files changed

+321
-0
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3200,6 +3200,60 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
32003200
return N0;
32013201
if (ISD::isBuildVectorAllOnes(N1.getNode()))
32023202
return N1;
3203+
3204+
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
3205+
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
3206+
// Do this only if the resulting shuffle is legal.
3207+
if (isa<ShuffleVectorSDNode>(N0) &&
3208+
isa<ShuffleVectorSDNode>(N1) &&
3209+
N0->getOperand(1) == N1->getOperand(1) &&
3210+
ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
3211+
bool CanFold = true;
3212+
unsigned NumElts = VT.getVectorNumElements();
3213+
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
3214+
const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
3215+
// We construct two shuffle masks:
3216+
// - Mask1 is a shuffle mask for a shuffle with N0 as the first operand
3217+
// and N1 as the second operand.
3218+
// - Mask2 is a shuffle mask for a shuffle with N1 as the first operand
3219+
// and N0 as the second operand.
3220+
// We do this because OR is commutable and therefore there might be
3221+
// two ways to fold this node into a shuffle.
3222+
SmallVector<int,4> Mask1;
3223+
SmallVector<int,4> Mask2;
3224+
3225+
for (unsigned i = 0; i != NumElts && CanFold; ++i) {
3226+
int M0 = SV0->getMaskElt(i);
3227+
int M1 = SV1->getMaskElt(i);
3228+
3229+
// Both shuffle indexes are undef. Propagate Undef.
3230+
if (M0 < 0 && M1 < 0) {
3231+
Mask1.push_back(M0);
3232+
Mask2.push_back(M0);
3233+
continue;
3234+
}
3235+
3236+
if (M0 < 0 || M1 < 0 ||
3237+
(M0 < (int)NumElts && M1 < (int)NumElts) ||
3238+
(M0 >= (int)NumElts && M1 >= (int)NumElts)) {
3239+
CanFold = false;
3240+
break;
3241+
}
3242+
3243+
Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts);
3244+
Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts);
3245+
}
3246+
3247+
if (CanFold) {
3248+
// Fold this sequence only if the resulting shuffle is 'legal'.
3249+
if (TLI.isShuffleMaskLegal(Mask1, VT))
3250+
return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0),
3251+
N1->getOperand(0), &Mask1[0]);
3252+
if (TLI.isShuffleMaskLegal(Mask2, VT))
3253+
return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0),
3254+
N0->getOperand(0), &Mask2[0]);
3255+
}
3256+
}
32033257
}
32043258

32053259
// fold (or x, undef) -> -1

llvm/test/CodeGen/X86/combine-or.ll

Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
2+
3+
4+
; Verify that each of the following test cases is folded into a single
5+
; instruction which performs a blend operation.
6+
7+
define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
8+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
9+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
10+
%or = or <2 x i64> %shuf1, %shuf2
11+
ret <2 x i64> %or
12+
}
13+
; CHECK-LABEL: test1
14+
; CHECK-NOT: xorps
15+
; CHECK: movsd
16+
; CHECK-NOT: orps
17+
; CHECK: ret
18+
19+
20+
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
21+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
22+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
23+
%or = or <4 x i32> %shuf1, %shuf2
24+
ret <4 x i32> %or
25+
}
26+
; CHECK-LABEL: test2
27+
; CHECK-NOT: xorps
28+
; CHECK: shufps
29+
; CHECK: ret
30+
31+
32+
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
33+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
34+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
35+
%or = or <2 x i64> %shuf1, %shuf2
36+
ret <2 x i64> %or
37+
}
38+
; CHECK-LABEL: test3
39+
; CHECK-NOT: xorps
40+
; CHECK: movsd
41+
; CHECK-NEXT: ret
42+
43+
44+
define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
45+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
46+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
47+
%or = or <4 x i32> %shuf1, %shuf2
48+
ret <4 x i32> %or
49+
}
50+
; CHECK-LABEL: test4
51+
; CHECK-NOT: xorps
52+
; CHECK: movss
53+
; CHECK-NOT: orps
54+
; CHECK: ret
55+
56+
57+
define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
58+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
59+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
60+
%or = or <4 x i32> %shuf1, %shuf2
61+
ret <4 x i32> %or
62+
}
63+
; CHECK-LABEL: test5
64+
; CHECK-NOT: xorps
65+
; CHECK: movss
66+
; CHECK-NEXT: ret
67+
68+
69+
define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
70+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
71+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
72+
%or = or <4 x i32> %shuf1, %shuf2
73+
ret <4 x i32> %or
74+
}
75+
; CHECK-LABEL: test6
76+
; CHECK-NOT: xorps
77+
; CHECK: shufps
78+
; CHECK-NEXT: ret
79+
80+
81+
define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
82+
%and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
83+
%and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
84+
%or = or <4 x i32> %and1, %and2
85+
ret <4 x i32> %or
86+
}
87+
; CHECK-LABEL: test7
88+
; CHECK-NOT: xorps
89+
; CHECK: shufps
90+
; CHECK-NEXT: ret
91+
92+
93+
define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
94+
%and1 = and <2 x i64> %a, <i64 -1, i64 0>
95+
%and2 = and <2 x i64> %b, <i64 0, i64 -1>
96+
%or = or <2 x i64> %and1, %and2
97+
ret <2 x i64> %or
98+
}
99+
; CHECK-LABEL: test8
100+
; CHECK-NOT: xorps
101+
; CHECK: movsd
102+
; CHECK-NOT: orps
103+
; CHECK: ret
104+
105+
106+
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
107+
%and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
108+
%and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
109+
%or = or <4 x i32> %and1, %and2
110+
ret <4 x i32> %or
111+
}
112+
; CHECK-LABEL: test9
113+
; CHECK-NOT: xorps
114+
; CHECK: shufps
115+
; CHECK: ret
116+
117+
118+
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
119+
%and1 = and <2 x i64> %a, <i64 0, i64 -1>
120+
%and2 = and <2 x i64> %b, <i64 -1, i64 0>
121+
%or = or <2 x i64> %and1, %and2
122+
ret <2 x i64> %or
123+
}
124+
; CHECK-LABEL: test10
125+
; CHECK-NOT: xorps
126+
; CHECK: movsd
127+
; CHECK-NEXT: ret
128+
129+
130+
define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
131+
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
132+
%and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
133+
%or = or <4 x i32> %and1, %and2
134+
ret <4 x i32> %or
135+
}
136+
; CHECK-LABEL: test11
137+
; CHECK-NOT: xorps
138+
; CHECK: movss
139+
; CHECK-NOT: orps
140+
; CHECK: ret
141+
142+
143+
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
144+
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
145+
%and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
146+
%or = or <4 x i32> %and1, %and2
147+
ret <4 x i32> %or
148+
}
149+
; CHECK-LABEL: test12
150+
; CHECK-NOT: xorps
151+
; CHECK: movss
152+
; CHECK-NEXT: ret
153+
154+
155+
; Verify that the following test cases are folded into single shuffles.
156+
157+
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
158+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
159+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
160+
%or = or <4 x i32> %shuf1, %shuf2
161+
ret <4 x i32> %or
162+
}
163+
; CHECK-LABEL: test13
164+
; CHECK-NOT: xorps
165+
; CHECK: shufps
166+
; CHECK-NEXT: ret
167+
168+
169+
define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
170+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
171+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
172+
%or = or <2 x i64> %shuf1, %shuf2
173+
ret <2 x i64> %or
174+
}
175+
; CHECK-LABEL: test14
176+
; CHECK-NOT: pslldq
177+
; CHECK-NOT: por
178+
; CHECK: punpcklqdq
179+
; CHECK-NEXT: ret
180+
181+
182+
define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
183+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
184+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
185+
%or = or <4 x i32> %shuf1, %shuf2
186+
ret <4 x i32> %or
187+
}
188+
; CHECK-LABEL: test15
189+
; CHECK-NOT: xorps
190+
; CHECK: shufps
191+
; CHECK-NOT: shufps
192+
; CHECK-NOT: orps
193+
; CHECK: ret
194+
195+
196+
define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
197+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
198+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
199+
%or = or <2 x i64> %shuf1, %shuf2
200+
ret <2 x i64> %or
201+
}
202+
; CHECK-LABEL: test16
203+
; CHECK-NOT: pslldq
204+
; CHECK-NOT: por
205+
; CHECK: punpcklqdq
206+
; CHECK: ret
207+
208+
209+
; Verify that the dag-combiner does not fold an OR of two shuffles into a single
210+
; shuffle instruction when the shuffle indexes are not compatible.
211+
212+
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
213+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
214+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
215+
%or = or <4 x i32> %shuf1, %shuf2
216+
ret <4 x i32> %or
217+
}
218+
; CHECK-LABEL: test17
219+
; CHECK: por
220+
; CHECK-NEXT: ret
221+
222+
223+
define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
224+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
225+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
226+
%or = or <4 x i32> %shuf1, %shuf2
227+
ret <4 x i32> %or
228+
}
229+
; CHECK-LABEL: test18
230+
; CHECK: orps
231+
; CHECK: ret
232+
233+
234+
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
235+
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
236+
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
237+
%or = or <4 x i32> %shuf1, %shuf2
238+
ret <4 x i32> %or
239+
}
240+
; CHECK-LABEL: test19
241+
; CHECK: por
242+
; CHECK-NEXT: ret
243+
244+
245+
define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
246+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
247+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
248+
%or = or <2 x i64> %shuf1, %shuf2
249+
ret <2 x i64> %or
250+
}
251+
; CHECK-LABEL: test20
252+
; CHECK-NOT: xorps
253+
; CHECK: orps
254+
; CHECK-NEXT: ret
255+
256+
257+
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
258+
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
259+
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
260+
%or = or <2 x i64> %shuf1, %shuf2
261+
ret <2 x i64> %or
262+
}
263+
; CHECK-LABEL: test21
264+
; CHECK: por
265+
; CHECK-NEXT: ret
266+
267+

0 commit comments

Comments
 (0)