Skip to content

Commit 8416529

Browse files
committed
Update Peephole SDWA, undoing some test file changes made by the previous commit
1 parent f3440db commit 8416529

16 files changed

+255
-279
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -735,7 +735,9 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
735735
case AMDGPU::V_ASHRREV_I16_e32:
736736
case AMDGPU::V_LSHLREV_B16_e32:
737737
case AMDGPU::V_LSHRREV_B16_e64:
738+
case AMDGPU::V_LSHRREV_B16_vop3_e64:
738739
case AMDGPU::V_ASHRREV_I16_e64:
740+
case AMDGPU::V_LSHLREV_B16_vop3_e64:
739741
case AMDGPU::V_LSHLREV_B16_e64: {
740742
// from: v_lshrrev_b16_e32 v1, 8, v0
741743
// to SDWA src:v0 src_sel:BYTE_1
@@ -758,11 +760,13 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
758760
break;
759761

760762
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
763+
Opcode == AMDGPU::V_LSHLREV_B16_vop3_e64 ||
761764
Opcode == AMDGPU::V_LSHLREV_B16_e64)
762765
return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
763766
return std::make_unique<SDWASrcOperand>(
764767
Src1, Dst, BYTE_1, false, false,
765768
Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
769+
Opcode != AMDGPU::V_LSHRREV_B16_vop3_e64 &&
766770
Opcode != AMDGPU::V_LSHRREV_B16_e64);
767771
break;
768772
}

llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -864,25 +864,25 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
864864
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865865
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
866866
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2
867-
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0
867+
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
868868
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
869869
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
870870
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
871-
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
871+
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
872872
; GFX10-NEXT: v_and_b32_e32 v4, 7, v4
873873
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
874874
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
875875
; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
876-
; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
877-
; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6
876+
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
877+
; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
878+
; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5
878879
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
879-
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
880-
; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
881-
; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
882-
; GFX10-NEXT: v_lshrrev_b16 v1, v5, v1
883-
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
880+
; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3
881+
; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
882+
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
883+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
884884
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
885-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
885+
; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
886886
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
887887
; GFX10-NEXT: s_setpc_b64 s[30:31]
888888
;

llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -864,25 +864,25 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
864864
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865865
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
866866
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
867-
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
867+
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
868868
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
869869
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
870-
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
870+
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
871871
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
872872
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
873-
; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6
873+
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5
874874
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
875-
; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
875+
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
876876
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
877-
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6
878-
; GFX10-NEXT: v_lshlrev_b16 v4, v5, v4
879-
; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
877+
; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
878+
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
879+
; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
880880
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
881-
; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
882-
; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0
883-
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
881+
; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
882+
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
883+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
884884
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
885-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
885+
; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
886886
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
887887
; GFX10-NEXT: s_setpc_b64 s[30:31]
888888
;

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
7171
; GFX10: ; %bb.0:
7272
; GFX10-NEXT: v_mov_b32_e32 v0, 0
7373
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74+
; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
7475
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
7576
; GFX10-NEXT: s_waitcnt vmcnt(0)
7677
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
7778
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
7879
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
79-
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
80-
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
81-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
80+
; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
81+
; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
8282
; GFX10-NEXT: v_mov_b32_e32 v0, 0
8383
; GFX10-NEXT: v_mov_b32_e32 v1, 0
84-
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
84+
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
8585
; GFX10-NEXT: global_store_short v[0:1], v2, off
8686
; GFX10-NEXT: s_endpgm
8787
;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
175175
; GFX10: ; %bb.0:
176176
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
177177
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
178+
; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
178179
; GFX10-NEXT: s_waitcnt vmcnt(0)
179180
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
180181
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
181182
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
182-
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
183-
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
184-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
183+
; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
184+
; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
185185
; GFX10-NEXT: v_mov_b32_e32 v0, 0
186186
; GFX10-NEXT: v_mov_b32_e32 v1, 0
187-
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
187+
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
188188
; GFX10-NEXT: global_store_short v[0:1], v2, off
189189
; GFX10-NEXT: s_endpgm
190190
;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
277277
; GFX10: ; %bb.0:
278278
; GFX10-NEXT: v_mov_b32_e32 v1, 0
279279
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
280+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
280281
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
281282
; GFX10-NEXT: s_waitcnt vmcnt(0)
282283
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
283284
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
284285
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
285-
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
286-
; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
286+
; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
287+
; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
287288
; GFX10-NEXT: v_mov_b32_e32 v0, 0
288289
; GFX10-NEXT: v_mov_b32_e32 v1, 0
289-
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
290-
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
290+
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
291291
; GFX10-NEXT: global_store_short v[0:1], v2, off
292292
; GFX10-NEXT: s_endpgm
293293
;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
383383
; GFX10: ; %bb.0:
384384
; GFX10-NEXT: v_mov_b32_e32 v1, 0
385385
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
386+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
386387
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
387388
; GFX10-NEXT: s_waitcnt vmcnt(0)
388389
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
389390
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
390391
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
391-
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
392-
; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
392+
; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
393+
; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo
393394
; GFX10-NEXT: v_mov_b32_e32 v0, 0
394395
; GFX10-NEXT: v_mov_b32_e32 v1, 0
395-
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
396-
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
396+
; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
397397
; GFX10-NEXT: global_store_short v[0:1], v2, off
398398
; GFX10-NEXT: s_endpgm
399399
;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
487487
; GFX10: ; %bb.0:
488488
; GFX10-NEXT: v_mov_b32_e32 v2, 0
489489
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
490+
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
490491
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
491492
; GFX10-NEXT: s_waitcnt vmcnt(0)
492493
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
493494
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
494495
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
495-
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
496+
; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
496497
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
497498
; GFX10-NEXT: v_mov_b32_e32 v0, 0
498499
; GFX10-NEXT: v_mov_b32_e32 v1, 0
499-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
500500
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501501
; GFX10-NEXT: global_store_short v[0:1], v2, off
502502
; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
590590
; GFX10: ; %bb.0:
591591
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
592592
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
593+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
593594
; GFX10-NEXT: s_waitcnt vmcnt(0)
594595
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
595596
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
596597
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
597-
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
598+
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
598599
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
599-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
600600
; GFX10-NEXT: v_mov_b32_e32 v0, 0
601601
; GFX10-NEXT: v_mov_b32_e32 v1, 0
602602
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
689689
; GFX10: ; %bb.0:
690690
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
691691
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
692+
; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
692693
; GFX10-NEXT: s_waitcnt vmcnt(0)
693694
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
694695
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
695696
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
696-
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
697+
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
697698
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
698-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
699699
; GFX10-NEXT: v_mov_b32_e32 v0, 0
700700
; GFX10-NEXT: v_mov_b32_e32 v1, 0
701701
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
788788
; GFX10: ; %bb.0:
789789
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
790790
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
791+
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
791792
; GFX10-NEXT: s_waitcnt vmcnt(0)
792793
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
793794
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
794795
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
795-
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
796+
; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
796797
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
797-
; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
798798
; GFX10-NEXT: v_mov_b32_e32 v0, 0
799799
; GFX10-NEXT: v_mov_b32_e32 v1, 0
800800
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -315,8 +315,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
315315
; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
316316
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
317317
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
318-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
319-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
318+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
320319
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
321320
; GFX10-NEXT: s_setpc_b64 s[30:31]
322321
;
@@ -459,8 +458,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
459458
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
460459
; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
461460
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
462-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
463-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
461+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
464462
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
465463
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
466464
; GFX10-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2579,27 +2579,26 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
25792579
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
25802580
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2
25812581
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2
2582-
; GFX10-NEXT: s_xor_b32 s1, s11, s2
25832582
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
25842583
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3
2584+
; GFX10-NEXT: s_xor_b32 s1, s11, s2
25852585
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
2586-
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
25872586
; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s3, v3
2587+
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
25882588
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
25892589
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
2590-
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
25912590
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
2591+
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
25922592
; GFX10-NEXT: s_xor_b32 s0, s12, s10
2593-
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
2594-
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
2593+
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
25952594
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
2595+
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
25962596
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
2597-
; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
2598-
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
2597+
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
25992598
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
2599+
; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
26002600
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
2601-
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
2602-
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
2601+
; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
26032602
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
26042603
; GFX10-NEXT: v_mov_b32_e32 v1, 0
26052604
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -316,8 +316,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
316316
; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
317317
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
318318
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
319-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
320-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
319+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
321320
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
322321
; GFX10-NEXT: s_setpc_b64 s[30:31]
323322
;
@@ -460,8 +459,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
460459
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
461460
; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
462461
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
463-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
464-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
462+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
465463
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
466464
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
467465
; GFX10-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -315,8 +315,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
315315
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
316316
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
317317
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
318-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
319-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
318+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
320319
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
321320
; GFX10-NEXT: s_setpc_b64 s[30:31]
322321
;
@@ -440,8 +439,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
440439
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
441440
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
442441
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
443-
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444-
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
442+
; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445443
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
446444
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
447445
; GFX10-NEXT: ; return to shader part epilog

0 commit comments

Comments
 (0)