@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
71
71
; GFX10: ; %bb.0:
72
72
; GFX10-NEXT: v_mov_b32_e32 v0, 0
73
73
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74
+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
74
75
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
75
76
; GFX10-NEXT: s_waitcnt vmcnt(0)
76
77
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
77
78
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
78
79
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
79
- ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
80
- ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
81
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
80
+ ; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
81
+ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
82
82
; GFX10-NEXT: v_mov_b32_e32 v0, 0
83
83
; GFX10-NEXT: v_mov_b32_e32 v1, 0
84
- ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
84
+ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
85
85
; GFX10-NEXT: global_store_short v[0:1], v2, off
86
86
; GFX10-NEXT: s_endpgm
87
87
;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
175
175
; GFX10: ; %bb.0:
176
176
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
177
177
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
178
+ ; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
178
179
; GFX10-NEXT: s_waitcnt vmcnt(0)
179
180
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
180
181
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
181
182
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
182
- ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
183
- ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
184
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
183
+ ; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
184
+ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
185
185
; GFX10-NEXT: v_mov_b32_e32 v0, 0
186
186
; GFX10-NEXT: v_mov_b32_e32 v1, 0
187
- ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
187
+ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
188
188
; GFX10-NEXT: global_store_short v[0:1], v2, off
189
189
; GFX10-NEXT: s_endpgm
190
190
;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
277
277
; GFX10: ; %bb.0:
278
278
; GFX10-NEXT: v_mov_b32_e32 v1, 0
279
279
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
280
+ ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
280
281
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
281
282
; GFX10-NEXT: s_waitcnt vmcnt(0)
282
283
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
283
284
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
284
285
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
285
- ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
286
- ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
286
+ ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
287
+ ; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
287
288
; GFX10-NEXT: v_mov_b32_e32 v0, 0
288
289
; GFX10-NEXT: v_mov_b32_e32 v1, 0
289
- ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
290
- ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
290
+ ; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
291
291
; GFX10-NEXT: global_store_short v[0:1], v2, off
292
292
; GFX10-NEXT: s_endpgm
293
293
;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
383
383
; GFX10: ; %bb.0:
384
384
; GFX10-NEXT: v_mov_b32_e32 v1, 0
385
385
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
386
+ ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
386
387
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
387
388
; GFX10-NEXT: s_waitcnt vmcnt(0)
388
389
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
389
390
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
390
391
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
391
- ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
392
- ; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
392
+ ; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
393
+ ; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo
393
394
; GFX10-NEXT: v_mov_b32_e32 v0, 0
394
395
; GFX10-NEXT: v_mov_b32_e32 v1, 0
395
- ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
396
- ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
396
+ ; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
397
397
; GFX10-NEXT: global_store_short v[0:1], v2, off
398
398
; GFX10-NEXT: s_endpgm
399
399
;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
487
487
; GFX10: ; %bb.0:
488
488
; GFX10-NEXT: v_mov_b32_e32 v2, 0
489
489
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
490
+ ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
490
491
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
491
492
; GFX10-NEXT: s_waitcnt vmcnt(0)
492
493
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
493
494
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
494
495
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
495
- ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
496
+ ; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
496
497
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
497
498
; GFX10-NEXT: v_mov_b32_e32 v0, 0
498
499
; GFX10-NEXT: v_mov_b32_e32 v1, 0
499
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
500
500
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501
501
; GFX10-NEXT: global_store_short v[0:1], v2, off
502
502
; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
590
590
; GFX10: ; %bb.0:
591
591
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
592
592
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
593
+ ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
593
594
; GFX10-NEXT: s_waitcnt vmcnt(0)
594
595
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
595
596
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
596
597
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
597
- ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
598
+ ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
598
599
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
599
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
600
600
; GFX10-NEXT: v_mov_b32_e32 v0, 0
601
601
; GFX10-NEXT: v_mov_b32_e32 v1, 0
602
602
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
689
689
; GFX10: ; %bb.0:
690
690
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
691
691
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
692
+ ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
692
693
; GFX10-NEXT: s_waitcnt vmcnt(0)
693
694
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
694
695
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
695
696
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
696
- ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
697
+ ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
697
698
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
698
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
699
699
; GFX10-NEXT: v_mov_b32_e32 v0, 0
700
700
; GFX10-NEXT: v_mov_b32_e32 v1, 0
701
701
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
788
788
; GFX10: ; %bb.0:
789
789
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
790
790
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
791
+ ; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
791
792
; GFX10-NEXT: s_waitcnt vmcnt(0)
792
793
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
793
794
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
794
795
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
795
- ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
796
+ ; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
796
797
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
797
- ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
798
798
; GFX10-NEXT: v_mov_b32_e32 v0, 0
799
799
; GFX10-NEXT: v_mov_b32_e32 v1, 0
800
800
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
0 commit comments