diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 9ddd8db1eebf2..2b63cf22f24cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -113,9 +113,63 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
   llvm_unreachable("unknown operand type");
 }
 
-void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+// Lower true16 D16 Pseudo instruction to d16_lo/d16_hi MCInst based on
+// Dst/Data's .l/.h selection
+void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
+                                          MCInst &OutMI) const {
   unsigned Opcode = MI->getOpcode();
   const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const auto *Info = AMDGPU::getT16D16Helper(Opcode);
+
+  uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
+  if (TII->isDS(Opcode)) {
+    if (MI->mayLoad())
+      OpName = llvm::AMDGPU::OpName::vdst;
+    else if (MI->mayStore())
+      OpName = llvm::AMDGPU::OpName::data0;
+    else
+      llvm_unreachable("LDS load or store expected");
+  } else {
+    OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
+                 ? llvm::AMDGPU::OpName::vdata
+                 : llvm::AMDGPU::OpName::vdst;
+  }
+
+  // select Dst/Data
+  int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
+  const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
+
+  // select hi/lo MCInst
+  bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
+  Opcode = IsHi ? Info->HiOp : Info->LoOp;
+
+  int MCOpcode = TII->pseudoToMCOpcode(Opcode);
+  assert(MCOpcode != -1 &&
+         "Pseudo instruction doesn't have a target-specific version");
+  OutMI.setOpcode(MCOpcode);
+
+  // lower operands
+  for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
+    const MachineOperand &MO = MI->getOperand(I);
+    MCOperand MCOp;
+    if (I == VDstOrVDataIdx)
+      MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
+    else
+      lowerOperand(MO, MCOp);
+    OutMI.addOperand(MCOp);
+  }
+
+  if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
+    MCOperand MCOp;
+    lowerOperand(MIVDstOrVData, MCOp);
+    OutMI.addOperand(MCOp);
+  }
+}
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  unsigned Opcode = MI->getOpcode();
+  const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
 
   // FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
   // need to select it to the subtarget specific version, and there's no way to
@@ -136,6 +190,9 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
       Opcode == AMDGPU::SI_TCRETURN_GFX) {
     // TODO: How to use branch immediate and avoid register+add?
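In rough terms, the helper keys the true16 pseudo off a generated table and picks the d16_hi opcode when the destination is the high (.h) half of a 16-bit register, the d16 (lo) opcode otherwise, re-emitting the 16-bit vdst/vdata as its containing 32-bit VGPR. A minimal stand-alone sketch of that selection follows; the opcode values and map-based table are stand-ins, not LLVM code:

// Illustrative C++ only: models the decision lowerT16D16Helper makes.
// In-tree, the table is TableGen-generated and the hi/lo test is
// AMDGPU::isHi16Reg on the vdst/vdata register.
#include <cassert>
#include <cstdio>
#include <map>

enum Opcode {
  FLAT_LOAD_SHORT_D16_t16, // true16 pseudo with a 16-bit vdst
  FLAT_LOAD_SHORT_D16,     // real lo-half instruction
  FLAT_LOAD_SHORT_D16_HI,  // real hi-half instruction
};

struct True16D16Info {
  Opcode T16Op, HiOp, LoOp;
};

// Stand-in for the generated getT16D16Helper() searchable-table lookup.
const True16D16Info *getT16D16Helper(Opcode Op) {
  static const std::map<Opcode, True16D16Info> Table = {
      {FLAT_LOAD_SHORT_D16_t16,
       {FLAT_LOAD_SHORT_D16_t16, FLAT_LOAD_SHORT_D16_HI,
        FLAT_LOAD_SHORT_D16}}};
  auto It = Table.find(Op);
  return It == Table.end() ? nullptr : &It->second;
}

// Core of the patch: a .h destination selects HiOp, a .l destination LoOp.
Opcode selectMCOpcode(Opcode Pseudo, bool DstIsHiHalf) {
  const True16D16Info *Info = getT16D16Helper(Pseudo);
  assert(Info && "not a true16 D16 pseudo");
  return DstIsHiHalf ? Info->HiOp : Info->LoOp;
}

int main() {
  std::printf("%d\n", selectMCOpcode(FLAT_LOAD_SHORT_D16_t16, false)); // lo op
  std::printf("%d\n", selectMCOpcode(FLAT_LOAD_SHORT_D16_t16, true));  // hi op
}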
     Opcode = AMDGPU::S_SETPC_B64;
+  } else if (AMDGPU::getT16D16Helper(Opcode)) {
+    lowerT16D16Helper(MI, OutMI);
+    return;
   }
 
   int MCOpcode = TII->pseudoToMCOpcode(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 7176cc5d3439b..5ddf1ca2ab06d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -39,6 +39,8 @@ class AMDGPUMCInstLower {
 
   /// Lower a MachineInstr to an MCInst
   void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
 };
 
 namespace {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8fa708b74dde3..ea6e703eba5d9 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -16,6 +16,12 @@ let WantsRoot = true in {
   def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
 }
 
+class True16D16Table <string hiOp, string loOp> {
+  Instruction T16Op = !cast<Instruction>(NAME);
+  Instruction HiOp = !cast<Instruction>(hiOp);
+  Instruction LoOp = !cast<Instruction>(loOp);
+}
+
 //===----------------------------------------------------------------------===//
 // FLAT classes
 //===----------------------------------------------------------------------===//
@@ -226,6 +232,12 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, bit HasTiedOutpu
   let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
+multiclass FLAT_Load_Pseudo_t16 <string opName> {
+  def "" : FLAT_Load_Pseudo <opName, VGPR_32, 1>;
+  let True16Predicate = UseRealTrue16Insts in
+  def _t16 : FLAT_Load_Pseudo <opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
+}
+
 class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
                          bit HasTiedInput = 0> : FLAT_Pseudo<
   opName,
@@ -662,12 +674,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
 
 let SubtargetPredicate = HasD16LoadStore in {
 let TiedSourceNotRead = 1 in {
-def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
 def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
+defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
 def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
+defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
 def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
+defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
 }
 
 def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1049,6 +1061,11 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
   (inst $vaddr, $offset, 0, $in)
 >;
 
+class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
+  (inst $vaddr, $offset, (i32 0))
+>;
+
 class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
   (inst $vaddr, $offset, 0, $in)
 >;
@@ -1371,16 +1388,29 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
 def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
 def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
+  def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+  def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+  def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
+  def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
+  def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
+  def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
+}
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
+  def : FlatLoadPat_D16_t16 <FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
+  def : FlatLoadPat_D16_t16 <FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
+  def : FlatLoadPat_D16_t16 <FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
+  def : FlatLoadPat_D16_t16 <FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
+} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
+
 def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
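The new FlatLoadPat_D16_t16 pattern only ever selects the lo-half pseudo; the hi-half opcode is substituted later, during MC lowering, once the destination turns out to be a .h register. The split is sound because of the D16 semantics named by D16PreservesUnusedBits: a d16 load writes one 16-bit half of a 32-bit VGPR and leaves the other half untouched, which is also why the real instructions carry a tied vdst_in operand. A small model of just those bit semantics (my illustration, not LLVM code):

// Sketch of the register behavior the d16 patterns rely on: each load
// updates one half of a 32-bit VGPR and preserves the other half.
#include <cstdint>
#include <cstdio>

// Model a 32-bit VGPR receiving a 16-bit loaded value.
uint32_t loadD16Lo(uint32_t OldVGPR, uint16_t Loaded) {
  return (OldVGPR & 0xFFFF0000u) | Loaded;                     // flat_load_*_d16
}
uint32_t loadD16Hi(uint32_t OldVGPR, uint16_t Loaded) {
  return (OldVGPR & 0x0000FFFFu) | (uint32_t(Loaded) << 16);   // flat_load_*_d16_hi
}

int main() {
  uint32_t V1 = 0xAAAABBBBu;
  std::printf("%08X\n", (unsigned)loadD16Lo(V1, 0x1234)); // AAAA1234: hi kept
  std::printf("%08X\n", (unsigned)loadD16Hi(V1, 0x1234)); // 1234BBBB: lo kept
}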
@@ -2761,3 +2791,11 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
 
 defm SCRATCH_LOAD_BLOCK  : VSCRATCH_Real_AllAddr_gfx12<0x53>;
 defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
+
+def True16D16Table : GenericTable {
+  let FilterClass = "True16D16Table";
+  let CppTypeName = "True16D16Info";
+  let Fields = ["T16Op", "HiOp", "LoOp"];
+  let PrimaryKey = ["T16Op"];
+  let PrimaryKeyName = "getT16D16Helper";
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..49e003516b6e7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2483,8 +2490,15 @@ class getHasExt <VOPProfile P, ValueType DstVT = i32, ValueType Src0VT = i32,
 
 class getLdStRegisterOperand <RegisterClass RC> {
+  // This type of operands is only used in pseudo instructions helping
+  // code generation and thus doesn't need encoding and decoding methods.
+  // It also doesn't need to support AGPRs, because GFX908/A/40 do not
+  // support True16.
+  defvar VLdSt_16 = RegisterOperand<VGPR_16>;
+
   RegisterOperand ret =
-    !cond(!eq(RC.Size, 32) : AVLdSt_32,
+    !cond(!eq(RC.Size, 16) : VLdSt_16,
+          !eq(RC.Size, 32) : AVLdSt_32,
           !eq(RC.Size, 64) : AVLdSt_64,
           !eq(RC.Size, 96) : AVLdSt_96,
           !eq(RC.Size, 128) : AVLdSt_128,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 59afcbed35294..c521d0dd3ad2d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -430,6 +430,7 @@ struct FP4FP8DstByteSelInfo {
 #define GET_VOPDPairs_IMPL
 #define GET_VOPTrue16Table_DECL
 #define GET_VOPTrue16Table_IMPL
+#define GET_True16D16Table_IMPL
 #define GET_WMMAOpcode2AddrMappingTable_DECL
 #define GET_WMMAOpcode2AddrMappingTable_IMPL
 #define GET_WMMAOpcode3AddrMappingTable_DECL
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fad7e67ff3c76..ad0d2198eb636 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -109,6 +109,12 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
   unsigned Opcode;
 };
 
+struct True16D16Info {
+  unsigned T16Op;
+  unsigned HiOp;
+  unsigned LoOp;
+};
+
 #define GET_MIMGBaseOpcode_DECL
 #define GET_MIMGDim_DECL
 #define GET_MIMGEncoding_DECL
@@ -119,6 +125,7 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
 #define GET_MAIInstInfoTable_DECL
 #define GET_isMFMA_F8F6F4Table_DECL
 #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
+#define GET_True16D16Table_DECL
 #include "AMDGPUGenSearchableTables.inc"
 
 namespace IsaInfo {
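GET_True16D16Table_IMPL pulls in the code TableGen emits for the GenericTable defined above; searchable tables of this kind come out as an array sorted on the primary key plus a lookup function (here getT16D16Helper) that binary-searches it and returns null when an opcode has no entry. Roughly, with placeholder opcode numbers (this is an emulation, not the generated code):

// Rough emulation of AMDGPUGenSearchableTables.inc output for this table.
#include <algorithm>
#include <cstdio>
#include <iterator>

struct True16D16Info {
  unsigned T16Op;
  unsigned HiOp;
  unsigned LoOp;
};

// Rows must stay sorted by T16Op, the declared PrimaryKey.
const True16D16Info True16D16Table[] = {
    {100, 110, 120}, // e.g. a *_D16_t16 pseudo -> its _D16_HI / _D16 forms
    {101, 111, 121},
    {102, 112, 122},
};

const True16D16Info *getT16D16Helper(unsigned T16Op) {
  const True16D16Info *End = std::end(True16D16Table);
  const True16D16Info *It =
      std::lower_bound(std::begin(True16D16Table), End, T16Op,
                       [](const True16D16Info &Row, unsigned Key) {
                         return Row.T16Op < Key;
                       });
  return (It != End && It->T16Op == T16Op) ? It : nullptr;
}

int main() {
  if (const True16D16Info *Info = getT16D16Helper(101))
    std::printf("hi=%u lo=%u\n", Info->HiOp, Info->LoOp); // hi=111 lo=121
  std::printf("%s\n", getT16D16Helper(999) ? "found" : "null"); // null
}

This mirrors how the lower() change uses the table: a non-null result is what routes an instruction into lowerT16D16Helper.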
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 21a2ae80574e0..db9a89a2a7370 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <2 x half> @chain_hi_to_lo_private() {
 ; GFX900-LABEL: chain_hi_to_lo_private:
@@ -156,14 +158,23 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: chain_hi_to_lo_arithmatic:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 bb:
   %arith_lo = fadd half %in, 1.0
   %load_hi = load half, ptr addrspace(5) %base
@@ -361,18 +372,31 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: chain_hi_to_lo_flat:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr null, i64 1
   %load_lo = load half, ptr %gep_lo
@@ -403,14 +427,23 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr %base_lo
   %load_hi = load half, ptr %base_hi
@@ -864,17 +897,31 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
-; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
+; GFX11-TRUE16: ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
   %load_lo = load volatile i16, ptr addrspace(0) %gep_lo
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 4c68b8d35260f..91f9aa1c5fe3b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -2,8 +2,9 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-FAKE16 %s
 
 ; GCN-LABEL: {{^}}store_flat_i32:
 ; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
@@ -224,7 +225,8 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
 ; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
+; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
+; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
 define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
   %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
   %val = load volatile i8, ptr %fptr.offset
@@ -234,7 +236,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
 ; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1:
 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
+; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
+; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
+; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
 define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
   %fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
   %val = load volatile i8, ptr %fptr.offset
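The test updates show the end-to-end effect: with +real-true16, a 16-bit load whose destination is a low half now selects flat_load_d16_b16 / flat_load_d16_u8, writing only the low 16 bits of the named VGPR, while -real-true16 keeps the old flat_load_u16 / flat_load_u8. A toy rendering of how the printed mnemonic and the 32-bit register relate to the half-register destination (the register modeling here is illustrative only, not how LLVM represents registers):

// Illustration: a true16 16-bit destination is one half of a 32-bit VGPR;
// the emitted instruction names the 32-bit register (TRI.get32BitRegister)
// and the hi/lo choice is carried by the mnemonic.
#include <cstdio>

struct Reg16 {
  unsigned Vgpr; // containing 32-bit VGPR index, e.g. 0 for v0
  bool Hi;       // true for v<N>.h, false for v<N>.l
};

// v0.l -> "flat_load_d16_b16 v0, ..."; v0.h -> "flat_load_d16_hi_b16 v0, ..."
void printFlatLoadB16(Reg16 Dst) {
  std::printf("flat_load_d16%s_b16 v%u, v[0:1]\n", Dst.Hi ? "_hi" : "",
              Dst.Vgpr);
}

int main() {
  printFlatLoadB16({0, false}); // matches the GFX11-TRUE16 lo-half loads
  printFlatLoadB16({0, true});  // matches the existing d16_hi loads
}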