mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
AMDGPU: Distribute SGPR->VGPR copies of REG_SEQUENCE
Make the REG_SEQUENCE be a VGPR, and do the register class copy first. llvm-svn: 251855
This commit is contained in:
parent
0eb4964368
commit
6d010fa207
@ -85,18 +85,6 @@ class SIFixSGPRCopies : public MachineFunctionPass {
|
||||
|
||||
private:
|
||||
static char ID;
|
||||
std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
|
||||
getCopyRegClasses(const MachineInstr &Copy,
|
||||
const SIRegisterInfo &TRI,
|
||||
const MachineRegisterInfo &MRI) const;
|
||||
|
||||
bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) const;
|
||||
|
||||
bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) const;
|
||||
|
||||
public:
|
||||
SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
|
||||
@ -134,10 +122,10 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
|
||||
SIFixSGPRCopies::getCopyRegClasses(const MachineInstr &Copy,
|
||||
const SIRegisterInfo &TRI,
|
||||
const MachineRegisterInfo &MRI) const {
|
||||
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
|
||||
getCopyRegClasses(const MachineInstr &Copy,
|
||||
const SIRegisterInfo &TRI,
|
||||
const MachineRegisterInfo &MRI) {
|
||||
unsigned DstReg = Copy.getOperand(0).getReg();
|
||||
unsigned SrcReg = Copy.getOperand(1).getReg();
|
||||
|
||||
@ -157,18 +145,94 @@ SIFixSGPRCopies::getCopyRegClasses(const MachineInstr &Copy,
|
||||
return std::make_pair(SrcRC, DstRC);
|
||||
}
|
||||
|
||||
bool SIFixSGPRCopies::isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) const {
|
||||
static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) {
|
||||
return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
|
||||
}
|
||||
|
||||
bool SIFixSGPRCopies::isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) const {
|
||||
static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
|
||||
const TargetRegisterClass *DstRC,
|
||||
const SIRegisterInfo &TRI) {
|
||||
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
|
||||
}
|
||||
|
||||
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
|
||||
//
|
||||
// SGPRx = ...
|
||||
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
|
||||
// VGPRz = COPY SGPRy
|
||||
//
|
||||
// ==>
|
||||
//
|
||||
// VGPRx = COPY SGPRx
|
||||
// VGPRz = REG_SEQUENCE VGPRx, sub0
|
||||
//
|
||||
// This exposes immediate folding opportunities when materializing 64-bit
|
||||
// immediates.
|
||||
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
|
||||
const SIRegisterInfo *TRI,
|
||||
const SIInstrInfo *TII,
|
||||
MachineRegisterInfo &MRI) {
|
||||
assert(MI.isRegSequence());
|
||||
|
||||
unsigned DstReg = MI.getOperand(0).getReg();
|
||||
if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
|
||||
return false;
|
||||
|
||||
if (!MRI.hasOneUse(DstReg))
|
||||
return false;
|
||||
|
||||
MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
|
||||
if (!CopyUse.isCopy())
|
||||
return false;
|
||||
|
||||
const TargetRegisterClass *SrcRC, *DstRC;
|
||||
std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
|
||||
|
||||
if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
|
||||
return false;
|
||||
|
||||
// TODO: Could have multiple extracts?
|
||||
unsigned SubReg = CopyUse.getOperand(1).getSubReg();
|
||||
if (SubReg != AMDGPU::NoSubRegister)
|
||||
return false;
|
||||
|
||||
MRI.setRegClass(DstReg, DstRC);
|
||||
|
||||
// SGPRx = ...
|
||||
// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
|
||||
// VGPRz = COPY SGPRy
|
||||
|
||||
// =>
|
||||
// VGPRx = COPY SGPRx
|
||||
// VGPRz = REG_SEQUENCE VGPRx, sub0
|
||||
|
||||
MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
|
||||
|
||||
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
|
||||
unsigned SrcReg = MI.getOperand(I).getReg();
|
||||
unsigned SrcSubReg = MI.getOperand(I).getReg();
|
||||
|
||||
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
|
||||
assert(TRI->isSGPRClass(SrcRC) &&
|
||||
"Expected SGPR REG_SEQUENCE to only have SGPR inputs");
|
||||
|
||||
SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
|
||||
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
|
||||
|
||||
unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
|
||||
|
||||
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
|
||||
.addOperand(MI.getOperand(I));
|
||||
|
||||
MI.getOperand(I).setReg(TmpReg);
|
||||
}
|
||||
|
||||
CopyUse.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
const SIRegisterInfo *TRI =
|
||||
@ -273,8 +337,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||
}
|
||||
case AMDGPU::REG_SEQUENCE: {
|
||||
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
|
||||
!hasVGPROperands(MI, TRI))
|
||||
!hasVGPROperands(MI, TRI)) {
|
||||
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
|
||||
continue;
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
|
||||
|
||||
|
@ -3,8 +3,7 @@
|
||||
|
||||
; Use a 64-bit value with lo bits that can be represented as an inline constant
|
||||
; CHECK-LABEL: {{^}}i64_imm_inline_lo:
|
||||
; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
|
||||
; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
|
||||
; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
|
||||
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
|
||||
define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
|
||||
entry:
|
||||
@ -14,8 +13,7 @@ entry:
|
||||
|
||||
; Use a 64-bit value with hi bits that can be represented as an inline constant
|
||||
; CHECK-LABEL: {{^}}i64_imm_inline_hi:
|
||||
; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
|
||||
; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
|
||||
; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
|
||||
; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
|
||||
define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
|
||||
entry:
|
||||
@ -24,10 +22,8 @@ entry:
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64:
|
||||
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
|
||||
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
|
||||
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||
define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
|
||||
store i64 -9223372036854775808, i64 addrspace(1) *%out
|
||||
@ -523,10 +519,8 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
|
||||
|
||||
|
||||
; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64:
|
||||
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000
|
||||
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x80000000
|
||||
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||
define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
|
||||
store double -0.0, double addrspace(1)* %out
|
||||
@ -606,10 +600,8 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}store_literal_imm_f64:
|
||||
; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000
|
||||
; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
|
||||
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
|
||||
; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||
define void @store_literal_imm_f64(double addrspace(1)* %out) {
|
||||
store double 4096.0, double addrspace(1)* %out
|
||||
|
@ -68,10 +68,8 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)*
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
|
||||
; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8
|
||||
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b
|
||||
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
|
||||
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
|
||||
@ -92,10 +90,8 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
|
||||
; SI-DAG: s_mov_b32 [[SLO:s[0-9]+]], 4.0
|
||||
; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b{{$}}
|
||||
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[SLO]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[SHI]]
|
||||
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
|
||||
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
||||
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
|
||||
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
|
||||
|
@ -185,8 +185,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
|
||||
; Make sure load width gets reduced to i32 load.
|
||||
; GCN-LABEL: {{^}}s_shl_32_i64:
|
||||
; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
|
||||
; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
||||
define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
|
||||
|
@ -190,8 +190,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
|
||||
; Make sure load width gets reduced to i32 load.
|
||||
; GCN-LABEL: {{^}}s_lshr_32_i64:
|
||||
; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
|
||||
; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
||||
define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
|
||||
|
@ -245,18 +245,16 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
|
||||
; GCN-LABEL: {{^}}test_s0_s1_k_f64:
|
||||
; GCN-DAG: s_load_dwordx2 [[SGPR0:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SGPR1_SUB0:[0-9]+]]:[[SGPR1_SUB1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
|
||||
; GCN-DAG: s_mov_b32 s[[SK0_SUB1:[0-9]+]], 0x40900000
|
||||
; GCN-DAG: s_mov_b32 s[[SZERO:[0-9]+]], 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v[[VK0_SUB0:[0-9]+]], s[[SZERO]]
|
||||
; GCN: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], s[[SK0_SUB1]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VK0_SUB1:[0-9]+]], 0x40900000
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0{{$}}
|
||||
|
||||
; GCN-DAG: s_mov_b32 s[[SK1_SUB1:[0-9]+]], 0x40b00000{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB0:[0-9]+]], s[[SZERO]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], s[[SK1_SUB1]]
|
||||
; GCN-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK0_SUB0]]:[[VK0_SUB1]]{{\]}}
|
||||
; GCN-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VK1_SUB0]]:[[VK1_SUB1]]{{\]}}
|
||||
; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
|
||||
|
||||
; Same zero component is re-used for half of each immediate.
|
||||
; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
|
||||
; GCN: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK1_SUB1]]{{\]}}
|
||||
|
||||
; GCN: buffer_store_dwordx2 [[RESULT0]]
|
||||
; GCN: buffer_store_dwordx2 [[RESULT1]]
|
||||
|
@ -7,8 +7,7 @@
|
||||
; R600: MEM_RAT_CACHELESS STORE_RAW
|
||||
|
||||
; SI: {{^}}test:
|
||||
; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
|
||||
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
|
||||
; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
|
||||
; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
|
||||
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
|
||||
entry:
|
||||
|
Loading…
Reference in New Issue
Block a user