
AMDGPU: Scalarize vector argument types to calls

When lowering calling conventions, prefer to decompose vectors
into their constituent register types. This avoids artificial constraints
imposed to satisfy a wide super-register.

This improves code quality because optimizations no longer need to
deal with the super-register constraint. For example, the immediate
folding code doesn't handle 4-component reg_sequences, so by
breaking the register down earlier, the existing immediate folding
code is able to work.

This also avoids the need for the shader input processing code
to manually split vector types.

llvm-svn: 338416
Commit 632df24ca9 by Matt Arsenault, 2018-07-31 19:05:14 +00:00
Parent: 7b7e97dd13
4 changed files with 86 additions and 63 deletions
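
Before the diffs, a minimal standalone sketch of the decomposition rule the patch installs. Everything below (the struct, the function, and the driver) is a hypothetical illustration, not code from the patch: for non-kernel calling conventions, a vector argument whose elements are 32 or 64 bits wide now occupies one register per element rather than one wide super-register.

#include <cstdio>

// Hypothetical model of the rule shared by getRegisterTypeForCallingConv,
// getNumRegistersForCallingConv, and getVectorTypeBreakdownForCallingConv;
// not the real LLVM API.
struct ArgBreakdown {
  unsigned RegBits; // width of each register piece
  unsigned NumRegs; // number of pieces the argument occupies
};

static ArgBreakdown breakDownVectorArg(bool IsKernelCC, unsigned EltBits,
                                       unsigned NumElts) {
  // Non-kernel calls: scalarize vectors with 32- or 64-bit elements so each
  // element lives in its own register and can be optimized independently.
  if (!IsKernelCC && (EltBits == 32 || EltBits == 64))
    return {EltBits, NumElts};
  // Otherwise fall back to the default treatment, simplified here to a
  // single wide piece covering the whole vector.
  return {EltBits * NumElts, 1};
}

int main() {
  // A <4 x i32> argument to a normal call now maps to four 32-bit registers,
  // so an immediate can be folded into each piece separately.
  ArgBreakdown B = breakDownVectorArg(/*IsKernelCC=*/false, 32, 4);
  std::printf("<4 x i32> -> %u regs x %u bits\n", B.NumRegs, B.RegBits);
  return 0;
}

The per-element form is what lets the existing immediate-folding code see plain 32-bit defs instead of a 4-component reg_sequence, as the test updates below demonstrate.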

lib/Target/AMDGPU/SIISelLowering.cpp

@@ -697,10 +697,11 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
-  if (CC != CallingConv::AMDGPU_KERNEL &&
-      VT.isVector() && VT.getVectorNumElements() == 3) {
+  // TODO: Consider splitting all arguments into 32-bit pieces.
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32)
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64)
       return ScalarVT.getSimpleVT();
   }
@@ -710,11 +711,11 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
-  if (CC != CallingConv::AMDGPU_KERNEL &&
-      VT.isVector() && VT.getVectorNumElements() == 3) {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32)
-      return 3;
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64)
+      return VT.getVectorNumElements();
   }
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -724,14 +725,13 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
   LLVMContext &Context, CallingConv::ID CC,
   EVT VT, EVT &IntermediateVT,
   unsigned &NumIntermediates, MVT &RegisterVT) const {
-  if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32 ||
-        ScalarVT.getSizeInBits() == 64) {
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64) {
       RegisterVT = ScalarVT.getSimpleVT();
       IntermediateVT = RegisterVT;
-      NumIntermediates = 3;
+      NumIntermediates = VT.getVectorNumElements();
       return NumIntermediates;
     }
   }
@@ -1314,6 +1314,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
+    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
         !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
@@ -1347,25 +1349,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
       ++PSInputNum;
     }
 
-    // Second split vertices into their elements.
-    if (Arg->VT.isVector()) {
-      ISD::InputArg NewArg = *Arg;
-      NewArg.Flags.setSplit();
-      NewArg.VT = Arg->VT.getVectorElementType();
-
-      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
-      // three or five element vertex only needs three or five registers,
-      // NOT four or eight.
-      Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      for (unsigned J = 0; J != NumElements; ++J) {
-        Splits.push_back(NewArg);
-        NewArg.PartOffset += NewArg.VT.getStoreSize();
-      }
-    } else {
-      Splits.push_back(*Arg);
-    }
+    Splits.push_back(*Arg);
   }
 }
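
With the hooks above reporting one register per vector element, SelectionDAG's generic argument lowering now produces the per-element InputArgs itself, which is why the manual splitting loop in processShaderInputArgs could be deleted. A rough sketch of what that generic splitting amounts to, using simplified, hypothetical stand-in types rather than the real LLVM ones:

#include <vector>

// Simplified stand-in for ISD::InputArg; hypothetical, for illustration only.
struct InputArgPart {
  unsigned VTBits;     // value type width of this piece
  unsigned PartOffset; // byte offset of the piece within the original argument
  bool IsSplit;        // first piece of a split argument (simplified flag)
};

// Roughly what the generic lowering does once the calling-convention hooks
// report one register per vector element: emit one part per element and
// advance PartOffset by the element's store size, mirroring the loop that
// processShaderInputArgs no longer needs to run by hand.
static void splitVectorArg(unsigned EltBits, unsigned NumElts,
                           std::vector<InputArgPart> &Parts) {
  const unsigned StoreSize = EltBits / 8;
  for (unsigned I = 0; I != NumElts; ++I)
    Parts.push_back({EltBits, I * StoreSize, /*IsSplit=*/I == 0});
}

int main() {
  std::vector<InputArgPart> Parts;
  splitVectorArg(/*EltBits=*/32, /*NumElts=*/3, Parts); // e.g. a <3 x float>
  return Parts.size() == 3 ? 0 : 1;
}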

test/CodeGen/AMDGPU/bfi_int.ll

@@ -54,8 +54,8 @@ entry:
 
 ; FUNC-LABEL: {{^}}v_bitselect_v2i32_pat1:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
 ; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
 ; GCN-NEXT: s_setpc_b64
 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
   %xor.0 = xor <2 x i32> %a, %mask

test/CodeGen/AMDGPU/call-argument-types.ll

@@ -286,10 +286,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 
 ; FIXME: Immediates should fold directly into v_mov_b32s
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, s
-; GCN: v_mov_b32_e32 v5, s
-; GCN: v_mov_b32_e32 v6, s
-; GCN: v_mov_b32_e32 v7, s
+; GCN-DAG: v_mov_b32_e32 v4, s
+; GCN-DAG: v_mov_b32_e32 v5, s
+; GCN-DAG: v_mov_b32_e32 v6, s
+; GCN-DAG: v_mov_b32_e32 v7, s
 
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_swappc_b64
@@ -358,6 +358,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
+  call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
@@ -393,6 +402,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
+  call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
@@ -405,6 +425,21 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN-DAG: v_mov_b32_e32 v4, 5
+; GCN-DAG: v_mov_b32_e32 v5, 6
+; GCN-DAG: v_mov_b32_e32 v6, 7
+; GCN-DAG: v_mov_b32_e32 v7, 8
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
+  call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off

test/CodeGen/AMDGPU/mad-mix.ll

@@ -54,13 +54,13 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32:
-; GFX900: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
 
-; GFX906: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
 
 ; CIVI: v_mac_f32
 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
@@ -73,14 +73,14 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle:
 ; GCN: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
 ; GFX900-NEXT: s_setpc_b64
 
-; GFX906-NEXT: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
 ; GFX906-NEXT: s_setpc_b64
 
 ; CIVI: v_mac_f32
@@ -274,13 +274,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 1.0
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
 
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -289,13 +290,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 0x3e230000
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -305,14 +308,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 0.15915494
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
 
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>