
AMDGPU: Improve <2 x i24> arguments and return value handling

This was asserting for GlobalISel. For SelectionDAG, these values were
being passed on the stack. Instead, scalarize them as if they were
vectors of 32-bit elements.
Author: Matt Arsenault, 2020-08-30 17:28:48 -04:00 (committed by Matt Arsenault)
parent 6aba538d9d
commit f2aa3ef913
6 changed files with 322 additions and 125 deletions
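
To make the new convention concrete, below is a minimal standalone sketch of the per-element rule the patch installs for vector arguments and return values. This is not LLVM API: `RegBreakdown`, `breakdownForVector`, `regBits`, and `numRegs` are names invented for this illustration, and it only approximates the logic in the SIISelLowering.cpp hunks that follow. The point is the <2 x i24> case: each element is promoted to its own 32-bit register instead of being passed on the stack.

#include <cassert>
#include <cstdio>

// Sketch of the calling-convention rule for vector arguments/returns
// (standalone approximation of the patched hooks, not the LLVM API):
//   scalar bits == 16 (with 16-bit insts): pack two elements per v2i16/v2f16
//   scalar bits <  16: one i16 register per element (i32 without 16-bit insts)
//   16 < scalar bits <= 32: one i32 register per element (i24 lands here)
//   scalar bits > 32: split each element into ceil(bits/32) i32 registers
struct RegBreakdown { unsigned regBits, numRegs; };

static RegBreakdown breakdownForVector(unsigned numElts, unsigned scalarBits,
                                       bool has16BitInsts) {
  if (scalarBits == 16 && has16BitInsts)
    return {32, (numElts + 1) / 2};   // packed v2i16/v2f16 pieces
  if (scalarBits < 16)
    return {has16BitInsts ? 16u : 32u, numElts};
  if (scalarBits <= 32)
    return {32, numElts};             // <2 x i24>: one i32 per element
  return {32, numElts * ((scalarBits + 31) / 32)};
}

int main() {
  RegBreakdown b = breakdownForVector(2, 24, /*has16BitInsts=*/true);
  assert(b.regBits == 32 && b.numRegs == 2); // <2 x i24> -> $vgpr0, $vgpr1
  std::printf("<2 x i24>: %u regs of i%u\n", b.numRegs, b.regBits);
}

For <2 x i24> this yields two 32-bit registers, which is why the MIR tests below expect the two elements in $vgpr0 and $vgpr1 rather than stack traffic.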


@@ -921,15 +921,18 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
   if (VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
-    if (Size == 32)
-      return ScalarVT.getSimpleVT();
+    if (Size == 16) {
+      if (Subtarget->has16BitInsts())
+        return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      return VT.isInteger() ? MVT::i32 : MVT::f32;
+    }

-    if (Size > 32)
-      return MVT::i32;
+    if (Size < 16)
+      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
+    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
+  }

-    if (Size == 16 && Subtarget->has16BitInsts())
-      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-  } else if (VT.getSizeInBits() > 32)
+  if (VT.getSizeInBits() > 32)
     return MVT::i32;

   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -946,14 +949,15 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
-    if (Size == 32)
+    // FIXME: Should probably promote 8-bit vectors to i16.
+    if (Size == 16 && Subtarget->has16BitInsts())
+      return (NumElts + 1) / 2;
+
+    if (Size <= 32)
       return NumElts;

     if (Size > 32)
       return NumElts * ((Size + 31) / 32);
-
-    if (Size == 16 && Subtarget->has16BitInsts())
-      return (NumElts + 1) / 2;
   } else if (VT.getSizeInBits() > 32)
     return (VT.getSizeInBits() + 31) / 32;
@@ -968,6 +972,16 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     unsigned NumElts = VT.getVectorNumElements();
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
+    // FIXME: We should fix the ABI to be the same on targets without 16-bit
+    // support, but unless we can properly handle 3-vectors, it will be still be
+    // inconsistent.
+    if (Size == 16 && Subtarget->has16BitInsts()) {
+      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      IntermediateVT = RegisterVT;
+      NumIntermediates = (NumElts + 1) / 2;
+      return NumIntermediates;
+    }
+
     if (Size == 32) {
       RegisterVT = ScalarVT.getSimpleVT();
       IntermediateVT = RegisterVT;
@@ -975,22 +989,28 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
       return NumIntermediates;
     }

+    if (Size < 16 && Subtarget->has16BitInsts()) {
+      // FIXME: Should probably form v2i16 pieces
+      RegisterVT = MVT::i16;
+      IntermediateVT = ScalarVT;
+      NumIntermediates = NumElts;
+      return NumIntermediates;
+    }
+
+    if (Size != 16 && Size <= 32) {
+      RegisterVT = MVT::i32;
+      IntermediateVT = ScalarVT;
+      NumIntermediates = NumElts;
+      return NumIntermediates;
+    }
+
     if (Size > 32) {
       RegisterVT = MVT::i32;
       IntermediateVT = RegisterVT;
       NumIntermediates = NumElts * ((Size + 31) / 32);
       return NumIntermediates;
     }

-    // FIXME: We should fix the ABI to be the same on targets without 16-bit
-    // support, but unless we can properly handle 3-vectors, it will be still be
-    // inconsistent.
-    if (Size == 16 && Subtarget->has16BitInsts()) {
-      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
-      IntermediateVT = RegisterVT;
-      NumIntermediates = (NumElts + 1) / 2;
-      return NumIntermediates;
-    }
   }

   return TargetLowering::getVectorTypeBreakdownForCallingConv(


@@ -196,6 +196,89 @@ define half @f16_func_void() #0 {
ret half %val
}
define i24 @i24_func_void() #0 {
; CHECK-LABEL: name: i24_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s24)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
%val = load i24, i24 addrspace(1)* undef
ret i24 %val
}
define zeroext i24 @i24_zeroext_func_void() #0 {
; CHECK-LABEL: name: i24_zeroext_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s24)
; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
%val = load i24, i24 addrspace(1)* undef
ret i24 %val
}
define signext i24 @i24_signext_func_void() #0 {
; CHECK-LABEL: name: i24_signext_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(s24) = G_LOAD [[DEF]](p1) :: (load 3 from `i24 addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s24)
; CHECK: $vgpr0 = COPY [[SEXT]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
%val = load i24, i24 addrspace(1)* undef
ret i24 %val
}
define <2 x i24> @v2i24_func_void() #0 {
; CHECK-LABEL: name: v2i24_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s24>) = G_LOAD [[DEF]](p1) :: (load 6 from `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<2 x s24>)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
%val = load <2 x i24>, <2 x i24> addrspace(1)* undef
ret <2 x i24> %val
}
define <3 x i24> @v3i24_func_void() #0 {
; CHECK-LABEL: name: v3i24_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s24>) = G_LOAD [[DEF]](p1) :: (load 9 from `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
; CHECK: [[UV:%[0-9]+]]:_(s24), [[UV1:%[0-9]+]]:_(s24), [[UV2:%[0-9]+]]:_(s24) = G_UNMERGE_VALUES [[LOAD]](<3 x s24>)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s24)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s24)
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s24)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = load <3 x i24>, <3 x i24> addrspace(1)* undef
ret <3 x i24> %val
}
define i32 @i32_func_void() #0 {
; CHECK-LABEL: name: i32_func_void
; CHECK: bb.1 (%ir-block.0):
@@ -977,6 +1060,44 @@ define <16 x i8> @v16i8_func_void() #0 {
ret <16 x i8> %val
}
define <2 x i8> @v2i8_func_void() #0 {
; CHECK-LABEL: name: v2i8_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[DEF]](p1) :: (load 2 from `<2 x i8> addrspace(1)* undef`, addrspace 1)
; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<2 x s8>)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
%val = load <2 x i8>, <2 x i8> addrspace(1)* undef
ret <2 x i8> %val
}
define <3 x i8> @v3i8_func_void() #0 {
; CHECK-LABEL: name: v3i8_func_void
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: [[LOAD:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[DEF]](p1) :: (load 3 from `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[LOAD]](<3 x s8>)
; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8)
; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8)
; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8)
; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32)
; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32)
; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = load <3 x i8>, <3 x i8> addrspace(1)* undef
ret <3 x i8> %val
}
define <4 x i8> @v4i8_func_void() #0 {
; CHECK-LABEL: name: v4i8_func_void
; CHECK: bb.1 (%ir-block.0):


@@ -553,6 +553,104 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 {
ret void
}
define void @void_func_v2i24(<2 x i24> %arg0) #0 {
; CHECK-LABEL: name: void_func_v2i24
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[TRUNC:%[0-9]+]]:_(<2 x s24>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](<2 x s24>), [[DEF]](p1) :: (store 6 into `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
; CHECK: S_SETPC_B64_return [[COPY3]]
store <2 x i24> %arg0, <2 x i24> addrspace(1)* undef
ret void
}
define void @void_func_v3i24(<3 x i24> %arg0) #0 {
; CHECK-LABEL: name: void_func_v3i24
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
; CHECK: [[TRUNC:%[0-9]+]]:_(<3 x s24>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC]](<3 x s24>), [[DEF]](p1) :: (store 9 into `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; CHECK: S_SETPC_B64_return [[COPY4]]
store <3 x i24> %arg0, <3 x i24> addrspace(1)* undef
ret void
}
define void @void_func_v2i8(<2 x i8> %arg0) #0 {
; CHECK-LABEL: name: void_func_v2i8
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
; CHECK: [[TRUNC2:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s16>)
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC2]](<2 x s8>), [[DEF]](p1) :: (store 2 into `<2 x i8> addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
; CHECK: S_SETPC_B64_return [[COPY3]]
store <2 x i8> %arg0, <2 x i8> addrspace(1)* undef
ret void
}
define void @void_func_v3i8(<3 x i8> %arg0) #0 {
; CHECK-LABEL: name: void_func_v3i8
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; CHECK: [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16)
; CHECK: [[TRUNC3:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s16>)
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC3]](<3 x s8>), [[DEF]](p1) :: (store 3 into `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
; CHECK: S_SETPC_B64_return [[COPY4]]
store <3 x i8> %arg0, <3 x i8> addrspace(1)* undef
ret void
}
define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; CHECK-LABEL: name: void_func_v4i8
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
; CHECK: [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s16>)
; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK: G_STORE [[TRUNC4]](<4 x s8>), [[DEF]](p1) :: (store 4 into `<4 x i8> addrspace(1)* undef`, addrspace 1)
; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
; CHECK: S_SETPC_B64_return [[COPY5]]
store <4 x i8> %arg0, <4 x i8> addrspace(1)* undef
ret void
}
define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 {
; CHECK-LABEL: name: void_func_v2p3i8
; CHECK: bb.1 (%ir-block.0):


@@ -30,6 +30,8 @@ declare <3 x float> @external_v3f32_func_void() #0
 declare <5 x float> @external_v5f32_func_void() #0
 declare <2 x double> @external_v2f64_func_void() #0

+declare <2 x i24> @external_v2i24_func_void() #0
+
 declare <2 x i32> @external_v2i32_func_void() #0
 declare <3 x i32> @external_v3i32_func_void() #0
 declare <4 x i32> @external_v4i32_func_void() #0
@@ -250,6 +252,18 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_v2i24_func_void:
; GCN: s_swappc_b64
; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1
define amdgpu_kernel void @test_call_external_v2i24_func_void() #0 {
%val = call <2 x i24> @external_v2i24_func_void()
%elt0 = extractelement <2 x i24> %val, i32 0
%elt1 = extractelement <2 x i24> %val, i32 1
%add = add i24 %elt0, %elt1
store volatile i24 %add, i24 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
; GCN: s_swappc
; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2]


@@ -981,127 +981,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32
; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0
; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0
; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0
; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_mul_hi_u32 v11, v2, s4
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_mul_hi_u32 v12, v3, s4
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
; SI-NEXT: v_mul_lo_u32 v11, v11, 24
; SI-NEXT: v_mul_lo_u32 v12, v12, 24
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v12
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
; SI-NEXT: v_mul_hi_u32 v6, v4, s4
; SI-NEXT: v_mul_hi_u32 v7, v5, s4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; SI-NEXT: v_mul_lo_u32 v6, v6, 24
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; SI-NEXT: v_mul_lo_u32 v6, v6, 24
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
; SI-NEXT: v_alignbit_b32 v1, v1, v6, v2
; SI-NEXT: v_alignbit_b32 v2, v5, v4, v3
; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen
; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen
; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32
; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0
; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0
; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_mul_hi_u32 v11, v2, s4
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_mul_hi_u32 v12, v3, s4
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-NEXT: v_lshrrev_b32_e32 v11, 4, v11
; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12
; VI-NEXT: v_mul_lo_u32 v11, v11, 24
; VI-NEXT: v_mul_lo_u32 v12, v12, 24
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v11
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v12
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
; VI-NEXT: v_mul_hi_u32 v6, v4, s4
; VI-NEXT: v_mul_hi_u32 v7, v5, s4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; VI-NEXT: v_mul_lo_u32 v6, v6, 24
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; VI-NEXT: v_mul_lo_u32 v6, v6, 24
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6
; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
; VI-NEXT: v_alignbit_b32 v1, v1, v6, v2
; VI-NEXT: v_alignbit_b32 v2, v5, v4, v3
; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen
; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen
; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen
; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_mul_hi_u32 v6, v1, s4
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_mul_hi_u32 v7, v2, s4
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4
; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
; GFX9-NEXT: v_add_u32_e32 v1, 8, v1
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1
; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2
; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX9-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7
; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6
; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:


@@ -344,6 +344,16 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
ret void
}
; GCN-LABEL: {{^}}void_func_v2i24:
; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1
define void @void_func_v2i24(<2 x i24> %arg0) #0 {
%elt0 = extractelement <2 x i24> %arg0, i32 0
%elt1 = extractelement <2 x i24> %arg0, i32 1
%add = add i24 %elt0, %elt1
store i24 %add, i24 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2f32:
; GCN-NOT: v[0:1]
; GCN-NOT: v0