
GlobalISel: Implement fewerElementsVector for G_INSERT_VECTOR_ELT

Add unit tests since AMDGPU will only trigger this for gigantic
vectors, and won't use the annoying odd-sized breakdown case.
Matt Arsenault 2020-07-27 22:00:50 -04:00
parent 01ab206194
commit 418515b7d0
6 changed files with 392 additions and 1106 deletions
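For context: the breakdown mirrors the existing G_EXTRACT_VECTOR_ELT path. With a constant index, the wide vector is unmerged into NarrowTy pieces, the index is rewritten relative to the piece that contains it, the insert is performed on that piece alone, and the pieces are concatenated back into the result. A minimal standalone sketch of the index arithmetic (plain C++ with illustrative names, not LLVM API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Constant-index breakdown: split a wide vector into equal NarrowTy-sized
    // parts, insert into the single part containing the element, recombine.
    // std::vector stands in for MIR vector registers.
    std::vector<int32_t> insertViaNarrowParts(std::vector<int32_t> Vec,
                                              int32_t Val, int64_t IdxVal,
                                              unsigned NewNumElts) {
      assert(IdxVal >= 0 && (uint64_t)IdxVal < Vec.size() && "index out of bounds");
      assert(Vec.size() % NewNumElts == 0 && "sketch assumes an even breakdown");

      // The same arithmetic the patch emits for the adjusted index.
      int64_t PartIdx = IdxVal / NewNumElts;          // which NarrowTy piece
      int64_t NewIdx = IdxVal - NewNumElts * PartIdx; // index within that piece

      // G_UNMERGE_VALUES + insert + G_CONCAT_VECTORS, flattened: only the
      // chosen piece changes; every other piece flows through untouched.
      Vec[PartIdx * NewNumElts + NewIdx] = Val;
      return Vec;
    }

In the unit test below, IdxVal = 7 with <2 x s16> pieces gives PartIdx = 3 and NewIdx = 1, i.e. the high element of the fourth piece of the unmerge.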

llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h

@@ -279,9 +279,9 @@ public:
   LegalizeResult fewerElementsVectorBuildVector(MachineInstr &MI,
                                                 unsigned TypeIdx,
                                                 LLT NarrowTy);
-  LegalizeResult fewerElementsVectorExtractVectorElt(MachineInstr &MI,
-                                                     unsigned TypeIdx,
-                                                     LLT NarrowTy);
+  LegalizeResult fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
+                                                           unsigned TypeIdx,
+                                                           LLT NarrowTy);
 
   LegalizeResult
   reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy);

llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

@@ -3608,18 +3608,24 @@ LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI,
 }
 
 LegalizerHelper::LegalizeResult
-LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI,
-                                                     unsigned TypeIdx,
-                                                     LLT NarrowVecTy) {
-  assert(TypeIdx == 1 && "not a vector type index");
+LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
+                                                           unsigned TypeIdx,
+                                                           LLT NarrowVecTy) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register InsertVal;
+  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
+
+  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
+  if (IsInsert)
+    InsertVal = MI.getOperand(2).getReg();
+
+  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
 
   // TODO: Handle total scalarization case.
   if (!NarrowVecTy.isVector())
     return UnableToLegalize;
 
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcVec = MI.getOperand(1).getReg();
-  Register Idx = MI.getOperand(2).getReg();
-
   LLT VecTy = MRI.getType(SrcVec);
 
   // If the index is a constant, we can really break this down as you would
@@ -3637,8 +3643,8 @@ LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI,
     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
 
     // Build a sequence of NarrowTy pieces in VecParts for this operand.
-    buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
-                        TargetOpcode::G_ANYEXT);
+    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
+                                    TargetOpcode::G_ANYEXT);
 
     unsigned NewNumElts = NarrowVecTy.getNumElements();
@@ -3647,12 +3653,26 @@ LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI,
     auto NewIdx =
         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
-    MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+
+    if (IsInsert) {
+      LLT PartTy = MRI.getType(VecParts[PartIdx]);
+
+      // Use the adjusted index to insert into one of the subvectors.
+      auto InsertPart = MIRBuilder.buildInsertVectorElement(
+          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
+      VecParts[PartIdx] = InsertPart.getReg(0);
+
+      // Recombine the inserted subvector with the others to reform the result
+      // vector.
+      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
+    } else {
+      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+    }
+
     MI.eraseFromParent();
     return Legalized;
   }
 
-  // With a variable index, we can't perform the extract in a smaller type, so
+  // With a variable index, we can't perform the operation in a smaller type, so
   // we're forced to expand this.
   //
   // TODO: We could emit a chain of compare/select to figure out which piece to
@@ -3992,7 +4012,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_BUILD_VECTOR:
     return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy);
   case G_EXTRACT_VECTOR_ELT:
-    return fewerElementsVectorExtractVectorElt(MI, TypeIdx, NarrowTy);
+  case G_INSERT_VECTOR_ELT:
+    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
   case G_LOAD:
   case G_STORE:
     return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);
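One detail in the hunks above: the result of buildLCMMergePieces is now captured as LCMTy because buildWidenedRemergeToDst needs it for the insert case. When NarrowTy does not evenly divide the source vector, the pieces only re-concatenate at the least-common-multiple width, and the real result is extracted from that wider value. Rough numbers for the odd case exercised by the unit test below (standalone C++, illustrative only):

    #include <cassert>
    #include <numeric> // std::lcm (C++17)

    int main() {
      // <8 x s16> narrowed to <3 x s16>: the breakdown is uneven, so the
      // result is rebuilt at the LCM width and extracted from offset 0.
      unsigned SrcElts = 8, NarrowElts = 3;
      unsigned LcmElts = std::lcm(SrcElts, NarrowElts); // 24 -> <24 x s16>
      unsigned NumParts = LcmElts / NarrowElts;         // 8 pieces of <3 x s16>
      assert(LcmElts == 24 && NumParts == 8);
      // Pieces past the source data are G_IMPLICIT_DEF; the final <8 x s16>
      // is a G_EXTRACT at offset 0 from the <24 x s16> G_CONCAT_VECTORS,
      // which is the shape the FewerElementsInsertVectorElt test checks.
    }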

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

@@ -1359,7 +1359,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .clampScalar(EltTypeIdx, S32, S64)
       .clampScalar(VecTypeIdx, S32, S64)
       .clampScalar(IdxTypeIdx, S32, S32)
-      .clampMaxNumElements(1, S32, 32)
+      .clampMaxNumElements(VecTypeIdx, S32, 32)
       // TODO: Clamp elements for 64-bit vectors?
       // It should only be necessary with variable indexes.
       // As a last resort, lower to the stack
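The clampMaxNumElements change fixes a hardcoded type index. For G_EXTRACT_VECTOR_ELT the vector operand is type index 1, but for G_INSERT_VECTOR_ELT the vector is the destination, type index 0. These rules are built once for both opcodes, so the literal 1 meant the 32-element clamp never applied to G_INSERT_VECTOR_ELT's vector operand. A tiny standalone sketch of the per-opcode type-index layout (modeled on, not copied from, the surrounding AMDGPULegalizerInfo code, which this hunk does not show):

    #include <cassert>

    enum Opcode { G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT };

    // Which type index holds the vector operand:
    //   G_EXTRACT_VECTOR_ELT: dst elt (type 0), src vec (type 1), idx (type 2)
    //   G_INSERT_VECTOR_ELT:  dst/src vec (type 0), elt (type 1), idx (type 2)
    unsigned vecTypeIdx(Opcode Op) {
      return Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    }

    int main() {
      // The old rule clamped type index 1 unconditionally, i.e. the scalar
      // element type of G_INSERT_VECTOR_ELT instead of its vector type.
      assert(vecTypeIdx(G_INSERT_VECTOR_ELT) == 0);
      assert(vecTypeIdx(G_EXTRACT_VECTOR_ELT) == 1);
    }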


@@ -0,0 +1,137 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, <64 x i32> addrspace(1)* %ptr.out) #0 {
; GCN-LABEL: v_insert_v64i32_37:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: v_lshlrev_b64 v[0:1], 8, v[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v2, v0
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v1, vcc
; GCN-NEXT: s_movk_i32 s0, 0x80
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v8, v2
; GCN-NEXT: s_movk_i32 s0, 0xc0
; GCN-NEXT: v_mov_b32_e32 v65, s1
; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_mov_b32_e32 v64, s0
; GCN-NEXT: s_movk_i32 s0, 0x50
; GCN-NEXT: v_mov_b32_e32 v69, s1
; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v9, v3, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v4, v0
; GCN-NEXT: v_mov_b32_e32 v68, s0
; GCN-NEXT: s_movk_i32 s0, 0x60
; GCN-NEXT: v_mov_b32_e32 v71, s1
; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v5, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v70, s0
; GCN-NEXT: s_movk_i32 s0, 0x70
; GCN-NEXT: v_mov_b32_e32 v73, s1
; GCN-NEXT: v_add_co_u32_e32 v74, vcc, v66, v2
; GCN-NEXT: v_mov_b32_e32 v72, s0
; GCN-NEXT: s_movk_i32 s0, 0x90
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_addc_co_u32_e32 v75, vcc, v67, v3, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_co_u32_e32 v76, vcc, v66, v0
; GCN-NEXT: v_addc_co_u32_e32 v77, vcc, v67, v1, vcc
; GCN-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v[12:13], off
; GCN-NEXT: v_add_co_u32_e32 v10, vcc, 64, v8
; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v9, vcc
; GCN-NEXT: v_add_co_u32_e32 v28, vcc, v8, v64
; GCN-NEXT: v_addc_co_u32_e32 v29, vcc, v9, v65, vcc
; GCN-NEXT: global_load_dwordx4 v[32:35], v[8:9], off
; GCN-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:16
; GCN-NEXT: global_load_dwordx4 v[40:43], v[8:9], off offset:32
; GCN-NEXT: global_load_dwordx4 v[44:47], v[8:9], off offset:48
; GCN-NEXT: global_load_dwordx4 v[48:51], v[10:11], off
; GCN-NEXT: global_load_dwordx4 v[52:55], v[10:11], off offset:16
; GCN-NEXT: global_load_dwordx4 v[56:59], v[10:11], off offset:32
; GCN-NEXT: global_load_dwordx4 v[60:63], v[10:11], off offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32
; GCN-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48
; GCN-NEXT: global_load_dwordx4 v[16:19], v[28:29], off
; GCN-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:16
; GCN-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:32
; GCN-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:48
; GCN-NEXT: s_movk_i32 s0, 0xa0
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7
; GCN-NEXT: s_waitcnt vmcnt(14)
; GCN-NEXT: global_store_dwordx4 v[74:75], v[0:3], off
; GCN-NEXT: global_store_dwordx4 v[76:77], v[4:7], off
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0
; GCN-NEXT: s_movk_i32 s0, 0xb0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v64
; GCN-NEXT: s_movk_i32 s0, 0xd0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v65, vcc
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2
; GCN-NEXT: s_movk_i32 s0, 0xe0
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0
; GCN-NEXT: s_movk_i32 s0, 0xf0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
; GCN-NEXT: s_waitcnt vmcnt(7)
; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v66
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v67, vcc
; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:-48
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off offset:-32
; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off offset:-16
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v68
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v69, vcc
; GCN-NEXT: global_store_dwordx4 v[66:67], v[32:35], off
; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off
; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v70
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v71, vcc
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v72
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v73, vcc
; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off
; GCN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %gep.in
%insert = insertelement <64 x i32> %vec, i32 999, i32 37
%gep.out = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.out, i32 %id
store <64 x i32> %insert, <64 x i32> addrspace(1)* %gep.out
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
attributes #1 = { nounwind readnone speculatable willreturn }
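Given the 32-element clamp above, the <64 x i32> insert in this test is broken into two <32 x i32> halves; index 37 lands in the second half at element 5. That is why the checked output rewrites a single lane of one loaded half (v_mov_b32_e32 v5, 0x3e7, i.e. 999) and stores both halves back instead of lowering through the stack. In terms of the insertViaNarrowParts sketch from the top of the commit (illustrative values):

    // Appended to the insertViaNarrowParts sketch defined earlier.
    int main() {
      std::vector<int32_t> Vec(64, 0);
      auto Out = insertViaNarrowParts(Vec, /*Val=*/999, /*IdxVal=*/37,
                                      /*NewNumElts=*/32);
      // 37 / 32 == 1 and 37 - 32 * 1 == 5: only the second <32 x i32> half
      // changes, matching the single-lane v_mov_b32_e32 v5, 0x3e7 above.
      assert(Out[37] == 999 && Out[36] == 0);
    }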

File diff suppressed because it is too large.

llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp

@@ -3051,4 +3051,85 @@ TEST_F(AArch64GISelMITest, MoreElementsFreeze) {
  EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF;
}

// Test fewer elements of G_INSERT_VECTOR_ELT.
TEST_F(AArch64GISelMITest, FewerElementsInsertVectorElt) {
  setUp();
  if (!TM)
    return;

  DefineLegalizerInfo(A, {});

  LLT P0{LLT::pointer(0, 64)};
  LLT S64{LLT::scalar(64)};
  LLT S16{LLT::scalar(16)};
  LLT V2S16{LLT::vector(2, 16)};
  LLT V3S16{LLT::vector(3, 16)};
  LLT V8S16{LLT::vector(8, 16)};

  auto Ptr0 = B.buildIntToPtr(P0, Copies[0]);
  auto VectorV8 = B.buildLoad(V8S16, Ptr0, MachinePointerInfo(), Align(8));
  auto Value = B.buildTrunc(S16, Copies[1]);

  auto Seven = B.buildConstant(S64, 7);
  auto InsertV8Constant7_0 =
      B.buildInsertVectorElement(V8S16, VectorV8, Value, Seven);
  auto InsertV8Constant7_1 =
      B.buildInsertVectorElement(V8S16, VectorV8, Value, Seven);

  B.buildStore(InsertV8Constant7_0, Ptr0, MachinePointerInfo(), Align(8),
               MachineMemOperand::MOVolatile);
  B.buildStore(InsertV8Constant7_1, Ptr0, MachinePointerInfo(), Align(8),
               MachineMemOperand::MOVolatile);

  AInfo Info(MF->getSubtarget());
  DummyGISelObserver Observer;
  LegalizerHelper Helper(*MF, Info, Observer, B);

  // Perform Legalization
  B.setInsertPt(*EntryMBB, InsertV8Constant7_0->getIterator());

  // This should index the high element of the 4th piece of an unmerge.
  EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized,
            Helper.fewerElementsVector(*InsertV8Constant7_0, 0, V2S16));

  // This case requires extracting an intermediate vector type into the target
  // v3s16.
  B.setInsertPt(*EntryMBB, InsertV8Constant7_1->getIterator());
  EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized,
            Helper.fewerElementsVector(*InsertV8Constant7_1, 0, V3S16));

  const auto *CheckStr = R"(
CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY
CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY
CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY
CHECK: [[PTR0:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY0]]
CHECK: [[VEC8:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR0]]:_(p0) :: (load 16, align 8)
CHECK: [[INSERT_VAL:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]]
CHECK: [[UNMERGE0:%[0-9]+]]:_(<2 x s16>), [[UNMERGE1:%[0-9]+]]:_(<2 x s16>), [[UNMERGE2:%[0-9]+]]:_(<2 x s16>), [[UNMERGE3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[VEC8]]
CHECK: [[ONE:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
CHECK: [[SUB_INSERT_7:%[0-9]+]]:_(<2 x s16>) = G_INSERT_VECTOR_ELT [[UNMERGE3]]:_, [[INSERT_VAL]]:_(s16), [[ONE]]
CHECK: [[INSERT_V8_7_0:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[UNMERGE0]]:_(<2 x s16>), [[UNMERGE1]]:_(<2 x s16>), [[UNMERGE2]]:_(<2 x s16>), [[SUB_INSERT_7]]:_(<2 x s16>)
CHECK: [[UNMERGE1_0:%[0-9]+]]:_(s16), [[UNMERGE1_1:%[0-9]+]]:_(s16), [[UNMERGE1_2:%[0-9]+]]:_(s16), [[UNMERGE1_3:%[0-9]+]]:_(s16), [[UNMERGE1_4:%[0-9]+]]:_(s16), [[UNMERGE1_5:%[0-9]+]]:_(s16), [[UNMERGE1_6:%[0-9]+]]:_(s16), [[UNMERGE1_7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[VEC8]]:_(<8 x s16>)
CHECK: [[IMPDEF_S16:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
CHECK: [[BUILD0:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_0]]:_(s16), [[UNMERGE1_1]]:_(s16), [[UNMERGE1_2]]:_(s16)
CHECK: [[BUILD1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_3]]:_(s16), [[UNMERGE1_4]]:_(s16), [[UNMERGE1_5]]:_(s16)
CHECK: [[BUILD2:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_6]]:_(s16), [[UNMERGE1_7]]:_(s16), [[IMPDEF_S16]]:_(s16)
CHECK: [[IMPDEF_V3S16:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF
CHECK: [[ONE_1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
CHECK: [[SUB_INSERT_7_V3S16:%[0-9]+]]:_(<3 x s16>) = G_INSERT_VECTOR_ELT [[BUILD2]]:_, [[INSERT_VAL]]:_(s16), [[ONE_1]]
CHECK: [[WIDE_CONCAT:%[0-9]+]]:_(<24 x s16>) = G_CONCAT_VECTORS [[BUILD0]]:_(<3 x s16>), [[BUILD1]]:_(<3 x s16>), [[SUB_INSERT_7_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>)
CHECK: [[INSERT_V8_7_1:%[0-9]+]]:_(<8 x s16>) = G_EXTRACT [[WIDE_CONCAT]]:_(<24 x s16>), 0
CHECK: G_STORE [[INSERT_V8_7_0]]
CHECK: G_STORE [[INSERT_V8_7_1]]
)";
  // Check
  EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF;
}

} // namespace