mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
AMDGPU: Make better use of op_sel with high components
Handle more general swizzles. llvm-svn: 303296
This commit is contained in:
parent
2bb05753f1
commit
13c6f6f3f1
@ -1700,12 +1700,46 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Peel a single ISD::BITCAST off of Val. If Val is a bitcast node, its
// operand is returned; any other node is returned unchanged. Only one level
// is removed, so nested bitcasts are not looked through.
static SDValue stripBitcast(SDValue Val) {
  if (Val.getOpcode() != ISD::BITCAST)
    return Val;

  return Val.getOperand(0);
}
|
||||
|
||||
// Figure out if this is really an extract of the high 16-bits of a dword.
|
||||
static bool isExtractHiElt(SDValue In, SDValue &Out) {
|
||||
In = stripBitcast(In);
|
||||
if (In.getOpcode() != ISD::TRUNCATE)
|
||||
return false;
|
||||
|
||||
SDValue Srl = In.getOperand(0);
|
||||
if (Srl.getOpcode() == ISD::SRL) {
|
||||
if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
|
||||
if (ShiftAmt->getZExtValue() == 16) {
|
||||
Out = stripBitcast(Srl.getOperand(0));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look through operations that obscure just looking at the low 16-bits of the
|
||||
// same register.
|
||||
static SDValue stripExtractLoElt(SDValue In) {
|
||||
if (In.getOpcode() == ISD::TRUNCATE) {
|
||||
SDValue Src = In.getOperand(0);
|
||||
if (Src.getValueType().getSizeInBits() == 32)
|
||||
return stripBitcast(Src);
|
||||
}
|
||||
|
||||
return In;
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
|
||||
SDValue &SrcMods) const {
|
||||
unsigned Mods = 0;
|
||||
Src = In;
|
||||
|
||||
// FIXME: Look for fneg on separate components
|
||||
if (Src.getOpcode() == ISD::FNEG) {
|
||||
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
|
||||
Src = Src.getOperand(0);
|
||||
@ -1714,19 +1748,28 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
|
||||
if (Src.getOpcode() == ISD::BUILD_VECTOR) {
|
||||
unsigned VecMods = Mods;
|
||||
|
||||
SDValue Lo = Src.getOperand(0);
|
||||
SDValue Hi = Src.getOperand(1);
|
||||
SDValue Lo = stripBitcast(Src.getOperand(0));
|
||||
SDValue Hi = stripBitcast(Src.getOperand(1));
|
||||
|
||||
if (Lo.getOpcode() == ISD::FNEG) {
|
||||
Lo = Lo.getOperand(0);
|
||||
Lo = stripBitcast(Lo.getOperand(0));
|
||||
Mods ^= SISrcMods::NEG;
|
||||
}
|
||||
|
||||
if (Hi.getOpcode() == ISD::FNEG) {
|
||||
Hi = Hi.getOperand(0);
|
||||
Hi = stripBitcast(Hi.getOperand(0));
|
||||
Mods ^= SISrcMods::NEG_HI;
|
||||
}
|
||||
|
||||
if (isExtractHiElt(Lo, Lo))
|
||||
Mods |= SISrcMods::OP_SEL_0;
|
||||
|
||||
if (isExtractHiElt(Hi, Hi))
|
||||
Mods |= SISrcMods::OP_SEL_1;
|
||||
|
||||
Lo = stripExtractLoElt(Lo);
|
||||
Hi = stripExtractLoElt(Hi);
|
||||
|
||||
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
|
||||
// Really a scalar input. Just select from the low half of the register to
|
||||
// avoid packing.
|
||||
@ -1740,9 +1783,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
|
||||
}
|
||||
|
||||
// Packed instructions do not have abs modifiers.
|
||||
|
||||
// FIXME: Handle abs/neg of individual components.
|
||||
// FIXME: Handle swizzling with op_sel
|
||||
Mods |= SISrcMods::OP_SEL_1;
|
||||
|
||||
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
|
||||
|
@ -2660,6 +2660,15 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
|
||||
SDValue Vec = Op.getOperand(0);
|
||||
SDValue Idx = Op.getOperand(1);
|
||||
|
||||
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
|
||||
|
||||
// Make sure we do any optimizations that will make it easier to fold
|
||||
// source modifiers before obscuring it with bit operations.
|
||||
|
||||
// XXX - Why doesn't this get called when vector_shuffle is expanded?
|
||||
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
|
||||
return Combined;
|
||||
|
||||
if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
|
||||
SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
|
||||
|
||||
|
@ -181,8 +181,7 @@ bb:
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
|
||||
; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
|
||||
; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
|
||||
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
|
||||
bb:
|
||||
%vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
|
||||
@ -260,6 +259,434 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
|
||||
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%vec2.elt1 = extractelement <2 x half> %vec2, i32 1
|
||||
%neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
|
||||
|
||||
%neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_vector_scalar_hi:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
|
||||
define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
|
||||
|
||||
%vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
|
||||
|
||||
%vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
|
||||
%result = add <2 x i16> %vec0, %vec1.elt1.broadcast
|
||||
|
||||
store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
|
||||
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
|
||||
%neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
|
||||
%neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
|
||||
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
; GCN-NOT: xor
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
|
||||
%neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
; GCN-NOT: xor
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
; GCN-NOT: xor
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
; GCN-NOT: xor
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: or
|
||||
; GCN-NOT: xor
|
||||
|
||||
; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
|
||||
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
%neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
|
||||
%combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}bitcast_fneg_f32:
|
||||
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
|
||||
define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%f32 = load volatile float, float addrspace(3)* undef, align 4
|
||||
%neg.f32 = fsub float -0.0, %f32
|
||||
%bc = bitcast float %neg.f32 to <2 x half>
|
||||
%result = fadd <2 x half> %vec0, %bc
|
||||
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
|
||||
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
|
||||
define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
|
||||
%f32 = load volatile float, float addrspace(3)* undef, align 4
|
||||
%neg.f32 = fsub float -0.0, %f32
|
||||
%bc = bitcast float %neg.f32 to <2 x half>
|
||||
%shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
|
||||
%result = fadd <2 x half> %vec0, %shuf
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}extract_from_i64:
|
||||
; GCN: v_lshl_or_b32
|
||||
; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
|
||||
define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
|
||||
%i64 = load volatile i64, i64 addrspace(1)* undef
|
||||
|
||||
%elt0 = trunc i64 %i64 to i16
|
||||
%hi = lshr i64 %i64, 16
|
||||
%elt1 = trunc i64 %hi to i16
|
||||
|
||||
%ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
|
||||
%ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
|
||||
%result = add <2 x i16> %vec0, %ins1
|
||||
store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Bitcast is final obstacle to identifying same source register
|
||||
; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: _or
|
||||
|
||||
; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
|
||||
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
|
||||
define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%scalar0 = load volatile i16, i16 addrspace(1)* undef
|
||||
%shl = shl i16 %scalar0, 1
|
||||
%shl.bc = bitcast i16 %shl to half
|
||||
|
||||
%fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
|
||||
%shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
|
||||
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; Bitcast is final obstacle to identifying same source register
|
||||
; GCN-LABEL: {{^}}mix_elt_types_op_sel:
|
||||
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
|
||||
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: pack
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: shl
|
||||
; GCN-NOT: _or
|
||||
|
||||
; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
|
||||
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
|
||||
define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
|
||||
bb:
|
||||
%lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
|
||||
%lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
|
||||
|
||||
%vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
|
||||
%vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
|
||||
%vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
|
||||
|
||||
%scalar0 = load volatile i16, i16 addrspace(1)* undef
|
||||
%scalar1 = load volatile half, half addrspace(1)* undef
|
||||
%shl = shl i16 %scalar0, 1
|
||||
%shl.bc = bitcast i16 %shl to half
|
||||
|
||||
%insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
|
||||
|
||||
%fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
|
||||
%insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
|
||||
|
||||
%result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
|
||||
store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
Loading…
Reference in New Issue
Block a user