mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
[AMDGPU] DAG combine to produce V_PERM_B32
Differential Revision: https://reviews.llvm.org/D48099 llvm-svn: 334559
This commit is contained in:
parent
6ab778401f
commit
017429bc2b
@ -4119,6 +4119,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(MAD_I24)
|
||||
NODE_NAME_CASE(MAD_I64_I32)
|
||||
NODE_NAME_CASE(MAD_U64_U32)
|
||||
NODE_NAME_CASE(PERM)
|
||||
NODE_NAME_CASE(TEXTURE_FETCH)
|
||||
NODE_NAME_CASE(EXPORT)
|
||||
NODE_NAME_CASE(EXPORT_DONE)
|
||||
@ -4374,6 +4375,34 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
|
||||
Known.Zero.setHighBits(32 - MaxValBits);
|
||||
break;
|
||||
}
|
||||
case AMDGPUISD::PERM: {
  // v_perm_b32: each selector byte of the mask operand picks one byte of the
  // result. Without a constant mask nothing can be inferred.
  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CMask)
    return;

  KnownBits LHSKnown, RHSKnown;
  DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
  DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
  unsigned Sel = CMask->getZExtValue();

  for (unsigned I = 0; I < 32; I += 8) {
    unsigned SelBits = Sel & 0xff;
    if (SelBits < 4) {
      // Selector 0-3 picks byte SelBits of the second source. Shift that
      // byte's known bits from its source position into result byte I.
      // (Masking the destination position instead would be wrong whenever
      // the selector moves a byte, e.g. a byte swap.)
      SelBits *= 8;
      Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
      Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    } else if (SelBits < 7) {
      // Selector 4-6 picks byte SelBits-4 of the first source.
      SelBits = (SelBits & 3) * 8;
      Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
      Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
    } else if (SelBits == 0x0c) {
      // Selector 0x0c yields a zero byte.
      Known.Zero |= 0xffull << I;
    } else if (SelBits > 0x0c) {
      // Selectors above 0x0c yield 0xff.
      Known.One |= 0xffull << I;
    }
    Sel >>= 8;
  }
  break;
}
|
||||
case ISD::INTRINSIC_WO_CHAIN: {
|
||||
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
||||
switch (IID) {
|
||||
|
@ -402,6 +402,7 @@ enum NodeType : unsigned {
|
||||
MAD_I64_I32,
|
||||
MUL_LOHI_I24,
|
||||
MUL_LOHI_U24,
|
||||
PERM,
|
||||
TEXTURE_FETCH,
|
||||
EXPORT, // exp on SI+
|
||||
EXPORT_DONE, // exp on SI+ with done bit set
|
||||
|
@ -339,6 +339,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
|
||||
|
||||
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
|
||||
|
||||
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
|
||||
|
||||
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
|
||||
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
|
||||
[SDNPHasChain, SDNPInGlue]>;
|
||||
|
@ -6135,6 +6135,71 @@ static bool isBoolSGPR(SDValue V) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// If every byte of the constant C is either fully zero or fully ones, return
// C unchanged; otherwise (some byte is only partially set) return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Accumulate 0xff in ZeroByteMask for every byte of C that is zero.
  uint32_t ZeroByteMask = 0;
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    const uint32_t Byte = 0xffu << Shift;
    if ((C & Byte) == 0)
      ZeroByteMask |= Byte;
  }
  // Every remaining (non-zero) byte must be saturated to 0xff.
  const uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((C & NonZeroByteMask) != NonZeroByteMask)
    return 0; // Some byte holds a partial bit pattern.
  return C;
}
|
||||
|
||||
// Check if a node selects whole bytes from its operand 0 starting at a byte
|
||||
// boundary while masking the rest. Returns select mask as in the v_perm_b32
|
||||
// or -1 if not succeeded.
|
||||
// Note byte select encoding:
|
||||
// value 0-3 selects corresponding source byte;
|
||||
// value 0xc selects zero;
|
||||
// value 0xff selects 0xff.
|
||||
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
|
||||
assert(V.getValueSizeInBits() == 32);
|
||||
|
||||
if (V.getNumOperands() != 2)
|
||||
return ~0;
|
||||
|
||||
ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
|
||||
if (!N1)
|
||||
return ~0;
|
||||
|
||||
uint32_t C = N1->getZExtValue();
|
||||
|
||||
switch (V.getOpcode()) {
|
||||
default:
|
||||
break;
|
||||
case ISD::AND:
|
||||
if (uint32_t ConstMask = getConstantPermuteMask(C)) {
|
||||
return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
|
||||
}
|
||||
break;
|
||||
|
||||
case ISD::OR:
|
||||
if (uint32_t ConstMask = getConstantPermuteMask(C)) {
|
||||
return (0x03020100 & ~ConstMask) | ConstMask;
|
||||
}
|
||||
break;
|
||||
|
||||
case ISD::SHL:
|
||||
if (C % 8)
|
||||
return ~0;
|
||||
|
||||
return uint32_t((0x030201000c0c0c0cull << C) >> 32);
|
||||
|
||||
case ISD::SRL:
|
||||
if (C % 8)
|
||||
return ~0;
|
||||
|
||||
return uint32_t(0x0c0c0c0c03020100ull >> C);
|
||||
}
|
||||
|
||||
return ~0;
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::performAndCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
if (DCI.isBeforeLegalize())
|
||||
@ -6181,6 +6246,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
|
||||
if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
|
||||
isa<ConstantSDNode>(LHS.getOperand(2))) {
|
||||
uint32_t Sel = getConstantPermuteMask(Mask);
|
||||
if (!Sel)
|
||||
return SDValue();
|
||||
|
||||
// Select 0xc for all zero bytes
|
||||
Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
|
||||
SDLoc DL(N);
|
||||
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
|
||||
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
|
||||
}
|
||||
}
|
||||
|
||||
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
|
||||
@ -6233,6 +6312,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
|
||||
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
|
||||
}
|
||||
|
||||
// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
|
||||
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
||||
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
|
||||
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
|
||||
uint32_t LHSMask = getPermuteMask(DAG, LHS);
|
||||
uint32_t RHSMask = getPermuteMask(DAG, RHS);
|
||||
if (LHSMask != ~0u && RHSMask != ~0u) {
|
||||
// Canonicalize the expression in an attempt to have fewer unique masks
|
||||
// and therefore fewer registers used to hold the masks.
|
||||
if (LHSMask > RHSMask) {
|
||||
std::swap(LHSMask, RHSMask);
|
||||
std::swap(LHS, RHS);
|
||||
}
|
||||
|
||||
// Select 0xc for each lane used from source operand. Zero has 0xc mask
|
||||
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
|
||||
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
|
||||
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
|
||||
|
||||
// Check if we need to combine values from two sources within a byte.
|
||||
if (!(LHSUsedLanes & RHSUsedLanes) &&
|
||||
// If we select high and lower word keep it for SDWA.
|
||||
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
|
||||
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
|
||||
// Each byte in each mask is either selector mask 0-3, or has higher
|
||||
// bits set in either of masks, which can be 0xff for 0xff or 0x0c for
|
||||
// zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
|
||||
// mask which is not 0xff wins. By anding both masks we have a correct
|
||||
// result except that 0x0c shall be corrected to give 0x0c only.
|
||||
uint32_t Mask = LHSMask & RHSMask;
|
||||
for (unsigned I = 0; I < 32; I += 8) {
|
||||
uint32_t ByteSel = 0xff << I;
|
||||
if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
|
||||
Mask &= (0x0c << I) & 0xffffffff;
|
||||
}
|
||||
|
||||
// Add 4 to each active LHS lane. It will not affect any existing 0xff
|
||||
// or 0x0c.
|
||||
uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
|
||||
SDLoc DL(N);
|
||||
|
||||
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
|
||||
LHS.getOperand(0), RHS.getOperand(0),
|
||||
DAG.getConstant(Sel, DL, MVT::i32));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
@ -6268,6 +6395,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
|
||||
if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
|
||||
LHS.getOpcode() == AMDGPUISD::PERM &&
|
||||
isa<ConstantSDNode>(LHS.getOperand(2))) {
|
||||
uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
|
||||
if (!Sel)
|
||||
return SDValue();
|
||||
|
||||
Sel |= LHS.getConstantOperandVal(2);
|
||||
SDLoc DL(N);
|
||||
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
|
||||
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
|
||||
}
|
||||
|
||||
// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
|
||||
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
||||
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
|
||||
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
|
||||
uint32_t LHSMask = getPermuteMask(DAG, LHS);
|
||||
uint32_t RHSMask = getPermuteMask(DAG, RHS);
|
||||
if (LHSMask != ~0u && RHSMask != ~0u) {
|
||||
// Canonicalize the expression in an attempt to have fewer unique masks
|
||||
// and therefore fewer registers used to hold the masks.
|
||||
if (LHSMask > RHSMask) {
|
||||
std::swap(LHSMask, RHSMask);
|
||||
std::swap(LHS, RHS);
|
||||
}
|
||||
|
||||
// Select 0xc for each lane used from source operand. Zero has 0xc mask
|
||||
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
|
||||
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
|
||||
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
|
||||
|
||||
// Check if we need to combine values from two sources within a byte.
|
||||
if (!(LHSUsedLanes & RHSUsedLanes) &&
|
||||
// If we select high and lower word keep it for SDWA.
|
||||
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
|
||||
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
|
||||
// Kill zero bytes selected by other mask. Zero value is 0xc.
|
||||
LHSMask &= ~RHSUsedLanes;
|
||||
RHSMask &= ~LHSUsedLanes;
|
||||
// Add 4 to each active LHS lane
|
||||
LHSMask |= LHSUsedLanes & 0x04040404;
|
||||
// Combine masks
|
||||
uint32_t Sel = LHSMask | RHSMask;
|
||||
SDLoc DL(N);
|
||||
|
||||
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
|
||||
LHS.getOperand(0), RHS.getOperand(0),
|
||||
DAG.getConstant(Sel, DL, MVT::i32));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (VT != MVT::i64)
|
||||
return SDValue();
|
||||
|
||||
|
@ -449,7 +449,7 @@ def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
|
||||
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
|
||||
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
|
||||
|
||||
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
|
||||
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
|
||||
} // End SubtargetPredicate = isVI
|
||||
|
||||
let Predicates = [Has16BitInsts] in {
|
||||
|
199
test/CodeGen/AMDGPU/permute.ll
Normal file
199
test/CodeGen/AMDGPU/permute.ll
Normal file
@ -0,0 +1,199 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
|
||||
; (x << 8) | (y & 0xff): every result byte comes wholly from one source, so
; the or must fold to a single v_perm_b32 with selector 0x06050400.
; GCN-LABEL: {{^}}lsh8_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050400
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = shl i32 %tmp, 8
  %tmp3 = and i32 %arg1, 255
  %tmp4 = or i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; (x >> 24) | (y & 0xffffff00): byte 0 is x's byte 3, bytes 1-3 are y's;
; folds to v_perm_b32 with selector 0x07060503.
; GCN-LABEL: {{^}}lsr24_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = lshr i32 %tmp, 24
  %tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
  %tmp4 = or i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; Same byte pattern as lsr24_or_and but the or result feeds a further xor;
; the perm must still be formed (selector 0x07060503).
; GCN-LABEL: {{^}}and_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
  %tmp3 = lshr i32 %arg1, 24
  %tmp4 = or i32 %tmp2, %tmp3
  %tmp5 = xor i32 %tmp4, -2147483648
  store i32 %tmp5, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; (x & 0xff00ff00) | (y & 0x00ff00ff): disjoint byte masks on two sources;
; folds to v_perm_b32 with selector 0x07020500.
; GCN-LABEL: {{^}}and_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = and i32 %tmp, -16711936
  %tmp3 = and i32 %arg1, 16711935
  %tmp4 = or i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; (x << 8) | (y >> 24): combines a left and a right whole-byte shift;
; folds to v_perm_b32 with selector 0x06050403.
; GCN-LABEL: {{^}}lsh8_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050403
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = shl i32 %tmp, 8
  %tmp3 = lshr i32 %arg1, 24
  %tmp4 = or i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; (x << 16) | (y >> 24): byte 1 of the result is zero from both sources,
; encoded with the 0x0c (zero) selector: mask 0x05040c03.
; GCN-LABEL: {{^}}lsh16_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x5040c03
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = shl i32 %tmp, 16
  %tmp3 = lshr i32 %arg1, 24
  %tmp4 = or i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; xor of byte-disjoint masked values behaves like or, so the combine also
; fires for xor: selector 0x07020104.
; GCN-LABEL: {{^}}and_xor_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = and i32 %tmp, -16776961
  %tmp3 = and i32 %arg1, 16776960
  %tmp4 = xor i32 %tmp2, %tmp3
  store i32 %tmp4, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; An extra "or const" on one path folds into the perm mask: all-ones bytes
; use the 0xff selector, giving mask 0xffff0500.
; GCN-LABEL: {{^}}and_or_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %and = and i32 %tmp, 16711935 ; 0x00ff00ff
  %tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
  %tmp2 = or i32 %tmp1, -65536
  %tmp3 = or i32 %tmp2, %and
  store i32 %tmp3, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; and of (shl | and) re-folds into an existing perm: zeroed bytes get the
; 0x0c selector, giving mask 0x050c0c00.
; GCN-LABEL: {{^}}and_or_and_shl:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = shl i32 %tmp, 16
  %tmp3 = and i32 %arg1, 65535
  %tmp4 = or i32 %tmp2, %tmp3
  %and = and i32 %tmp4, 4278190335
  store i32 %and, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; and of (or x, C1), (or y, C2) with byte-disjoint constants also forms a
; perm: selector 0x07020104.
; GCN-LABEL: {{^}}or_and_or:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %or1 = or i32 %tmp, 16776960 ; 0x00ffff00
  %or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
  %and = and i32 %or1, %or2
  store i32 %and, i32 addrspace(1)* %gep, align 4
  ret void
}
|
||||
|
||||
; Exercises computeKnownBits for AMDGPUISD::PERM: the bits known through the
; perm (mask 0xffff0500) let the trailing and fold to the constant 0xffff8004.
; GCN-LABEL: {{^}}known_ffff0500:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %mask1 = or i32 %arg1, 32768 ; 0x8000
  %mask2 = or i32 %load, 4
  %and = and i32 %mask2, 16711935 ; 0x00ff00ff
  %tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
  %tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
  %tmp3 = or i32 %tmp2, %and
  store i32 %tmp3, i32 addrspace(1)* %gep, align 4
  %v = and i32 %tmp3, 4294934532 ; 0xffff8004
  store i32 %v, i32 addrspace(1)* %arg, align 4
  ret void
}
|
||||
|
||||
; Exercises computeKnownBits for AMDGPUISD::PERM with zero-byte selectors
; (mask 0x050c0c00): the trailing and folds to the constant 4.
; GCN-LABEL: {{^}}known_050c0c00:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 4{{$}}
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
  %tmp = load i32, i32 addrspace(1)* %gep, align 4
  %tmp2 = shl i32 %tmp, 16
  %mask = or i32 %arg1, 4
  %tmp3 = and i32 %mask, 65535
  %tmp4 = or i32 %tmp2, %tmp3
  %and = and i32 %tmp4, 4278190335
  store i32 %and, i32 addrspace(1)* %gep, align 4
  %v = and i32 %and, 16776964
  store i32 %v, i32 addrspace(1)* %arg, align 4
  ret void
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
Loading…
Reference in New Issue
Block a user