1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[AMDGPU] DAG combine to produce V_PERM_B32

Differential Revision: https://reviews.llvm.org/D48099

llvm-svn: 334559
This commit is contained in:
Stanislav Mekhanoshin 2018-06-12 23:50:37 +00:00
parent 6ab778401f
commit 017429bc2b
6 changed files with 413 additions and 1 deletions

View File

@ -4119,6 +4119,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@ -4374,6 +4375,34 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
case AMDGPUISD::PERM: {
ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CMask)
return;
KnownBits LHSKnown, RHSKnown;
DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
unsigned Sel = CMask->getZExtValue();
for (unsigned I = 0; I < 32; I += 8) {
unsigned ByteMask = 0xff << I;
unsigned SelBits = Sel & 0xff;
if (SelBits < 4) {
Known.One |= RHSKnown.One & ByteMask;
Known.Zero |= RHSKnown.Zero & ByteMask;
} else if (SelBits < 7) {
Known.One |= LHSKnown.One & ByteMask;
Known.Zero |= LHSKnown.Zero & ByteMask;
} else if (SelBits == 0x0c) {
Known.Zero |= ByteMask;
} else if (SelBits > 0x0c) {
Known.One |= ByteMask;
}
Sel >>= 8;
}
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {

View File

@ -402,6 +402,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set

View File

@ -339,6 +339,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;

View File

@ -6135,6 +6135,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Walk the four bytes; every byte must be either fully set (0xff) or
  // fully clear (0x00) for the constant to be usable as a byte-select mask.
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0 (all bits set) if it does not succeed.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  // Only binary nodes with a constant RHS are candidates.
  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!RHSConst)
    return ~0;

  uint32_t C = RHSConst->getZExtValue();

  switch (V.getOpcode()) {
  case ISD::AND:
    // Kept bytes select themselves (0-3); masked-off bytes read zero (0xc).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;
  case ISD::OR:
    // Bytes forced to all-ones select 0xff; other bytes pass through (0-3).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;
  case ISD::SHL:
    // Only whole-byte shifts are expressible; vacated low bytes become zero.
    if (C % 8 == 0)
      return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    break;
  case ISD::SRL:
    // Whole-byte right shift; vacated high bytes become zero (0xc).
    if (C % 8 == 0)
      return uint32_t(0x0c0c0c0c03020100ull >> C);
    break;
  default:
    break;
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@ -6181,6 +6246,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
// and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
isa<ConstantSDNode>(LHS.getOperand(2))) {
uint32_t Sel = getConstantPermuteMask(Mask);
if (!Sel)
return SDValue();
// Select 0xc for all zero bytes
Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
}
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@ -6233,6 +6312,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
if (LHSMask > RHSMask) {
std::swap(LHSMask, RHSMask);
std::swap(LHS, RHS);
}
// Select 0xc for each lane used from source operand. Zero has 0xc mask
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
// Check if we need to combine values from two sources within a byte.
if (!(LHSUsedLanes & RHSUsedLanes) &&
// If we select high and lower word keep it for SDWA.
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
// Each byte in each mask is either selector mask 0-3, or has higher
// bits set in either of masks, which can be 0xff for 0xff or 0x0c for
// zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
// mask which is not 0xff wins. By anding both masks we have a correct
// result except that 0x0c shall be corrected to give 0x0c only.
uint32_t Mask = LHSMask & RHSMask;
for (unsigned I = 0; I < 32; I += 8) {
uint32_t ByteSel = 0xff << I;
if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
Mask &= (0x0c << I) & 0xffffffff;
}
// Add 4 to each active LHS lane. It will not affect any existing 0xff
// or 0x0c.
uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
LHS.getOperand(0), RHS.getOperand(0),
DAG.getConstant(Sel, DL, MVT::i32));
}
}
}
return SDValue();
}
@ -6268,6 +6395,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
// or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
LHS.getOpcode() == AMDGPUISD::PERM &&
isa<ConstantSDNode>(LHS.getOperand(2))) {
uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
if (!Sel)
return SDValue();
Sel |= LHS.getConstantOperandVal(2);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
}
// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
if (LHSMask > RHSMask) {
std::swap(LHSMask, RHSMask);
std::swap(LHS, RHS);
}
// Select 0xc for each lane used from source operand. Zero has 0xc mask
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
// Check if we need to combine values from two sources within a byte.
if (!(LHSUsedLanes & RHSUsedLanes) &&
// If we select high and lower word keep it for SDWA.
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
// Kill zero bytes selected by other mask. Zero value is 0xc.
LHSMask &= ~RHSUsedLanes;
RHSMask &= ~LHSUsedLanes;
// Add 4 to each active LHS lane
LHSMask |= LHSUsedLanes & 0x04040404;
// Combine masks
uint32_t Sel = LHSMask | RHSMask;
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
LHS.getOperand(0), RHS.getOperand(0),
DAG.getConstant(Sel, DL, MVT::i32));
}
}
}
if (VT != MVT::i64)
return SDValue();

View File

@ -449,7 +449,7 @@ def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {

View File

@ -0,0 +1,199 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}lsh8_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050400
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Disjoint byte merge: bytes 0-2 of the loaded value go to lanes 1-3, byte 0
; of %arg1 goes to lane 0 -> should fold to v_perm_b32 (selector 0x6050400).
define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 8 ; lanes 1-3 from source bytes 0-2
%tmp3 = and i32 %arg1, 255 ; lane 0 from %arg1 byte 0
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsr24_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Disjoint byte merge: byte 3 of the loaded value in lane 0, bytes 1-3 of
; %arg1 kept -> should fold to v_perm_b32 (selector 0x7060503).
define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = lshr i32 %tmp, 24 ; lane 0 from source byte 3
%tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Same byte pattern as lsr24_or_and with the operands' roles swapped; the
; trailing xor of the sign bit must not block the v_perm_b32 combine.
define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
%tmp5 = xor i32 %tmp4, -2147483648 ; flip bit 31 after the merge
store i32 %tmp5, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Byte-interleave of two sources via complementary AND masks
; (0xff00ff00 / 0x00ff00ff) -> v_perm_b32 (selector 0x7020500).
define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, -16711936 ; 0xff00ff00
%tmp3 = and i32 %arg1, 16711935 ; 0x00ff00ff
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsh8_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050403
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl-by-8 of one source OR'd with lshr-by-24 of the other: a full byte
; rotate-style merge -> v_perm_b32 (selector 0x6050403).
define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 8 ; lanes 1-3 from source bytes 0-2
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsh16_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x5040c03
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl-by-16 OR lshr-by-24: lane 1 is covered by neither source, so the
; selector uses 0x0c (zero) there -> v_perm_b32 (selector 0x5040c03).
define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16 ; lanes 2-3 from source bytes 0-1
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_xor_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; xor of two AND results with disjoint byte masks behaves like or, so the
; combine should still produce v_perm_b32 (selector 0x7020104).
define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, -16776961 ; 0xff0000ff
%tmp3 = and i32 %arg1, 16776960 ; 0x00ffff00
%tmp4 = xor i32 %tmp2, %tmp3 ; disjoint masks: xor == or
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Mixes kept bytes, a byte from the other source, and bytes forced to
; all-ones -> selector 0xffff0500 (0xff entries select the constant 0xff).
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %tmp, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, -65536 ; force the top two bytes to 0xff
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_and_shl:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl/and merge further masked with an AND: lanes 1-2 end up zero, giving
; 0x0c selectors -> v_perm_b32 (selector 0x50c0c00).
define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16 ; lanes 2-3 from source bytes 0-1
%tmp3 = and i32 %arg1, 65535 ; lanes 0-1 from %arg1
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335 ; 0xff0000ff: zero lanes 1-2
store i32 %and, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}or_and_or:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; and of two OR-with-constant values (disjoint 0xff byte masks) -> the
; "and (op x, c1), (op y, c2)" combine -> v_perm_b32 (selector 0x7020104).
define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%or1 = or i32 %tmp, 16776960 ; 0x00ffff00
%or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
%and = and i32 %or1, %or2
store i32 %and, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}known_ffff0500:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
; Exercises computeKnownBits for AMDGPUISD::PERM: with selector 0xffff0500
; and the seeded bits (or 0x8000 / or 4), the second store's AND folds to
; the constant 0xffff8004 (checked as [[RES]] above).
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep, align 4
%mask1 = or i32 %arg1, 32768 ; 0x8000
%mask2 = or i32 %load, 4
%and = and i32 %mask2, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
%v = and i32 %tmp3, 4294934532 ; 0xffff8004
store i32 %v, i32 addrspace(1)* %arg, align 4
ret void
}
; GCN-LABEL: {{^}}known_050c0c00:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 4{{$}}
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
; Exercises computeKnownBits for AMDGPUISD::PERM: selector 0x50c0c00 zeroes
; lanes 1-2 and the seeded "or 4" bit survives, so the second store's AND
; folds to the constant 4 (checked as [[RES]] above).
define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16
%mask = or i32 %arg1, 4 ; seed a known-one bit
%tmp3 = and i32 %mask, 65535
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335 ; 0xff0000ff
store i32 %and, i32 addrspace(1)* %gep, align 4
%v = and i32 %and, 16776964 ; 0x00ffff04
store i32 %v, i32 addrspace(1)* %arg, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()