1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[AMDGPU] DAG combine to produce V_PERM_B32

Differential Revision: https://reviews.llvm.org/D48099

llvm-svn: 334559
This commit is contained in:
Stanislav Mekhanoshin 2018-06-12 23:50:37 +00:00
parent 6ab778401f
commit 017429bc2b
6 changed files with 413 additions and 1 deletions

View File

@ -4119,6 +4119,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_I24)
NODE_NAME_CASE(MAD_I64_I32)
NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
NODE_NAME_CASE(EXPORT)
NODE_NAME_CASE(EXPORT_DONE)
@ -4374,6 +4375,34 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setHighBits(32 - MaxValBits);
break;
}
case AMDGPUISD::PERM: {
ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CMask)
return;
KnownBits LHSKnown, RHSKnown;
DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
unsigned Sel = CMask->getZExtValue();
for (unsigned I = 0; I < 32; I += 8) {
unsigned ByteMask = 0xff << I;
unsigned SelBits = Sel & 0xff;
if (SelBits < 4) {
Known.One |= RHSKnown.One & ByteMask;
Known.Zero |= RHSKnown.Zero & ByteMask;
} else if (SelBits < 7) {
Known.One |= LHSKnown.One & ByteMask;
Known.Zero |= LHSKnown.Zero & ByteMask;
} else if (SelBits == 0x0c) {
Known.Zero |= ByteMask;
} else if (SelBits > 0x0c) {
Known.One |= ByteMask;
}
Sel >>= 8;
}
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {

View File

@ -402,6 +402,7 @@ enum NodeType : unsigned {
MAD_I64_I32,
MUL_LOHI_I24,
MUL_LOHI_U24,
PERM,
TEXTURE_FETCH,
EXPORT, // exp on SI+
EXPORT_DONE, // exp on SI+ with done bit set

View File

@ -339,6 +339,8 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;

View File

@ -6135,6 +6135,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Walk the four bytes; every byte must be either fully set (0xff) or
  // fully clear (0x00) for the constant to be usable as a byte-select mask.
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0 (all bits set) if it does not succeed.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  // Only binary nodes with a constant RHS are candidates.
  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!RHSConst)
    return ~0;

  uint32_t C = RHSConst->getZExtValue();

  switch (V.getOpcode()) {
  case ISD::AND:
    // Kept bytes select themselves (0-3); masked-off bytes read zero (0xc).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;
  case ISD::OR:
    // Bytes forced to all-ones select 0xff; other bytes pass through (0-3).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;
  case ISD::SHL:
    // Only whole-byte shifts are expressible; vacated low bytes become zero.
    if (C % 8 == 0)
      return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    break;
  case ISD::SRL:
    // Whole-byte right shift; vacated high bytes become zero (0xc).
    if (C % 8 == 0)
      return uint32_t(0x0c0c0c0c03020100ull >> C);
    break;
  default:
    break;
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@ -6181,6 +6246,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
// and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
isa<ConstantSDNode>(LHS.getOperand(2))) {
uint32_t Sel = getConstantPermuteMask(Mask);
if (!Sel)
return SDValue();
// Select 0xc for all zero bytes
Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
}
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@ -6233,6 +6312,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
if (LHSMask > RHSMask) {
std::swap(LHSMask, RHSMask);
std::swap(LHS, RHS);
}
// Select 0xc for each lane used from source operand. Zero has 0xc mask
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
// Check if we need to combine values from two sources within a byte.
if (!(LHSUsedLanes & RHSUsedLanes) &&
// If we select high and lower word keep it for SDWA.
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
// Each byte in each mask is either selector mask 0-3, or has higher
// bits set in either of masks, which can be 0xff for 0xff or 0x0c for
// zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
// mask which is not 0xff wins. By anding both masks we have a correct
// result except that 0x0c shall be corrected to give 0x0c only.
uint32_t Mask = LHSMask & RHSMask;
for (unsigned I = 0; I < 32; I += 8) {
uint32_t ByteSel = 0xff << I;
if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
Mask &= (0x0c << I) & 0xffffffff;
}
// Add 4 to each active LHS lane. It will not affect any existing 0xff
// or 0x0c.
uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
LHS.getOperand(0), RHS.getOperand(0),
DAG.getConstant(Sel, DL, MVT::i32));
}
}
}
return SDValue();
}
@ -6268,6 +6395,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
// or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
LHS.getOpcode() == AMDGPUISD::PERM &&
isa<ConstantSDNode>(LHS.getOperand(2))) {
uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
if (!Sel)
return SDValue();
Sel |= LHS.getConstantOperandVal(2);
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
}
// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
if (LHSMask > RHSMask) {
std::swap(LHSMask, RHSMask);
std::swap(LHS, RHS);
}
// Select 0xc for each lane used from source operand. Zero has 0xc mask
// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
// Check if we need to combine values from two sources within a byte.
if (!(LHSUsedLanes & RHSUsedLanes) &&
// If we select high and lower word keep it for SDWA.
// TODO: teach SDWA to work with v_perm_b32 and remove the check.
!(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
// Kill zero bytes selected by other mask. Zero value is 0xc.
LHSMask &= ~RHSUsedLanes;
RHSMask &= ~LHSUsedLanes;
// Add 4 to each active LHS lane
LHSMask |= LHSUsedLanes & 0x04040404;
// Combine masks
uint32_t Sel = LHSMask | RHSMask;
SDLoc DL(N);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
LHS.getOperand(0), RHS.getOperand(0),
DAG.getConstant(Sel, DL, MVT::i32));
}
}
}
if (VT != MVT::i64)
return SDValue();

View File

@ -449,7 +449,7 @@ def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>;
} // End SubtargetPredicate = isVI
let Predicates = [Has16BitInsts] in {

View File

@ -0,0 +1,199 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; GCN-LABEL: {{^}}lsh8_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050400
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Disjoint byte merge: bytes 0-2 of the loaded value go to lanes 1-3, byte 0
; of %arg1 goes to lane 0 -> should fold to v_perm_b32 (selector 0x6050400).
define amdgpu_kernel void @lsh8_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 8 ; lanes 1-3 from source bytes 0-2
%tmp3 = and i32 %arg1, 255 ; lane 0 from %arg1 byte 0
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsr24_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Disjoint byte merge: byte 3 of the loaded value in lane 0, bytes 1-3 of
; %arg1 kept -> should fold to v_perm_b32 (selector 0x7060503).
define amdgpu_kernel void @lsr24_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = lshr i32 %tmp, 24 ; lane 0 from source byte 3
%tmp3 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7060503
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Same byte pattern as lsr24_or_and with the operands' roles swapped; the
; trailing xor of the sign bit must not block the v_perm_b32 combine.
define amdgpu_kernel void @and_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, 4294967040 ; 0xffffff00
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
%tmp5 = xor i32 %tmp4, -2147483648 ; flip bit 31 after the merge
store i32 %tmp5, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Byte-interleave of two sources via complementary AND masks
; (0xff00ff00 / 0x00ff00ff) -> v_perm_b32 (selector 0x7020500).
define amdgpu_kernel void @and_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, -16711936 ; 0xff00ff00
%tmp3 = and i32 %arg1, 16711935 ; 0x00ff00ff
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsh8_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x6050403
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl-by-8 of one source OR'd with lshr-by-24 of the other: a full byte
; rotate-style merge -> v_perm_b32 (selector 0x6050403).
define amdgpu_kernel void @lsh8_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 8 ; lanes 1-3 from source bytes 0-2
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}lsh16_or_lsr24:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x5040c03
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl-by-16 OR lshr-by-24: lane 1 is covered by neither source, so the
; selector uses 0x0c (zero) there -> v_perm_b32 (selector 0x5040c03).
define amdgpu_kernel void @lsh16_or_lsr24(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16 ; lanes 2-3 from source bytes 0-1
%tmp3 = lshr i32 %arg1, 24 ; lane 0 from %arg1 byte 3
%tmp4 = or i32 %tmp2, %tmp3
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_xor_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; xor of two AND results with disjoint byte masks behaves like or, so the
; combine should still produce v_perm_b32 (selector 0x7020104).
define amdgpu_kernel void @and_xor_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = and i32 %tmp, -16776961 ; 0xff0000ff
%tmp3 = and i32 %arg1, 16776960 ; 0x00ffff00
%tmp4 = xor i32 %tmp2, %tmp3 ; disjoint masks: xor == or
store i32 %tmp4, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_or_and:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; Mixes kept bytes, a byte from the other source, and bytes forced to
; all-ones -> selector 0xffff0500 (0xff entries select the constant 0xff).
define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%and = and i32 %tmp, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %arg1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, -65536 ; force the top two bytes to 0xff
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}and_or_and_shl:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; shl/and merge further masked with an AND: lanes 1-2 end up zero, giving
; 0x0c selectors -> v_perm_b32 (selector 0x50c0c00).
define amdgpu_kernel void @and_or_and_shl(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16 ; lanes 2-3 from source bytes 0-1
%tmp3 = and i32 %arg1, 65535 ; lanes 0-1 from %arg1
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335 ; 0xff0000ff: zero lanes 1-2
store i32 %and, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}or_and_or:
; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7020104
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; and of two OR-with-constant values (disjoint 0xff byte masks) -> the
; "and (op x, c1), (op y, c2)" combine -> v_perm_b32 (selector 0x7020104).
define amdgpu_kernel void @or_and_or(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%or1 = or i32 %tmp, 16776960 ; 0x00ffff00
%or2 = or i32 %arg1, 4278190335 ; 0xff0000ff
%and = and i32 %or1, %or2
store i32 %and, i32 addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}known_ffff0500:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
; Exercises computeKnownBits for AMDGPUISD::PERM: with selector 0xffff0500
; and the seeded bits (or 0x8000 / or 4), the second store's AND folds to
; the constant 0xffff8004 (checked as [[RES]] above).
define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%load = load i32, i32 addrspace(1)* %gep, align 4
%mask1 = or i32 %arg1, 32768 ; 0x8000
%mask2 = or i32 %load, 4
%and = and i32 %mask2, 16711935 ; 0x00ff00ff
%tmp1 = and i32 %mask1, 4294967040 ; 0xffffff00
%tmp2 = or i32 %tmp1, 4294901760 ; 0xffff0000
%tmp3 = or i32 %tmp2, %and
store i32 %tmp3, i32 addrspace(1)* %gep, align 4
%v = and i32 %tmp3, 4294934532 ; 0xffff8004
store i32 %v, i32 addrspace(1)* %arg, align 4
ret void
}
; GCN-LABEL: {{^}}known_050c0c00:
; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x50c0c00
; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 4{{$}}
; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]]
; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
; Exercises computeKnownBits for AMDGPUISD::PERM: selector 0x50c0c00 zeroes
; lanes 1-2 and the seeded "or 4" bit survives, so the second store's AND
; folds to the constant 4 (checked as [[RES]] above).
define amdgpu_kernel void @known_050c0c00(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i32, i32 addrspace(1)* %arg, i32 %id
%tmp = load i32, i32 addrspace(1)* %gep, align 4
%tmp2 = shl i32 %tmp, 16
%mask = or i32 %arg1, 4 ; seed a known-one bit
%tmp3 = and i32 %mask, 65535
%tmp4 = or i32 %tmp2, %tmp3
%and = and i32 %tmp4, 4278190335 ; 0xff0000ff
store i32 %and, i32 addrspace(1)* %gep, align 4
%v = and i32 %and, 16776964 ; 0x00ffff04
store i32 %v, i32 addrspace(1)* %arg, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()