mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[DAG] MatchRotate - Add funnel shift by variable support
Follow-up to D75114: this patch reuses the existing MatchRotate ROTL/ROTR rotation-pattern code to also recognize the more general FSHL/FSHR funnel-shift patterns when the shift amounts are variable. These patterns are matched with MatchFunnelPosNeg, which acts in an (almost) equivalent manner to MatchRotatePosNeg.
This commit is contained in:
parent
51df7978a6
commit
1f827bf828
@ -555,6 +555,10 @@ namespace {
|
||||
SDValue InnerPos, SDValue InnerNeg,
|
||||
unsigned PosOpcode, unsigned NegOpcode,
|
||||
const SDLoc &DL);
|
||||
// Funnel-shift counterpart of MatchRotatePosNeg: takes the two shifted
// operands N0 and N1 rather than a single shifted value.
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
|
||||
SDValue InnerPos, SDValue InnerNeg,
|
||||
unsigned PosOpcode, unsigned NegOpcode,
|
||||
const SDLoc &DL);
|
||||
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
|
||||
SDValue MatchLoadCombine(SDNode *N);
|
||||
SDValue MatchStoreCombine(StoreSDNode *N);
|
||||
@ -6319,6 +6323,33 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// A subroutine of MatchRotate used once we have found an OR of two opposite
|
||||
// shifts of N0 and N1. If Neg == <operand size> - Pos then the OR reduces
|
||||
// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
|
||||
// former being preferred if supported. InnerPos and InnerNeg are Pos and
|
||||
// Neg with outer conversions stripped away.
|
||||
// Returns the newly built funnel-shift node on a match, or an empty SDValue
// when the shift amounts do not match.
// TODO: Merge with MatchRotatePosNeg.
|
||||
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
|
||||
SDValue Neg, SDValue InnerPos,
|
||||
SDValue InnerNeg, unsigned PosOpcode,
|
||||
unsigned NegOpcode, const SDLoc &DL) {
|
||||
// The folds below use 32 as an example operand size.
// fold (or (shl x0, (*ext y)),
|
||||
// (srl x1, (*ext (sub 32, y)))) ->
|
||||
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
|
||||
//
|
||||
// fold (or (shl x0, (*ext (sub 32, y))),
|
||||
// (srl x1, (*ext y))) ->
|
||||
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
|
||||
// The result type is taken from N0.
EVT VT = N0.getValueType();
|
||||
// matchRotateSub receives the stripped shift amounts and the scalar bit
// width; presumably it verifies the Neg == <operand size> - Pos relation
// described above -- confirm against its definition.
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
|
||||
// Prefer PosOpcode when it is legal or custom for VT; otherwise emit
// NegOpcode with the Neg amount instead.
// NOTE(review): NegOpcode's availability is not checked here; presumably
// the caller guarantees at least one of the two opcodes is usable -- confirm.
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
|
||||
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
|
||||
HasPos ? Pos : Neg);
|
||||
}
|
||||
|
||||
// No match: hand back an empty SDValue.
return SDValue();
|
||||
}
|
||||
|
||||
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
|
||||
// idioms for rotate, and if the target supports rotation instructions, generate
|
||||
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
|
||||
@ -6444,10 +6475,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
|
||||
return Res;
|
||||
}
|
||||
|
||||
// TODO: Handle variable funnel shifts.
|
||||
if (!IsRotate)
|
||||
return SDValue();
|
||||
|
||||
// If there is a mask here, and we have a variable shift, we can't be sure
|
||||
// that we're masking out the right stuff.
|
||||
if (LHSMask.getNode() || RHSMask.getNode())
|
||||
@ -6468,13 +6495,29 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
|
||||
RExtOp0 = RHSShiftAmt.getOperand(0);
|
||||
}
|
||||
|
||||
SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
|
||||
LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
|
||||
if (IsRotate && (HasROTL || HasROTR)) {
|
||||
SDValue TryL =
|
||||
MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
|
||||
RExtOp0, ISD::ROTL, ISD::ROTR, DL);
|
||||
if (TryL)
|
||||
return TryL;
|
||||
|
||||
SDValue TryR =
|
||||
MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
|
||||
LExtOp0, ISD::ROTR, ISD::ROTL, DL);
|
||||
if (TryR)
|
||||
return TryR;
|
||||
}
|
||||
|
||||
SDValue TryL =
|
||||
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
|
||||
LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
|
||||
if (TryL)
|
||||
return TryL;
|
||||
|
||||
SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
|
||||
RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
|
||||
SDValue TryR =
|
||||
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
|
||||
RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
|
||||
if (TryR)
|
||||
return TryR;
|
||||
|
||||
|
@ -17,11 +17,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_sub_i32 s8, 32, s2
|
||||
; SI-NEXT: s_lshl_b32 s3, s0, s2
|
||||
; SI-NEXT: s_lshr_b32 s1, s1, s8
|
||||
; SI-NEXT: s_or_b32 s1, s3, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
||||
@ -34,14 +33,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; VI-NEXT: s_lshr_b32 s1, s1, s3
|
||||
; VI-NEXT: s_or_b32 s0, s0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -53,14 +51,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
@ -68,7 +65,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
||||
;
|
||||
; R600-LABEL: fshl_i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
@ -77,9 +74,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T1.W, literal.x, PV.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHR T1.W, KC0[2].W, PV.W,
|
||||
; R600-NEXT: LSHL * T2.W, KC0[2].Z, T0.W,
|
||||
; R600-NEXT: OR_INT * T1.W, PS, PV.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[2].Z, KC0[2].W, PV.W,
|
||||
; R600-NEXT: CNDE_INT T0.X, T0.W, KC0[2].Z, PV.W,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
@ -153,22 +148,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_sub_i32 s11, 32, s1
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: s_lshl_b32 s10, s3, s1
|
||||
; SI-NEXT: s_lshr_b32 s9, s9, s11
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s0
|
||||
; SI-NEXT: s_or_b32 s9, s10, s9
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: s_lshl_b32 s1, s2, s0
|
||||
; SI-NEXT: s_lshr_b32 s3, s8, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; SI-NEXT: s_or_b32 s1, s1, s3
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_sub_i32 s10, 32, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
@ -181,22 +174,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_sub_i32 s9, 32, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: s_lshl_b32 s8, s5, s1
|
||||
; VI-NEXT: s_lshr_b32 s7, s7, s9
|
||||
; VI-NEXT: s_sub_i32 s5, 32, s0
|
||||
; VI-NEXT: s_or_b32 s7, s8, s7
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: s_lshl_b32 s1, s4, s0
|
||||
; VI-NEXT: s_lshr_b32 s5, s6, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_or_b32 s1, s1, s5
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_sub_i32 s7, 32, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
@ -211,22 +202,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_sub_i32 s9, 32, s1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: s_lshl_b32 s8, s5, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s7, s7, s9
|
||||
; GFX9-NEXT: s_sub_i32 s5, 32, s0
|
||||
; GFX9-NEXT: s_or_b32 s7, s8, s7
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s4, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s6, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_or_b32 s1, s1, s5
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_sub_i32 s7, 32, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
@ -236,29 +225,25 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
;
|
||||
; R600-LABEL: fshl_v2i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
|
||||
; R600-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHR T0.Z, KC0[3].Z, PV.W,
|
||||
; R600-NEXT: LSHL T2.W, KC0[3].X, T0.W, BS:VEC_021/SCL_122
|
||||
; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHR T0.Y, KC0[3].Y, PS,
|
||||
; R600-NEXT: LSHL T1.Z, KC0[2].W, T1.W,
|
||||
; R600-NEXT: OR_INT T2.W, PV.W, PV.Z,
|
||||
; R600-NEXT: SUB_INT T1.W, literal.x, PV.W,
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
|
||||
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
|
||||
; R600-NEXT: SUB_INT T0.Z, literal.x, PS,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].X,
|
||||
; R600-NEXT: OR_INT T0.W, PV.Z, PV.Y,
|
||||
; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[2].W,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].X,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].W, KC0[3].Y, PV.Z,
|
||||
; R600-NEXT: SETE_INT * T1.W, T2.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[2].W,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
|
||||
@ -341,40 +326,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s8
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_sub_i32 s17, 32, s3
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_lshl_b32 s16, s11, s3
|
||||
; SI-NEXT: s_lshr_b32 s15, s15, s17
|
||||
; SI-NEXT: s_sub_i32 s11, 32, s2
|
||||
; SI-NEXT: s_or_b32 s15, s16, s15
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; SI-NEXT: s_lshl_b32 s3, s10, s2
|
||||
; SI-NEXT: s_lshr_b32 s11, s14, s11
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: s_or_b32 s3, s3, s11
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_sub_i32 s16, 32, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s16
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; SI-NEXT: s_lshl_b32 s2, s9, s1
|
||||
; SI-NEXT: s_lshr_b32 s3, s13, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: s_or_b32 s2, s2, s3
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: s_lshl_b32 s1, s8, s0
|
||||
; SI-NEXT: s_lshr_b32 s2, s12, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: s_or_b32 s1, s1, s2
|
||||
; SI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s8
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
@ -387,40 +368,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_sub_i32 s15, 32, s3
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_lshl_b32 s14, s7, s3
|
||||
; VI-NEXT: s_lshr_b32 s11, s11, s15
|
||||
; VI-NEXT: s_sub_i32 s7, 32, s2
|
||||
; VI-NEXT: s_or_b32 s11, s14, s11
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; VI-NEXT: s_lshl_b32 s3, s6, s2
|
||||
; VI-NEXT: s_lshr_b32 s7, s10, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; VI-NEXT: s_or_b32 s3, s3, s7
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_sub_i32 s11, 32, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; VI-NEXT: s_lshl_b32 s2, s5, s1
|
||||
; VI-NEXT: s_lshr_b32 s3, s9, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; VI-NEXT: s_or_b32 s2, s2, s3
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: s_sub_i32 s2, 32, s1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_sub_i32 s2, 32, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: s_lshl_b32 s1, s4, s0
|
||||
; VI-NEXT: s_lshr_b32 s2, s8, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: s_or_b32 s1, s1, s2
|
||||
; VI-NEXT: s_sub_i32 s1, 32, s0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s12
|
||||
@ -435,40 +412,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX9-NEXT: s_sub_i32 s15, 32, s3
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_lshl_b32 s14, s7, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s11, s11, s15
|
||||
; GFX9-NEXT: s_sub_i32 s7, 32, s2
|
||||
; GFX9-NEXT: s_or_b32 s11, s14, s11
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; GFX9-NEXT: s_lshl_b32 s3, s6, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s7, s10, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX9-NEXT: s_or_b32 s3, s3, s7
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX9-NEXT: s_sub_i32 s11, 32, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s11
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; GFX9-NEXT: s_lshl_b32 s2, s5, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s9, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX9-NEXT: s_or_b32 s2, s2, s3
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: s_sub_i32 s2, 32, s1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_sub_i32 s2, 32, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s4, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s8, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_or_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_sub_i32 s1, 32, s0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s12
|
||||
@ -478,44 +451,37 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
;
|
||||
; R600-LABEL: fshl_v4i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
|
||||
; R600-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
|
||||
; R600-NEXT: AND_INT T0.W, KC0[6].X, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[5].W, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHR T0.Z, KC0[4].Y, PV.W,
|
||||
; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
|
||||
; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
|
||||
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
|
||||
; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
|
||||
; R600-NEXT: LSHR * T2.W, KC0[5].X, PV.W,
|
||||
; R600-NEXT: AND_INT T0.X, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[4].X, KC0[5].X, PV.W,
|
||||
; R600-NEXT: SETE_INT T0.Z, T0.W, 0.0,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.y, T1.W,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: LSHL * T4.W, KC0[4].X, T1.W,
|
||||
; R600-NEXT: OR_INT T0.X, PV.W, T2.W,
|
||||
; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0,
|
||||
; R600-NEXT: LSHR T1.Z, KC0[4].W, T1.Z,
|
||||
; R600-NEXT: LSHL T1.W, KC0[3].W, T3.W, BS:VEC_021/SCL_122
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT T1.Y, literal.x, PV.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, T0.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T1.W, 0.0,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHR T1.X, KC0[4].Z, PS,
|
||||
; R600-NEXT: LSHL T2.Y, KC0[3].Z, T0.Y,
|
||||
; R600-NEXT: OR_INT T1.Z, PV.W, PV.Z,
|
||||
; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[4].X,
|
||||
; R600-NEXT: LSHL T1.Y, KC0[3].Y, T0.W,
|
||||
; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[3].W,
|
||||
; R600-NEXT: OR_INT T1.W, T2.Y, T1.X,
|
||||
; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
|
||||
; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: OR_INT T1.W, PV.Y, T0.Z,
|
||||
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: CNDE_INT * T1.W, T0.Z, T0.Y, KC0[4].X,
|
||||
; R600-NEXT: CNDE_INT T1.Z, T0.W, T1.Z, KC0[3].W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Z, KC0[4].Z, T1.Y,
|
||||
; R600-NEXT: SETE_INT * T2.W, T2.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: SUB_INT * T0.W, literal.x, T0.X,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Y, KC0[4].Y, PV.W,
|
||||
; R600-NEXT: SETE_INT * T2.W, T0.X, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
|
@ -128,24 +128,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_sub_i32 s11, 32, s1
|
||||
; SI-NEXT: s_lshr_b32 s10, s9, s1
|
||||
; SI-NEXT: s_lshl_b32 s3, s3, s11
|
||||
; SI-NEXT: s_or_b32 s3, s3, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s0
|
||||
; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: s_lshr_b32 s1, s8, s0
|
||||
; SI-NEXT: s_lshl_b32 s2, s2, s3
|
||||
; SI-NEXT: s_or_b32 s1, s2, s1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s8
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; SI-NEXT: v_alignbit_b32 v2, s2, v0, v2
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -156,24 +150,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_sub_i32 s9, 32, s1
|
||||
; VI-NEXT: s_lshr_b32 s8, s7, s1
|
||||
; VI-NEXT: s_lshl_b32 s5, s5, s9
|
||||
; VI-NEXT: s_or_b32 s5, s5, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; VI-NEXT: s_sub_i32 s5, 32, s0
|
||||
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: s_lshr_b32 s1, s6, s0
|
||||
; VI-NEXT: s_lshl_b32 s4, s4, s5
|
||||
; VI-NEXT: s_or_b32 s1, s4, s1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: v_alignbit_b32 v2, s4, v0, v2
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
@ -186,24 +174,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_sub_i32 s9, 32, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s8, s7, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s5, s5, s9
|
||||
; GFX9-NEXT: s_or_b32 s5, s5, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_sub_i32 s5, 32, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s6, s0
|
||||
; GFX9-NEXT: s_lshl_b32 s4, s4, s5
|
||||
; GFX9-NEXT: s_or_b32 s1, s4, s1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v2
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
|
||||
@ -211,29 +193,22 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
|
||||
;
|
||||
; R600-LABEL: fshr_v2i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
|
||||
; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHL T0.Z, KC0[3].X, PV.W,
|
||||
; R600-NEXT: LSHR T2.W, KC0[3].Z, T0.W, BS:VEC_021/SCL_122
|
||||
; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHL T0.Y, KC0[2].W, PS,
|
||||
; R600-NEXT: LSHR T1.Z, KC0[3].Y, T1.W,
|
||||
; R600-NEXT: OR_INT T2.W, PV.Z, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: OR_INT T0.W, PV.Y, PV.Z,
|
||||
; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
|
||||
@ -316,42 +291,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s15
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_sub_i32 s17, 32, s3
|
||||
; SI-NEXT: s_lshr_b32 s16, s15, s3
|
||||
; SI-NEXT: s_lshl_b32 s11, s11, s17
|
||||
; SI-NEXT: s_or_b32 s11, s11, s16
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; SI-NEXT: s_sub_i32 s11, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; SI-NEXT: s_lshr_b32 s3, s14, s2
|
||||
; SI-NEXT: s_lshl_b32 s10, s10, s11
|
||||
; SI-NEXT: s_or_b32 s3, s10, s3
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s1
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; SI-NEXT: s_lshr_b32 s2, s13, s1
|
||||
; SI-NEXT: s_lshl_b32 s3, s9, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s14
|
||||
; SI-NEXT: s_or_b32 s2, s3, s2
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SI-NEXT: s_and_b32 s0, s0, 31
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s0
|
||||
; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; SI-NEXT: s_lshr_b32 s1, s12, s0
|
||||
; SI-NEXT: s_lshl_b32 s2, s8, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SI-NEXT: s_or_b32 s1, s2, s1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s12
|
||||
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; SI-NEXT: v_alignbit_b32 v4, s8, v0, v4
|
||||
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -362,42 +325,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_sub_i32 s15, 32, s3
|
||||
; VI-NEXT: s_lshr_b32 s14, s11, s3
|
||||
; VI-NEXT: s_lshl_b32 s7, s7, s15
|
||||
; VI-NEXT: s_or_b32 s7, s7, s14
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_sub_i32 s7, 32, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; VI-NEXT: s_lshr_b32 s3, s10, s2
|
||||
; VI-NEXT: s_lshl_b32 s6, s6, s7
|
||||
; VI-NEXT: s_or_b32 s3, s6, s3
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: s_sub_i32 s3, 32, s1
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; VI-NEXT: s_lshr_b32 s2, s9, s1
|
||||
; VI-NEXT: s_lshl_b32 s3, s5, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; VI-NEXT: s_or_b32 s2, s3, s2
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_sub_i32 s2, 32, s0
|
||||
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; VI-NEXT: s_lshr_b32 s1, s8, s0
|
||||
; VI-NEXT: s_lshl_b32 s2, s4, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VI-NEXT: s_or_b32 s1, s2, s1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s8
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4
|
||||
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s12
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s13
|
||||
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
@ -410,42 +361,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX9-NEXT: s_sub_i32 s15, 32, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s14, s11, s3
|
||||
; GFX9-NEXT: s_lshl_b32 s7, s7, s15
|
||||
; GFX9-NEXT: s_or_b32 s7, s7, s14
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_sub_i32 s7, 32, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s10, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s6, s6, s7
|
||||
; GFX9-NEXT: s_or_b32 s3, s6, s3
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_sub_i32 s3, 32, s1
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s9, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s3, s5, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GFX9-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_sub_i32 s2, 32, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s8, s0
|
||||
; GFX9-NEXT: s_lshl_b32 s2, s4, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX9-NEXT: s_or_b32 s1, s2, s1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
|
||||
@ -453,44 +392,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
|
||||
;
|
||||
; R600-LABEL: fshr_v4i32:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
|
||||
; R600-NEXT: ALU 20, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: AND_INT T0.W, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHL T0.Z, KC0[3].Y, PV.W,
|
||||
; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
|
||||
; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
|
||||
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
|
||||
; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
|
||||
; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
|
||||
; R600-NEXT: LSHL * T2.W, KC0[4].X, PV.W,
|
||||
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
|
||||
; R600-NEXT: LSHR * T4.W, KC0[5].X, T1.W,
|
||||
; R600-NEXT: OR_INT T0.X, T2.W, PV.W,
|
||||
; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0, BS:VEC_120/SCL_212
|
||||
; R600-NEXT: LSHL T1.Z, KC0[3].W, T1.Z,
|
||||
; R600-NEXT: LSHR T1.W, KC0[4].W, T3.W, BS:VEC_021/SCL_122
|
||||
; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
|
||||
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: LSHL T1.X, KC0[3].Z, PS,
|
||||
; R600-NEXT: LSHR T2.Y, KC0[4].Z, T0.Y,
|
||||
; R600-NEXT: OR_INT T1.Z, PV.Z, PV.W,
|
||||
; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[5].X,
|
||||
; R600-NEXT: LSHR T1.Y, KC0[4].Y, T0.W,
|
||||
; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[4].W,
|
||||
; R600-NEXT: OR_INT T1.W, T1.X, T2.Y,
|
||||
; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
|
||||
; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[4].Z,
|
||||
; R600-NEXT: OR_INT T1.W, T0.Z, PV.Y,
|
||||
; R600-NEXT: SETE_INT T0.Z, PS, 0.0,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
|
||||
; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: SETE_INT T1.Z, PV.W, 0.0,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
|
||||
; R600-NEXT: CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
|
||||
; R600-NEXT: CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[4].Y,
|
||||
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
|
||||
; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x,
|
||||
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
|
||||
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
|
||||
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
|
||||
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
|
@ -8,7 +8,6 @@ define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movq %rdx, %rcx
|
||||
; CHECK-NEXT: movq %rdi, %rax
|
||||
; CHECK-NEXT: andl $63, %ecx
|
||||
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; CHECK-NEXT: shldq %cl, %rsi, %rax
|
||||
; CHECK-NEXT: retq
|
||||
@ -25,7 +24,6 @@ define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movq %rdx, %rcx
|
||||
; CHECK-NEXT: movq %rsi, %rax
|
||||
; CHECK-NEXT: andl $63, %ecx
|
||||
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
|
||||
; CHECK-NEXT: shrdq %cl, %rdi, %rax
|
||||
; CHECK-NEXT: retq
|
||||
|
@ -290,11 +290,9 @@ define i64 @test10(i64 %val, i32 %bits) nounwind {
|
||||
define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
|
||||
; X86-LABEL: test11:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: andl $31, %ecx
|
||||
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
|
||||
; X86-NEXT: shldl %cl, %edx, %eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
@ -302,7 +300,6 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movl %edx, %ecx
|
||||
; X64-NEXT: movl %edi, %eax
|
||||
; X64-NEXT: andl $31, %ecx
|
||||
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
|
||||
; X64-NEXT: shldl %cl, %esi, %eax
|
||||
; X64-NEXT: retq
|
||||
@ -317,11 +314,9 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
|
||||
define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
|
||||
; X86-LABEL: test12:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: andl $31, %ecx
|
||||
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
|
||||
; X86-NEXT: shrdl %cl, %edx, %eax
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
@ -329,7 +324,6 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movl %edx, %ecx
|
||||
; X64-NEXT: movl %esi, %eax
|
||||
; X64-NEXT: andl $31, %ecx
|
||||
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
|
||||
; X64-NEXT: shrdl %cl, %edi, %eax
|
||||
; X64-NEXT: retq
|
||||
|
Loading…
x
Reference in New Issue
Block a user