
[DAG] MatchRotate - Add funnel shift by variable support

Follow-up to D75114: this patch reuses the existing MatchRotate ROTL/ROTR rotation pattern code to also recognize the more general FSHL/FSHR funnel shift patterns when we have variable shift amounts. These are matched with MatchFunnelPosNeg, which acts in an (almost) equivalent manner to MatchRotatePosNeg.
Simon Pilgrim 2020-03-15 11:49:30 +00:00
parent 51df7978a6
commit 1f827bf828
5 changed files with 315 additions and 389 deletions
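
To illustrate the kind of DAG this now catches, here is a minimal IR sketch of the variable-amount funnel-shift idiom (function and value names are hypothetical, not taken from the patch's tests):

define i32 @fshl_idiom(i32 %x0, i32 %x1, i32 %y) {
  %amt = and i32 %y, 31      ; variable amount, masked to [0, 31]
  %inv = sub i32 32, %amt    ; complementary amount (32 - y)
  %hi = shl i32 %x0, %amt
  %lo = lshr i32 %x1, %inv   ; out of range when %amt == 0; the DAG fold
  %r = or i32 %hi, %lo       ; stays sound since fshl by 0 returns %x0
  ret i32 %r
}

After the combine, the whole sub/shl/lshr/or chain becomes a single ISD::FSHL (or an ISD::FSHR with the negated amount, if only that opcode is legal).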


@@ -555,6 +555,10 @@ namespace {
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
@@ -6319,6 +6323,33 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
return SDValue();
}
// A subroutine of MatchRotate used once we have found an OR of two opposite
// shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
// former being preferred if supported. InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
// TODO: Merge with MatchRotatePosNeg.
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
// fold (or (shl x0, (*ext y)),
// (srl x1, (*ext (sub 32, y)))) ->
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
//
// fold (or (shl x0, (*ext (sub 32, y))),
// (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
EVT VT = N0.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
HasPos ? Pos : Neg);
}
return SDValue();
}
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
@@ -6444,10 +6475,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
return Res;
}
// TODO: Handle variable funnel shifts.
if (!IsRotate)
return SDValue();
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
@@ -6468,13 +6495,29 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
RExtOp0 = RHSShiftAmt.getOperand(0);
}
SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
if (IsRotate && (HasROTL || HasROTR)) {
SDValue TryL =
MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
RExtOp0, ISD::ROTL, ISD::ROTR, DL);
if (TryL)
return TryL;
SDValue TryR =
MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
LExtOp0, ISD::ROTR, ISD::ROTL, DL);
if (TryR)
return TryR;
}
SDValue TryL =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
if (TryL)
return TryL;
SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
SDValue TryR =
MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
if (TryR)
return TryR;
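
MatchFunnelPosNeg is tried in both directions, so the mirrored idiom (left shift by the complementary amount, right shift by the variable amount) folds to ISD::FSHR instead; matchRotateSub is what verifies that the two amounts really are y and 32 - y. A sketch of that direction, with the same hypothetical names:

define i32 @fshr_idiom(i32 %x0, i32 %x1, i32 %y) {
  %amt = and i32 %y, 31
  %inv = sub i32 32, %amt
  %hi = shl i32 %x0, %inv    ; shifted by (32 - y)
  %lo = lshr i32 %x1, %amt   ; shifted by y
  %r = or i32 %hi, %lo       ; == fshr(%x0, %x1, %y) for nonzero %amt
  ret i32 %r
}

As in MatchRotatePosNeg, whichever of the two opcodes the target reports legal or custom is the one emitted, with Pos or Neg used as the amount operand accordingly.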


@@ -17,11 +17,10 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 31
; SI-NEXT: s_sub_i32 s8, 32, s2
; SI-NEXT: s_lshl_b32 s3, s0, s2
; SI-NEXT: s_lshr_b32 s1, s1, s8
; SI-NEXT: s_or_b32 s1, s3, s1
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_sub_i32 s1, 32, s2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -34,14 +33,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 31
; VI-NEXT: s_sub_i32 s3, 32, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshl_b32 s0, s0, s2
; VI-NEXT: s_lshr_b32 s1, s1, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: s_sub_i32 s1, 32, s2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -53,14 +51,13 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s2, 31
; GFX9-NEXT: s_sub_i32 s3, 32, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_sub_i32 s1, 32, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
@@ -68,7 +65,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
;
; R600-LABEL: fshl_i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
@@ -77,9 +74,7 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T1.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHR T1.W, KC0[2].W, PV.W,
; R600-NEXT: LSHL * T2.W, KC0[2].Z, T0.W,
; R600-NEXT: OR_INT * T1.W, PS, PV.W,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[2].Z, KC0[2].W, PV.W,
; R600-NEXT: CNDE_INT T0.X, T0.W, KC0[2].Z, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
@@ -153,22 +148,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: s_sub_i32 s11, 32, s1
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: s_lshl_b32 s10, s3, s1
; SI-NEXT: s_lshr_b32 s9, s9, s11
; SI-NEXT: s_sub_i32 s3, 32, s0
; SI-NEXT: s_or_b32 s9, s10, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT: s_lshl_b32 s1, s2, s0
; SI-NEXT: s_lshr_b32 s3, s8, s3
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_or_b32 s1, s1, s3
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: s_sub_i32 s10, 32, s1
; SI-NEXT: v_mov_b32_e32 v1, s10
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_alignbit_b32 v0, s3, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_sub_i32 s1, 32, s0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -181,22 +174,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: s_sub_i32 s9, 32, s1
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: s_lshl_b32 s8, s5, s1
; VI-NEXT: s_lshr_b32 s7, s7, s9
; VI-NEXT: s_sub_i32 s5, 32, s0
; VI-NEXT: s_or_b32 s7, s8, s7
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT: s_lshl_b32 s1, s4, s0
; VI-NEXT: s_lshr_b32 s5, s6, s5
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_or_b32 s1, s1, s5
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: s_sub_i32 s7, 32, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_sub_i32 s1, 32, s0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -211,22 +202,20 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: s_sub_i32 s9, 32, s1
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: s_lshl_b32 s8, s5, s1
; GFX9-NEXT: s_lshr_b32 s7, s7, s9
; GFX9-NEXT: s_sub_i32 s5, 32, s0
; GFX9-NEXT: s_or_b32 s7, s8, s7
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT: s_lshl_b32 s1, s4, s0
; GFX9-NEXT: s_lshr_b32 s5, s6, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_or_b32 s1, s1, s5
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: s_sub_i32 s7, 32, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_sub_i32 s1, 32, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -236,29 +225,25 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshl_v2i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; R600-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHR T0.Z, KC0[3].Z, PV.W,
; R600-NEXT: LSHL T2.W, KC0[3].X, T0.W, BS:VEC_021/SCL_122
; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHR T0.Y, KC0[3].Y, PS,
; R600-NEXT: LSHL T1.Z, KC0[2].W, T1.W,
; R600-NEXT: OR_INT T2.W, PV.W, PV.Z,
; R600-NEXT: SUB_INT T1.W, literal.x, PV.W,
; R600-NEXT: AND_INT * T2.W, KC0[3].W, literal.y,
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
; R600-NEXT: SUB_INT T0.Z, literal.x, PS,
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].X,
; R600-NEXT: OR_INT T0.W, PV.Z, PV.Y,
; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[2].W,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].X,
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].W, KC0[3].Y, PV.Z,
; R600-NEXT: SETE_INT * T1.W, T2.W, 0.0,
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[2].W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -341,40 +326,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: s_and_b32 s3, s3, 31
; SI-NEXT: s_sub_i32 s17, 32, s3
; SI-NEXT: s_and_b32 s2, s2, 31
; SI-NEXT: s_lshl_b32 s16, s11, s3
; SI-NEXT: s_lshr_b32 s15, s15, s17
; SI-NEXT: s_sub_i32 s11, 32, s2
; SI-NEXT: s_or_b32 s15, s16, s15
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT: s_lshl_b32 s3, s10, s2
; SI-NEXT: s_lshr_b32 s11, s14, s11
; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: s_or_b32 s3, s3, s11
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: s_and_b32 s3, s3, 31
; SI-NEXT: s_sub_i32 s16, 32, s3
; SI-NEXT: v_mov_b32_e32 v1, s16
; SI-NEXT: s_and_b32 s2, s2, 31
; SI-NEXT: v_alignbit_b32 v0, s11, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: s_sub_i32 s3, 32, s2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_sub_i32 s3, 32, s1
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: v_alignbit_b32 v0, s10, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; SI-NEXT: s_lshl_b32 s2, s9, s1
; SI-NEXT: s_lshr_b32 s3, s13, s3
; SI-NEXT: v_mov_b32_e32 v1, s10
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: s_sub_i32 s2, 32, s1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_sub_i32 s2, 32, s0
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_alignbit_b32 v0, s9, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT: s_lshl_b32 s1, s8, s0
; SI-NEXT: s_lshr_b32 s2, s12, s2
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_or_b32 s1, s1, s2
; SI-NEXT: s_sub_i32 s1, 32, s0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -387,40 +368,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_and_b32 s3, s3, 31
; VI-NEXT: s_sub_i32 s15, 32, s3
; VI-NEXT: s_and_b32 s2, s2, 31
; VI-NEXT: s_lshl_b32 s14, s7, s3
; VI-NEXT: s_lshr_b32 s11, s11, s15
; VI-NEXT: s_sub_i32 s7, 32, s2
; VI-NEXT: s_or_b32 s11, s14, s11
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; VI-NEXT: s_lshl_b32 s3, s6, s2
; VI-NEXT: s_lshr_b32 s7, s10, s7
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_or_b32 s3, s3, s7
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: s_and_b32 s3, s3, 31
; VI-NEXT: s_sub_i32 s11, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: s_and_b32 s2, s2, 31
; VI-NEXT: v_alignbit_b32 v0, s7, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_sub_i32 s3, 32, s2
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_sub_i32 s3, 32, s1
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: v_alignbit_b32 v0, s6, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; VI-NEXT: s_lshl_b32 s2, s5, s1
; VI-NEXT: s_lshr_b32 s3, s9, s3
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: s_sub_i32 s2, 32, s1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_sub_i32 s2, 32, s0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_alignbit_b32 v0, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT: s_lshl_b32 s1, s4, s0
; VI-NEXT: s_lshr_b32 s2, s8, s2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_or_b32 s1, s1, s2
; VI-NEXT: s_sub_i32 s1, 32, s0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
@@ -435,40 +412,36 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_and_b32 s3, s3, 31
; GFX9-NEXT: s_sub_i32 s15, 32, s3
; GFX9-NEXT: s_and_b32 s2, s2, 31
; GFX9-NEXT: s_lshl_b32 s14, s7, s3
; GFX9-NEXT: s_lshr_b32 s11, s11, s15
; GFX9-NEXT: s_sub_i32 s7, 32, s2
; GFX9-NEXT: s_or_b32 s11, s14, s11
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; GFX9-NEXT: s_lshl_b32 s3, s6, s2
; GFX9-NEXT: s_lshr_b32 s7, s10, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_or_b32 s3, s3, s7
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: s_and_b32 s3, s3, 31
; GFX9-NEXT: s_sub_i32 s11, 32, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: s_and_b32 s2, s2, 31
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_sub_i32 s3, 32, s2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s3, 32, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; GFX9-NEXT: s_lshl_b32 s2, s5, s1
; GFX9-NEXT: s_lshr_b32 s3, s9, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: s_sub_i32 s2, 32, s1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_sub_i32 s2, 32, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT: s_lshl_b32 s1, s4, s0
; GFX9-NEXT: s_lshr_b32 s2, s8, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_or_b32 s1, s1, s2
; GFX9-NEXT: s_sub_i32 s1, 32, s0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
@@ -478,44 +451,37 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshl_v4i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; R600-NEXT: ALU 27, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
; R600-NEXT: AND_INT T0.W, KC0[6].X, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[5].W, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHR T0.Z, KC0[4].Y, PV.W,
; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
; R600-NEXT: LSHR * T2.W, KC0[5].X, PV.W,
; R600-NEXT: AND_INT T0.X, KC0[5].Y, literal.x,
; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[4].X, KC0[5].X, PV.W,
; R600-NEXT: SETE_INT T0.Z, T0.W, 0.0,
; R600-NEXT: SUB_INT * T0.W, literal.y, T1.W,
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
; R600-NEXT: LSHL * T4.W, KC0[4].X, T1.W,
; R600-NEXT: OR_INT T0.X, PV.W, T2.W,
; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0,
; R600-NEXT: LSHR T1.Z, KC0[4].W, T1.Z,
; R600-NEXT: LSHL T1.W, KC0[3].W, T3.W, BS:VEC_021/SCL_122
; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
; R600-NEXT: AND_INT * T2.W, KC0[5].Z, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT T1.Y, literal.x, PV.W,
; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, T0.W,
; R600-NEXT: SETE_INT * T0.W, T1.W, 0.0,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHR T1.X, KC0[4].Z, PS,
; R600-NEXT: LSHL T2.Y, KC0[3].Z, T0.Y,
; R600-NEXT: OR_INT T1.Z, PV.W, PV.Z,
; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[4].X,
; R600-NEXT: LSHL T1.Y, KC0[3].Y, T0.W,
; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[3].W,
; R600-NEXT: OR_INT T1.W, T2.Y, T1.X,
; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT: OR_INT T1.W, PV.Y, T0.Z,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[3].Y,
; R600-NEXT: CNDE_INT * T1.W, T0.Z, T0.Y, KC0[4].X,
; R600-NEXT: CNDE_INT T1.Z, T0.W, T1.Z, KC0[3].W,
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Z, KC0[4].Z, T1.Y,
; R600-NEXT: SETE_INT * T2.W, T2.W, 0.0,
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT: SETE_INT * T2.W, T0.X, 0.0,
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
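
The AMDGPU improvements in the fshl tests above, and in the fshr tests that follow, come from the target having a native 32-bit funnel shift: v_alignbit_b32 (BIT_ALIGN_INT on R600) produces the low 32 bits of a 64-bit concatenation shifted right by the amount, so the old lshl/lshr/or triple collapses into one instruction plus the pre-existing select that guards the zero-amount case. A small equivalence sketch (the intrinsic is real; the function is illustrative):

declare i32 @llvm.fshr.i32(i32, i32, i32)

; v_alignbit_b32 d, a, b, c yields the low 32 bits of ((a:b) >> (c & 31)),
; which matches what the i32 fshr intrinsic computes:
define i32 @alignbit_equiv(i32 %a, i32 %b, i32 %c) {
  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
  ret i32 %r
}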


@@ -128,24 +128,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: s_sub_i32 s11, 32, s1
; SI-NEXT: s_lshr_b32 s10, s9, s1
; SI-NEXT: s_lshl_b32 s3, s3, s11
; SI-NEXT: s_or_b32 s3, s3, s10
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_sub_i32 s3, 32, s0
; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT: s_lshr_b32 s1, s8, s0
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_or_b32 s1, s2, s1
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v2, s8
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_alignbit_b32 v2, s2, v0, v2
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -156,24 +150,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: s_sub_i32 s9, 32, s1
; VI-NEXT: s_lshr_b32 s8, s7, s1
; VI-NEXT: s_lshl_b32 s5, s5, s9
; VI-NEXT: s_or_b32 s5, s5, s8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_sub_i32 s5, 32, s0
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT: s_lshr_b32 s1, s6, s0
; VI-NEXT: s_lshl_b32 s4, s4, s5
; VI-NEXT: s_or_b32 s1, s4, s1
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_alignbit_b32 v2, s4, v0, v2
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -186,24 +174,18 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: s_sub_i32 s9, 32, s1
; GFX9-NEXT: s_lshr_b32 s8, s7, s1
; GFX9-NEXT: s_lshl_b32 s5, s5, s9
; GFX9-NEXT: s_or_b32 s5, s5, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: s_sub_i32 s5, 32, s0
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT: s_lshr_b32 s1, s6, s0
; GFX9-NEXT: s_lshl_b32 s4, s4, s5
; GFX9-NEXT: s_or_b32 s1, s4, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
@@ -211,29 +193,22 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
;
; R600-LABEL: fshr_v2i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 18, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; R600-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: AND_INT T0.W, KC0[4].X, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[3].W, literal.x,
; R600-NEXT: AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T0.Z, KC0[3].X, PV.W,
; R600-NEXT: LSHR T2.W, KC0[3].Z, T0.W, BS:VEC_021/SCL_122
; R600-NEXT: SUB_INT * T3.W, literal.x, T1.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T0.Y, KC0[2].W, PS,
; R600-NEXT: LSHR T1.Z, KC0[3].Y, T1.W,
; R600-NEXT: OR_INT T2.W, PV.Z, PV.W,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT: OR_INT T0.W, PV.Y, PV.Z,
; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[3].Y,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT: CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
@@ -316,42 +291,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: s_and_b32 s3, s3, 31
; SI-NEXT: s_sub_i32 s17, 32, s3
; SI-NEXT: s_lshr_b32 s16, s15, s3
; SI-NEXT: s_lshl_b32 s11, s11, s17
; SI-NEXT: s_or_b32 s11, s11, s16
; SI-NEXT: s_and_b32 s2, s2, 31
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: s_sub_i32 s11, 32, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_alignbit_b32 v1, s11, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT: s_lshr_b32 s3, s14, s2
; SI-NEXT: s_lshl_b32 s10, s10, s11
; SI-NEXT: s_or_b32 s3, s10, s3
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_sub_i32 s3, 32, s1
; SI-NEXT: s_and_b32 s2, s2, 31
; SI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s14
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_alignbit_b32 v1, s10, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; SI-NEXT: s_lshr_b32 s2, s13, s1
; SI-NEXT: s_lshl_b32 s3, s9, s3
; SI-NEXT: v_mov_b32_e32 v1, s14
; SI-NEXT: s_or_b32 s2, s3, s2
; SI-NEXT: s_and_b32 s1, s1, 31
; SI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_and_b32 s0, s0, 31
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_sub_i32 s2, 32, s0
; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT: s_lshr_b32 s1, s12, s0
; SI-NEXT: s_lshl_b32 s2, s8, s2
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: s_or_b32 s1, s2, s1
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_alignbit_b32 v4, s8, v0, v4
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -362,42 +325,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_and_b32 s3, s3, 31
; VI-NEXT: s_sub_i32 s15, 32, s3
; VI-NEXT: s_lshr_b32 s14, s11, s3
; VI-NEXT: s_lshl_b32 s7, s7, s15
; VI-NEXT: s_or_b32 s7, s7, s14
; VI-NEXT: s_and_b32 s2, s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_sub_i32 s7, 32, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; VI-NEXT: s_lshr_b32 s3, s10, s2
; VI-NEXT: s_lshl_b32 s6, s6, s7
; VI-NEXT: s_or_b32 s3, s6, s3
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_sub_i32 s3, 32, s1
; VI-NEXT: s_and_b32 s2, s2, 31
; VI-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_alignbit_b32 v1, s6, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; VI-NEXT: s_lshr_b32 s2, s9, s1
; VI-NEXT: s_lshl_b32 s3, s5, s3
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_and_b32 s1, s1, 31
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s0, 31
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_sub_i32 s2, 32, s0
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT: s_lshr_b32 s1, s8, s0
; VI-NEXT: s_lshl_b32 s2, s4, s2
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_or_b32 s1, s2, s1
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_alignbit_b32 v4, s4, v0, v4
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
@@ -410,42 +361,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_and_b32 s3, s3, 31
; GFX9-NEXT: s_sub_i32 s15, 32, s3
; GFX9-NEXT: s_lshr_b32 s14, s11, s3
; GFX9-NEXT: s_lshl_b32 s7, s7, s15
; GFX9-NEXT: s_or_b32 s7, s7, s14
; GFX9-NEXT: s_and_b32 s2, s2, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_sub_i32 s7, 32, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
; GFX9-NEXT: s_lshr_b32 s3, s10, s2
; GFX9-NEXT: s_lshl_b32 s6, s6, s7
; GFX9-NEXT: s_or_b32 s3, s6, s3
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s3, 32, s1
; GFX9-NEXT: s_and_b32 s2, s2, 31
; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v1, s6, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
; GFX9-NEXT: s_lshr_b32 s2, s9, s1
; GFX9-NEXT: s_lshl_b32 s3, s5, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: s_or_b32 s2, s3, s2
; GFX9-NEXT: s_and_b32 s1, s1, 31
; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_and_b32 s0, s0, 31
; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_sub_i32 s2, 32, s0
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT: s_lshr_b32 s1, s8, s0
; GFX9-NEXT: s_lshl_b32 s2, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_or_b32 s1, s2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s8
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_alignbit_b32 v4, s4, v0, v4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
@@ -453,44 +392,30 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; R600-LABEL: fshr_v4i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 34, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; R600-NEXT: ALU 20, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: AND_INT T0.W, KC0[5].Y, literal.x,
; R600-NEXT: AND_INT T0.W, KC0[5].Z, literal.x,
; R600-NEXT: AND_INT * T1.W, KC0[6].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SUB_INT * T2.W, literal.x, PV.W,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T0.Z, KC0[3].Y, PV.W,
; R600-NEXT: SUB_INT T2.W, literal.x, T1.W,
; R600-NEXT: AND_INT * T3.W, KC0[5].W, literal.y,
; R600-NEXT: 32(4.484155e-44), 31(4.344025e-44)
; R600-NEXT: AND_INT T0.Y, KC0[5].Z, literal.x,
; R600-NEXT: SUB_INT T1.Z, literal.y, PS,
; R600-NEXT: LSHL * T2.W, KC0[4].X, PV.W,
; R600-NEXT: 31(4.344025e-44), 32(4.484155e-44)
; R600-NEXT: LSHR * T4.W, KC0[5].X, T1.W,
; R600-NEXT: OR_INT T0.X, T2.W, PV.W,
; R600-NEXT: SETE_INT T1.Y, T1.W, 0.0, BS:VEC_120/SCL_212
; R600-NEXT: LSHL T1.Z, KC0[3].W, T1.Z,
; R600-NEXT: LSHR T1.W, KC0[4].W, T3.W, BS:VEC_021/SCL_122
; R600-NEXT: SUB_INT * T2.W, literal.x, T0.Y,
; R600-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT: LSHL T1.X, KC0[3].Z, PS,
; R600-NEXT: LSHR T2.Y, KC0[4].Z, T0.Y,
; R600-NEXT: OR_INT T1.Z, PV.Z, PV.W,
; R600-NEXT: SETE_INT * T1.W, T3.W, 0.0,
; R600-NEXT: CNDE_INT * T2.W, T1.Y, T0.X, KC0[5].X,
; R600-NEXT: LSHR T1.Y, KC0[4].Y, T0.W,
; R600-NEXT: CNDE_INT T2.Z, T1.W, T1.Z, KC0[4].W,
; R600-NEXT: OR_INT T1.W, T1.X, T2.Y,
; R600-NEXT: SETE_INT * T3.W, T0.Y, 0.0,
; R600-NEXT: CNDE_INT T2.Y, PS, PV.W, KC0[4].Z,
; R600-NEXT: OR_INT T1.W, T0.Z, PV.Y,
; R600-NEXT: SETE_INT T0.Z, PS, 0.0,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
; R600-NEXT: AND_INT * T2.W, KC0[5].W, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: SETE_INT T1.Z, PV.W, 0.0,
; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
; R600-NEXT: CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
; R600-NEXT: CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
; R600-NEXT: SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT: CNDE_INT T2.X, PS, PV.W, KC0[4].Y,
; R600-NEXT: CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
; R600-NEXT: AND_INT * T0.W, KC0[5].Y, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT: SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT: CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:


@@ -8,7 +8,6 @@ define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: andl $63, %ecx
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shldq %cl, %rsi, %rax
; CHECK-NEXT: retq
@@ -25,7 +24,6 @@ define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: andl $63, %ecx
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shrdq %cl, %rdi, %rax
; CHECK-NEXT: retq
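
The x86 win here is the dropped andl: once the shl/lshr/or idiom is matched as a funnel shift, explicitly masking the shift amount is redundant, since both the FSHL/FSHR nodes and the SHLD/SHRD instructions already take the amount modulo the operand width. The 32-bit shldl/shrdl tests in the next file lose their andl $31 for the same reason. A sketch of the kind of input these tests cover (hypothetical names, not the exact test bodies):

define i64 @shld_idiom(i64 %hi, i64 %lo, i64 %bits) nounwind {
  %amt = and i64 %bits, 63    ; mask no longer survives into the asm
  %inv = sub i64 64, %amt
  %h = shl i64 %hi, %amt
  %l = lshr i64 %lo, %inv
  %r = or i64 %h, %l          ; selected as a single shldq %cl shift
  ret i64 %r
}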


@@ -290,11 +290,9 @@ define i64 @test10(i64 %val, i32 %bits) nounwind {
define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X86-LABEL: test11:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andl $31, %ecx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: retl
;
@@ -302,7 +300,6 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %edx, %ecx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $31, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shldl %cl, %esi, %eax
; X64-NEXT: retq
@@ -317,11 +314,9 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X86-LABEL: test12:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andl $31, %ecx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shrdl %cl, %edx, %eax
; X86-NEXT: retl
;
@@ -329,7 +324,6 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %edx, %ecx
; X64-NEXT: movl %esi, %eax
; X64-NEXT: andl $31, %ecx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrdl %cl, %edi, %eax
; X64-NEXT: retq