
[TargetLowering] Improve expansion of FSHL/FSHR

Use an extra shift-by-1 instead of a compare and select to handle the
shift-by-zero case. This sometimes saves one instruction (if the compare
couldn't be combined with a previous instruction). It also works better
on targets that don't have good select instructions.

Note that currently this change doesn't affect most targets, because
expandFunnelShift is not used: funnel shift intrinsics are lowered
early in SelectionDAGBuilder. But there is work afoot to change that;
see D77152.

Differential Revision: https://reviews.llvm.org/D77301
Author: Jay Foad
Date:   2020-04-02 11:36:01 +01:00
parent d9c7a01b62
commit 68e4349111
3 changed files with 183 additions and 271 deletions
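
The difference is easiest to see in scalar form. The following standalone C++ sketch (not part of the patch; the function names are illustrative) contrasts the old select-based expansion with the new shift-by-1 expansion for a 32-bit fshl:

#include <cassert>
#include <cstdint>

// Old expansion: the inverse shift amount can be BW (here 32) when
// Z % BW == 0, which is out of range, so a compare and select is needed.
static uint32_t fshl_select(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t ShAmt = Z & 31; // Z % BW for a power-of-two bit width
  if (ShAmt == 0)
    return X; // 0-shift returns the first argument
  return (X << ShAmt) | (Y >> (32 - ShAmt));
}

// New expansion: shifting Y right by 1 first keeps both shift amounts in
// [0, 31], so no compare or select is needed.
static uint32_t fshl_shift1(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t ShAmt = Z & 31;        // Z % BW
  uint32_t InvShAmt = 31 - ShAmt; // (BW - 1) - (Z % BW)
  return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
}

int main() {
  for (uint32_t Z = 0; Z < 64; ++Z)
    assert(fshl_select(0x12345678u, 0x9abcdef0u, Z) ==
           fshl_shift1(0x12345678u, 0x9abcdef0u, Z));
  return 0;
}

When ShAmt is 0, (Y >> 1) >> 31 is 0 and the result is just X, so the shift-by-zero case falls out of the arithmetic rather than needing a branch or cmov.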

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

@@ -6046,8 +6046,8 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
     return false;
 
-  // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
-  // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+  // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
   SDValue X = Node->getOperand(0);
   SDValue Y = Node->getOperand(1);
   SDValue Z = Node->getOperand(2);
@@ -6057,30 +6057,29 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
   SDLoc DL(SDValue(Node, 0));
 
   EVT ShVT = Z.getValueType();
-  SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
-  SDValue Zero = DAG.getConstant(0, DL, ShVT);
+  SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
   SDValue ShAmt;
   if (isPowerOf2_32(EltSizeInBits)) {
-    SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
     // Z % BW -> Z & (BW - 1)
     ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
   } else {
+    SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
     ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
   }
-  SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
-  SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
-  SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
-  SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
-  // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
-  // and that is undefined. We must compare and select to avoid UB.
-  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
-  // For fshl, 0-shift returns the 1st arg (X).
-  // For fshr, 0-shift returns the 2nd arg (Y).
-  SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
-  Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+  SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
+  SDValue One = DAG.getConstant(1, DL, ShVT);
+  SDValue ShX, ShY;
+  if (IsFSHL) {
+    ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
+    SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One);
+    ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt);
+  } else {
+    SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One);
+    ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt);
+    ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt);
+  }
+  Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
   return true;
 }
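
For non-power-of-two element widths the same trick applies, except that Z % BW is computed with an ISD::UREM and InvShAmt is (BW - 1) - (Z % BW), i.e. Mask - ShAmt. A hypothetical i24 sketch of the fshr direction, emulated in uint32_t (the width and constants are illustrative, not from the patch):

#include <cassert>
#include <cstdint>

// fshr on a 24-bit element: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW),
// masked back to 24 bits. Every shift amount stays strictly below BW.
static uint32_t fshr24(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t BW = 24, Mask24 = (1u << BW) - 1;
  uint32_t ShAmt = Z % BW;              // non-power-of-two: the UREM path
  uint32_t InvShAmt = (BW - 1) - ShAmt; // Mask - ShAmt, in [0, BW - 1]
  return (((X << 1) << InvShAmt) | (Y >> ShAmt)) & Mask24;
}

int main() {
  const uint32_t X = 0x123456, Y = 0xabcdef;
  for (uint32_t Z = 0; Z < 48; ++Z) {
    // Reference: fshr(X, Y, Z) is the low 24 bits of (X:Y) >> (Z % 24).
    uint64_t Concat = ((uint64_t)X << 24) | Y;
    uint32_t Ref = (uint32_t)((Concat >> (Z % 24)) & 0xffffff);
    assert(fshr24(X, Y, Z) == Ref);
  }
  return 0;
}
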

llvm/test/CodeGen/X86/fshl.ll

@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: .LBB1_2:
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-SLOW-NEXT: andb $15, %cl
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: shrl %eax
; X86-SLOW-NEXT: xorb $15, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,17 +89,15 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andb $15, %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: xorb $15, %cl
; X64-SLOW-NEXT: shrl %eax
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
@@ -128,26 +115,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %esi, %edi
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: .LBB2_2:
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: shrl %eax
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -160,17 +136,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: movl %esi, %eax
; X64-SLOW-NEXT: movl %edi, %esi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shll %cl, %esi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: shrl %eax
; X64-SLOW-NEXT: andb $31, %cl
; X64-SLOW-NEXT: xorb $31, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %edi, %eax
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -279,78 +253,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %ebx
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %dh
; X86-SLOW-NEXT: subb %bl, %dh
; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: movb %dh, %dl
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: je .LBB5_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %eax, %ebp
; X86-SLOW-NEXT: movl %ebp, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB5_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: movl %ebp, %eax
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movb %bl, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb $64, %ch
; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: addl %eax, %eax
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: shrl %edi
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: je .LBB5_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %eax, %ebp
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB5_6
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %esi, %ebp
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: .LBB5_3:
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %ch
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: movl %edi, %ecx
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB5_6:
; X86-SLOW-NEXT: movb %dh, %cl
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: testb $32, %dh
; X86-SLOW-NEXT: jne .LBB5_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB5_10
; X86-SLOW-NEXT: jmp .LBB5_11
; X86-SLOW-NEXT: .LBB5_7:
; X86-SLOW-NEXT: movl %esi, %ecx
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB5_11
; X86-SLOW-NEXT: .LBB5_10:
; X86-SLOW-NEXT: orl %esi, %ebp
; X86-SLOW-NEXT: orl %ecx, %edi
; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, %eax
; X86-SLOW-NEXT: .LBB5_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: je .LBB5_8
; X86-SLOW-NEXT: # %bb.7:
; X86-SLOW-NEXT: orl %edi, %ebp
; X86-SLOW-NEXT: orl %ecx, %esi
; X86-SLOW-NEXT: movl %ebp, %edx
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: popl %ebx
@@ -367,17 +324,15 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rdx, %rcx
; X64-SLOW-NEXT: movq %rsi, %rax
; X64-SLOW-NEXT: movq %rdi, %rsi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shlq %cl, %rsi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: shlq %cl, %rdi
; X64-SLOW-NEXT: shrq %rax
; X64-SLOW-NEXT: andb $63, %cl
; X64-SLOW-NEXT: xorb $63, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shrq %cl, %rax
; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rdi, %rax
; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp

llvm/test/CodeGen/X86/fshr.ll

@@ -65,27 +65,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i16:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: andb $15, %dl
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: movb $16, %cl
; X86-SLOW-NEXT: subb %dl, %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB1_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: .LBB1_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-SLOW-NEXT: andb $15, %cl
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: addl %eax, %eax
; X86-SLOW-NEXT: xorb $15, %cl
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i16:
@@ -100,16 +89,16 @@ define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i16:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movzwl %si, %eax
; X64-SLOW-NEXT: andb $15, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %eax
; X64-SLOW-NEXT: movb $16, %cl
; X64-SLOW-NEXT: subb %dl, %cl
; X64-SLOW-NEXT: shll %cl, %edi
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLOW-NEXT: movzwl %si, %edx
; X64-SLOW-NEXT: andb $15, %cl
; X64-SLOW-NEXT: shrl %cl, %edx
; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
; X64-SLOW-NEXT: xorb $15, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shll %cl, %eax
; X64-SLOW-NEXT: orl %edx, %eax
; X64-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; X64-SLOW-NEXT: retq
%tmp = tail call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
@@ -127,26 +116,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X86-SLOW-LABEL: var_shift_i32:
; X86-SLOW: # %bb.0:
; X86-SLOW-NEXT: pushl %edi
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, %edi
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: andb $31, %dl
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb %dl, %dl
; X86-SLOW-NEXT: je .LBB2_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %esi
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: .LBB2_2:
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
; X86-SLOW-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: addl %eax, %eax
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: orl %edx, %eax
; X86-SLOW-NEXT: retl
;
; X64-FAST-LABEL: var_shift_i32:
@@ -159,17 +137,15 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i32:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movl %edi, %eax
; X64-SLOW-NEXT: movl %esi, %edi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrl %cl, %edi
; X64-SLOW-NEXT: andb $31, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLOW-NEXT: shrl %cl, %esi
; X64-SLOW-NEXT: leal (%rdi,%rdi), %eax
; X64-SLOW-NEXT: andb $31, %cl
; X64-SLOW-NEXT: xorb $31, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SLOW-NEXT: shll %cl, %eax
; X64-SLOW-NEXT: orl %edi, %eax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmovel %esi, %eax
; X64-SLOW-NEXT: orl %esi, %eax
; X64-SLOW-NEXT: retq
%tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %tmp
@@ -276,76 +252,61 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: subl $8, %esp
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-SLOW-NEXT: andl $63, %ebx
; X86-SLOW-NEXT: movb $64, %al
; X86-SLOW-NEXT: subb %bl, %al
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movb %al, %ch
; X86-SLOW-NEXT: andb $31, %ch
; X86-SLOW-NEXT: movb $64, %ch
; X86-SLOW-NEXT: subb %bl, %ch
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %esi, %edi
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb %ch, %ch
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB5_2
; X86-SLOW-NEXT: # %bb.1:
; X86-SLOW-NEXT: orl %edi, %edx
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: .LBB5_2:
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edx
; X86-SLOW-NEXT: movb %bl, %ah
; X86-SLOW-NEXT: andb $31, %ah
; X86-SLOW-NEXT: movb %ah, %cl
; X86-SLOW-NEXT: negb %cl
; X86-SLOW-NEXT: movl %ebp, %edi
; X86-SLOW-NEXT: shll %cl, %edi
; X86-SLOW-NEXT: testb %ah, %ah
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SLOW-NEXT: je .LBB5_4
; X86-SLOW-NEXT: # %bb.3:
; X86-SLOW-NEXT: orl %edx, %edi
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %esi, %edx
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: shrl %esi
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: andb $31, %cl
; X86-SLOW-NEXT: xorb $31, %cl
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-SLOW-NEXT: movl %ebx, %ecx
; X86-SLOW-NEXT: leal (%edi,%edi), %ebp
; X86-SLOW-NEXT: shll %cl, %ebp
; X86-SLOW-NEXT: movb %bl, %cl
; X86-SLOW-NEXT: shrl %cl, %edi
; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB5_6
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: jne .LBB5_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: orl %eax, %ebp
; X86-SLOW-NEXT: jmp .LBB5_3
; X86-SLOW-NEXT: .LBB5_1:
; X86-SLOW-NEXT: movl %edi, %ebp
; X86-SLOW-NEXT: xorl %edi, %edi
; X86-SLOW-NEXT: .LBB5_3:
; X86-SLOW-NEXT: movb %ch, %cl
; X86-SLOW-NEXT: shll %cl, %edx
; X86-SLOW-NEXT: testb $32, %ch
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SLOW-NEXT: jne .LBB5_4
; X86-SLOW-NEXT: # %bb.5:
; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SLOW-NEXT: orl %esi, %ecx
; X86-SLOW-NEXT: jmp .LBB5_6
; X86-SLOW-NEXT: .LBB5_4:
; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: movl $0, (%esp) # 4-byte Folded Spill
; X86-SLOW-NEXT: .LBB5_6:
; X86-SLOW-NEXT: movl %eax, %ecx
; X86-SLOW-NEXT: shll %cl, %esi
; X86-SLOW-NEXT: testb $32, %al
; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SLOW-NEXT: jne .LBB5_7
; X86-SLOW-NEXT: # %bb.8:
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB5_8
; X86-SLOW-NEXT: # %bb.7:
; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: jne .LBB5_10
; X86-SLOW-NEXT: jmp .LBB5_11
; X86-SLOW-NEXT: .LBB5_7:
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: xorl %esi, %esi
; X86-SLOW-NEXT: testl %ebx, %ebx
; X86-SLOW-NEXT: je .LBB5_11
; X86-SLOW-NEXT: .LBB5_10:
; X86-SLOW-NEXT: orl %ebp, %esi
; X86-SLOW-NEXT: orl %edi, %eax
; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %eax, %edx
; X86-SLOW-NEXT: .LBB5_11:
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SLOW-NEXT: orl %ebp, %eax
; X86-SLOW-NEXT: orl %edi, %ecx
; X86-SLOW-NEXT: movl %ecx, %edx
; X86-SLOW-NEXT: .LBB5_8:
; X86-SLOW-NEXT: addl $8, %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
@@ -363,17 +324,14 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
;
; X64-SLOW-LABEL: var_shift_i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: movq %rdi, %rax
; X64-SLOW-NEXT: movq %rsi, %rdi
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: shrq %cl, %rdi
; X64-SLOW-NEXT: andb $63, %dl
; X64-SLOW-NEXT: movl %edx, %ecx
; X64-SLOW-NEXT: negb %cl
; X64-SLOW-NEXT: movq %rdx, %rcx
; X64-SLOW-NEXT: shrq %cl, %rsi
; X64-SLOW-NEXT: leaq (%rdi,%rdi), %rax
; X64-SLOW-NEXT: andb $63, %cl
; X64-SLOW-NEXT: xorb $63, %cl
; X64-SLOW-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-SLOW-NEXT: shlq %cl, %rax
; X64-SLOW-NEXT: orq %rdi, %rax
; X64-SLOW-NEXT: testb %dl, %dl
; X64-SLOW-NEXT: cmoveq %rsi, %rax
; X64-SLOW-NEXT: orq %rsi, %rax
; X64-SLOW-NEXT: retq
%tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
ret i64 %tmp