
[x86] swap order of srl (and X, C1), C2 when it saves size

The (non-)obvious win comes from saving 3 bytes by using the 0x83 'and' opcode variant instead of the 0x81 variant.
There are also further improvements based on known bits that allow us to eliminate the mask entirely.

As noted, this could be extended. There are potentially other wins from always shifting first, but doing
that reveals a tangle of problems in other pattern matching. We do this transform generically in 
instcombine, but we often have icmp IR that doesn't match that pattern, so we must account for this
in the backend.

Differential Revision: https://reviews.llvm.org/D38181

llvm-svn: 314023
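
For a concrete feel for the rewrite (an illustrative sketch, not code from this commit), the swap relies on the identity (x & C1) >> C2 == (x >> C2) & (C1 >> C2) for logical shifts. With C1 = 0xF000 and C2 = 12, constants that appear in the updated AVX-512 tests below, the shifted mask 0xF fits in a sign-extended 8-bit immediate, so the 'and' can use the 3-byte 0x83 encoding instead of the 6-byte 0x81 encoding.

```cpp
// Illustrative only -- not code from this commit. Spot-checks the identity
// the swap relies on, for one mask/shift pair taken from the tests.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xF000; // original 'and' mask
  const unsigned C2 = 12;     // logical shift amount
  for (uint32_t x = 0; x <= 0x1FFFF; ++x) {
    uint32_t maskThenShift = (x & C1) >> C2;         // andl $0xF000 ; shrl $12
    uint32_t shiftThenMask = (x >> C2) & (C1 >> C2); // shrl $12 ; andl $15
    assert(maskThenShift == shiftThenMask);
  }
  return 0;
}
```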
Sanjay Patel 2017-09-22 19:37:21 +00:00
parent 9d5fb1b5c3
commit d30aaf33b0
9 changed files with 333 additions and 290 deletions


@@ -31762,6 +31762,40 @@ static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
  return SDValue();
}

static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();

  // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
  // TODO: This is a generic DAG combine that became an x86-only combine to
  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
  // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();

  // If the 'and' mask is already smaller than a byte, then don't bother.
  // If the new 'and' mask would be bigger than a byte, then don't bother.
  // If the mask fits in a byte, then we know we can generate smaller and
  // potentially better code by shifting first.
  // TODO: Always try to shrink a mask that is over 32-bits?
  APInt MaskVal = AndC->getAPIntValue();
  APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
  if (MaskVal.getMinSignedBits() <= 8 || NewMaskVal.getMinSignedBits() > 8)
    return SDValue();

  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
  SDLoc DL(N);
  SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
  SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
  return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
}

/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
@@ -31804,6 +31838,10 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
  if (SDValue V = combineShiftRightAlgebraic(N, DAG))
    return V;

  if (N->getOpcode() == ISD::SRL)
    if (SDValue V = combineShiftRightLogical(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
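
As a rough sanity check on the guard above (a standalone sketch; fitsInSignedImm8 and worthSwapping are illustrative names, not LLVM API): the 0x83 'and' form sign-extends an 8-bit immediate, so the swap only pays off when the original mask needs a 32-bit immediate but the shifted mask fits in a signed 8-bit one.

```cpp
// Standalone sketch of the size heuristic, using plain integers instead of
// APInt; the helper names are made up for illustration.
#include <cstdint>
#include <cstdio>

static bool fitsInSignedImm8(int32_t v) { return v >= -128 && v <= 127; }

static bool worthSwapping(uint32_t mask, unsigned shift) {
  uint32_t newMask = mask >> shift;
  // Skip if the mask is already small (no size win) or the shifted mask is
  // still too wide for the short encoding.
  return !fitsInSignedImm8(static_cast<int32_t>(mask)) &&
         fitsInSignedImm8(static_cast<int32_t>(newMask));
}

int main() {
  printf("%d\n", worthSwapping(0xF000, 12));  // 1: mask shrinks to 0xF
  printf("%d\n", worthSwapping(0x7F, 3));     // 0: mask already fits in imm8
  printf("%d\n", worthSwapping(0xFF0000, 8)); // 0: 0xFF00 still needs imm32
  return 0;
}
```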


@@ -129,8 +129,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm1, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $61440, %ecx # imm = 0xF000
; X32-NEXT: shrl $12, %ecx
; X32-NEXT: andl $15, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm1
; X32-NEXT: vpbroadcastd %xmm1, %xmm1
@@ -151,8 +151,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm1, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $49152, %ecx # imm = 0xC000
; X32-NEXT: shrl $14, %ecx
; X32-NEXT: andl $3, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm1
; X32-NEXT: vpbroadcastw %xmm1, %xmm1
@@ -162,8 +162,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm1, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $32768, %ecx # imm = 0x8000
; X32-NEXT: shrl $15, %ecx
; X32-NEXT: andl $1, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm1
; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
@@ -483,8 +483,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $61440, %ecx # imm = 0xF000
; X32-NEXT: shrl $12, %ecx
; X32-NEXT: andl $15, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -507,8 +507,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $49152, %ecx # imm = 0xC000
; X32-NEXT: shrl $14, %ecx
; X32-NEXT: andl $3, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -519,8 +519,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $32768, %ecx # imm = 0x8000
; X32-NEXT: shrl $15, %ecx
; X32-NEXT: andl $1, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
@@ -860,8 +860,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $61440, %ecx # imm = 0xF000
; X32-NEXT: shrl $12, %ecx
; X32-NEXT: andl $15, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -882,8 +882,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $49152, %ecx # imm = 0xC000
; X32-NEXT: shrl $14, %ecx
; X32-NEXT: andl $3, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -893,8 +893,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $32768, %ecx # imm = 0x8000
; X32-NEXT: shrl $15, %ecx
; X32-NEXT: andl $1, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
@@ -1214,8 +1214,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $61440, %ecx # imm = 0xF000
; X32-NEXT: shrl $12, %ecx
; X32-NEXT: andl $15, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastd %xmm0, %xmm0
@@ -1238,8 +1238,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $49152, %ecx # imm = 0xC000
; X32-NEXT: shrl $14, %ecx
; X32-NEXT: andl $3, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -1250,8 +1250,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X32-NEXT: vpmovb2m %zmm0, %k0
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: andl $32768, %ecx # imm = 0x8000
; X32-NEXT: shrl $15, %ecx
; X32-NEXT: andl $1, %ecx
; X32-NEXT: kmovd %ecx, %k1
; X32-NEXT: vpmovm2b %k1, %zmm0
; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]

File diff suppressed because it is too large.


@@ -48,16 +48,16 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) nounwind readnone ssp nor
; X32: # BB#0: # %entry
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $171, %eax, %eax
; X32-NEXT: andl $65024, %eax # imm = 0xFE00
; X32-NEXT: shrl $9, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: test3:
; X64: # BB#0: # %entry
; X64-NEXT: imull $171, %esi, %eax
; X64-NEXT: andl $65024, %eax # imm = 0xFE00
; X64-NEXT: shrl $9, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
entry:
@@ -167,8 +167,8 @@ define i8 @test8(i8 %x) nounwind {
; X32-NEXT: shrb %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: imull $211, %eax, %eax
; X32-NEXT: andl $24576, %eax # imm = 0x6000
; X32-NEXT: shrl $13, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
@@ -177,8 +177,8 @@ define i8 @test8(i8 %x) nounwind {
; X64-NEXT: shrb %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $211, %eax, %eax
; X64-NEXT: andl $24576, %eax # imm = 0x6000
; X64-NEXT: shrl $13, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
%div = udiv i8 %x, 78
@@ -192,8 +192,8 @@ define i8 @test9(i8 %x) nounwind {
; X32-NEXT: shrb $2, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: imull $71, %eax, %eax
; X32-NEXT: andl $6144, %eax # imm = 0x1800
; X32-NEXT: shrl $11, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X32-NEXT: retl
;
@@ -202,8 +202,8 @@ define i8 @test9(i8 %x) nounwind {
; X64-NEXT: shrb $2, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $71, %eax, %eax
; X64-NEXT: andl $6144, %eax # imm = 0x1800
; X64-NEXT: shrl $11, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X64-NEXT: retq
%div = udiv i8 %x, 116


@@ -12,8 +12,8 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzbl (%eax), %eax
; X32-NEXT: imull $101, %eax, %eax
; X32-NEXT: andl $16384, %eax # imm = 0x4000
; X32-NEXT: shrl $14, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50,8 +50,8 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
; X64: # BB#0: # %BB
; X64-NEXT: movzbl (%rdi), %eax
; X64-NEXT: imull $101, %eax, %eax
; X64-NEXT: andl $16384, %eax # imm = 0x4000
; X64-NEXT: shrl $14, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero


@@ -12,10 +12,9 @@ define void @foo(i32 %a) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shrl $23, %eax
; CHECK-NEXT: testb $1, %ah
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: shrl $23, %edi
; CHECK-NEXT: btl $8, %edi
; CHECK-NEXT: jb .LBB0_2
; CHECK-NEXT: # BB#1: # %true
; CHECK-NEXT: callq qux
; CHECK-NEXT: .LBB0_2: # %false


@@ -3,10 +3,10 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-32
; CHECK-64-LABEL: g64xh:
; CHECK-64: testb $8, {{%ah|%ch}}
; CHECK-64: btl $11
; CHECK-64: ret
; CHECK-32-LABEL: g64xh:
; CHECK-32: testb $8, %ah
; CHECK-32: btl $11
; CHECK-32: ret
define void @g64xh(i64 inreg %x) nounwind {
%t = and i64 %x, 2048
@@ -37,10 +37,10 @@ no:
ret void
}
; CHECK-64-LABEL: g32xh:
; CHECK-64: testb $8, {{%ah|%ch}}
; CHECK-64: btl $11
; CHECK-64: ret
; CHECK-32-LABEL: g32xh:
; CHECK-32: testb $8, %ah
; CHECK-32: btl $11
; CHECK-32: ret
define void @g32xh(i32 inreg %x) nounwind {
%t = and i32 %x, 2048
@@ -71,10 +71,10 @@ no:
ret void
}
; CHECK-64-LABEL: g16xh:
; CHECK-64: testb $8, {{%ah|%ch}}
; CHECK-64: btl $11
; CHECK-64: ret
; CHECK-32-LABEL: g16xh:
; CHECK-32: testb $8, %ah
; CHECK-32: btl $11
; CHECK-32: ret
define void @g16xh(i16 inreg %x) nounwind {
%t = and i16 %x, 2048


@@ -1,13 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
; computeKnownBits determines that we don't need a mask op that is required in the general case.
define i8 @foo(i8 %tmp325) {
; CHECK-LABEL: foo:
; CHECK: # BB#0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: imull $111, %ecx, %eax
; CHECK-NEXT: andl $28672, %eax # imm = 0x7000
; CHECK-NEXT: shrl $12, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: movb $37, %dl
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: mulb %dl


@@ -17,15 +17,13 @@ define i32 @t(i32 %a, i32 %b) nounwind ssp {
;
; X64-LABEL: t:
; X64: # BB#0: # %entry
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl %esi, %eax
; X64-NEXT: testb $64, %ah
; X64-NEXT: je .LBB0_1
; X64-NEXT: # BB#2: # %bb1
; X64-NEXT: xorl %esi, %edi
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: btl $14, %edi
; X64-NEXT: jae .LBB0_1
; X64-NEXT: # BB#2: # %bb1
; X64-NEXT: jmp bar # TAILCALL
; X64-NEXT: .LBB0_1: # %bb
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: jmp foo # TAILCALL
entry:
%0 = and i32 %a, 16384