598a86fcc7

Summary:
Currently we only use min/max to help with ule/uge compares because it removes an invert of the result that would otherwise be needed. But we can also use it for ult/ugt compares if it will prevent the need for a sign bit flip needed to use pcmpgt, at the cost of requiring an invert after the compare.

I also refactored the code so that the max/min code is self contained and does its own return instead of setting up a flag to manipulate the rest of the function's behavior.

Most of the test cases look ok with this. I did notice that we added instructions when one of the operands being sign flipped is a constant vector that we were able to constant fold the flip into.

I also noticed that sometimes the SSE min/max clobbers a register that is needed after the compare. This resulted in an extra move being inserted before the min/max to preserve the register. We could try to detect this and switch from min to max and change the compare operands to use the operand that gets reused in the compare.

Reviewers: spatel, RKSimon

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D42935

llvm-svn: 324842
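A sketch of the identity this relies on (an illustration in scalar terms, not part of the original message): for unsigned x and y,

  x ult y  <=>  x != umax(x, y)
  x ugt y  <=>  x != umin(x, y)

With unsigned vector max/min available, ult/ugt can therefore be lowered as a max/min, a pcmpeq, and a final invert (pxor with all-ones), avoiding the pxor of both operands with the sign-bit mask that the signed pcmpgt lowering needs.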
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE41

; For a setult against a constant, turn it into a setule and lower via psubusw.
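; Why this works (sketch): psubusw is an unsigned saturating subtract, so
; (x -us 25) == 0 exactly when x ule 25, i.e. x ult 26. The SSE2 run therefore
; materializes 25 and compares the psubusw result against zero; the SSE4.1 run
; instead keeps 26 and uses pmaxuw plus an invert.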
define void @loop_no_const_reload(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
; SSE2-LABEL: loop_no_const_reload:
; SSE2:       ## %bb.0: ## %entry
; SSE2-NEXT:    testl %edx, %edx
; SSE2-NEXT:    je LBB0_3
; SSE2-NEXT:  ## %bb.1: ## %for.body.preheader
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  LBB0_2: ## %for.body
; SSE2-NEXT:    ## =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqa (%rdi,%rax), %xmm2
; SSE2-NEXT:    psubusw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, (%rsi,%rax)
; SSE2-NEXT:    addq $16, %rax
; SSE2-NEXT:    decl %edx
; SSE2-NEXT:    jne LBB0_2
; SSE2-NEXT:  LBB0_3: ## %for.end
; SSE2-NEXT:    retq
;
; SSE41-LABEL: loop_no_const_reload:
; SSE41:       ## %bb.0: ## %entry
; SSE41-NEXT:    testl %edx, %edx
; SSE41-NEXT:    je LBB0_3
; SSE41-NEXT:  ## %bb.1: ## %for.body.preheader
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [26,26,26,26,26,26,26,26]
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    .p2align 4, 0x90
; SSE41-NEXT:  LBB0_2: ## %for.body
; SSE41-NEXT:    ## =>This Inner Loop Header: Depth=1
; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    pmaxuw %xmm0, %xmm3
; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm3, (%rsi,%rax)
; SSE41-NEXT:    addq $16, %rax
; SSE41-NEXT:    decl %edx
; SSE41-NEXT:    jne LBB0_2
; SSE41-NEXT:  LBB0_3: ## %for.end
; SSE41-NEXT:    retq
entry:
  %cmp9 = icmp eq i32 %n, 0
  br i1 %cmp9, label %for.end, label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx1 = getelementptr inbounds <2 x i64>, <2 x i64>* %in, i64 %indvars.iv
  %arrayidx1.val = load <2 x i64>, <2 x i64>* %arrayidx1, align 16
  %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
  %cmp.i.i = icmp ult <8 x i16> %0, <i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
  %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
  %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
  %arrayidx5 = getelementptr inbounds <2 x i64>, <2 x i64>* %out, i64 %indvars.iv
  store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Be careful if decrementing the constant would underflow.
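; Lane 0 of the constant is 0, and decrementing it would wrap to 65535,
; turning "x ult 0" (always false) into "x ule 65535" (always true), so the
; psubusw trick is not usable here. The SSE2 run falls back to flipping sign
; bits and doing a signed pcmpgtw against the adjusted bounds (32768, 32794);
; the SSE4.1 run can still use pmaxuw, since the max form needs no constant
; adjustment.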
define void @loop_const_folding_underflow(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
; SSE2-LABEL: loop_const_folding_underflow:
; SSE2:       ## %bb.0: ## %entry
; SSE2-NEXT:    testl %edx, %edx
; SSE2-NEXT:    je LBB1_3
; SSE2-NEXT:  ## %bb.1: ## %for.body.preheader
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32794,32794,32794,32794,32794,32794,32794]
; SSE2-NEXT:    .p2align 4, 0x90
; SSE2-NEXT:  LBB1_2: ## %for.body
; SSE2-NEXT:    ## =>This Inner Loop Header: Depth=1
; SSE2-NEXT:    movdqa (%rdi,%rax), %xmm2
; SSE2-NEXT:    pxor %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, (%rsi,%rax)
; SSE2-NEXT:    addq $16, %rax
; SSE2-NEXT:    decl %edx
; SSE2-NEXT:    jne LBB1_2
; SSE2-NEXT:  LBB1_3: ## %for.end
; SSE2-NEXT:    retq
;
; SSE41-LABEL: loop_const_folding_underflow:
; SSE41:       ## %bb.0: ## %entry
; SSE41-NEXT:    testl %edx, %edx
; SSE41-NEXT:    je LBB1_3
; SSE41-NEXT:  ## %bb.1: ## %for.body.preheader
; SSE41-NEXT:    xorl %eax, %eax
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26]
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    .p2align 4, 0x90
; SSE41-NEXT:  LBB1_2: ## %for.body
; SSE41-NEXT:    ## =>This Inner Loop Header: Depth=1
; SSE41-NEXT:    movdqa (%rdi,%rax), %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    pmaxuw %xmm0, %xmm3
; SSE41-NEXT:    pcmpeqw %xmm2, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm3, (%rsi,%rax)
; SSE41-NEXT:    addq $16, %rax
; SSE41-NEXT:    decl %edx
; SSE41-NEXT:    jne LBB1_2
; SSE41-NEXT:  LBB1_3: ## %for.end
; SSE41-NEXT:    retq
entry:
  %cmp9 = icmp eq i32 %n, 0
  br i1 %cmp9, label %for.end, label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx1 = getelementptr inbounds <2 x i64>, <2 x i64>* %in, i64 %indvars.iv
  %arrayidx1.val = load <2 x i64>, <2 x i64>* %arrayidx1, align 16
  %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
  %cmp.i.i = icmp ult <8 x i16> %0, <i16 0, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
  %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
  %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
  %arrayidx5 = getelementptr inbounds <2 x i64>, <2 x i64>* %out, i64 %indvars.iv
  store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Test for PSUBUSB
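; pmaxub/pminub are baseline SSE2, unlike pmaxuw/pminuw which require SSE4.1,
; which is presumably why both RUN lines share the plain CHECK prefix here.
; (Despite the PSUBUSB name above, the output below now uses the max form:
; pmaxub + pcmpeqb + invert.)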
define <16 x i8> @test_ult_byte(<16 x i8> %a) {
; CHECK-LABEL: test_ult_byte:
; CHECK:       ## %bb.0: ## %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
; CHECK-NEXT:    pmaxub %xmm0, %xmm1
; CHECK-NEXT:    pcmpeqb %xmm1, %xmm0
; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
; CHECK-NEXT:    pxor %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
  %sext = sext <16 x i1> %icmp to <16 x i8>
  ret <16 x i8> %sext
}

; Only do this when we can turn the comparison into a setule, i.e. not for
; register operands.
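; With a register RHS there is no constant to decrement into a setule. SSE2
; therefore flips the sign bit of both operands (pxor with 0x8000) and uses
; the signed pcmpgtw; SSE4.1 uses the max/pcmpeq/invert pattern instead.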
define <8 x i16> @test_ult_register(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test_ult_register:
; SSE2:       ## %bb.0: ## %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtw %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_ult_register:
; SSE41:       ## %bb.0: ## %entry
; SSE41-NEXT:    pmaxuw %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    retq
entry:
  %icmp = icmp ult <8 x i16> %a, %b
  %sext = sext <8 x i1> %icmp to <8 x i16>
  ret <8 x i16> %sext
}