mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[X86] Add test case for a regression from D76649. NFC
When combineLoopMAdd was moved to IR, we became stricter about ensuring the truncate was free. This prevents us from matching the sum of squares of byte differences pattern shown here. We used to handle this case when the combine was in SelectionDAG.
This commit is contained in:
parent
424e152158
commit
f528918f5f
@ -2930,3 +2930,132 @@ middle.block:
|
||||
%tmp30 = or i64 %tmp28, %tmp29
|
||||
ret i64 %tmp30
|
||||
}
|
||||
|
||||
; Sum of squared byte differences over an 8-wide vectorized loop (regression
; test for D76649). Each iteration loads <8 x i8> from %a and from %b,
; zero-extends both to <8 x i32>, computes (b - a) * (b - a) per lane and
; accumulates into %vec.phi. middle.block then reduces the 8 lanes to one i32.
; NOTE(review) the checked codegen below uses separate multiply and add
; instructions; presumably a fused multiply-add form is the eventual goal, confirm.
define i32 @sum_of_square_differences(i8* %a, i8* %b, i32 %n) {
|
||||
; SSE2-LABEL: sum_of_square_differences:
|
||||
; SSE2: # %bb.0: # %entry
|
||||
; SSE2-NEXT: movl %edx, %eax
|
||||
; SSE2-NEXT: pxor %xmm1, %xmm1
|
||||
; SSE2-NEXT: xorl %ecx, %ecx
|
||||
; SSE2-NEXT: pxor %xmm0, %xmm0
|
||||
; SSE2-NEXT: pxor %xmm2, %xmm2
|
||||
; SSE2-NEXT: .p2align 4, 0x90
|
||||
; SSE2-NEXT: .LBB34_1: # %vector.body
|
||||
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
|
||||
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
|
||||
; SSE2-NEXT: psubw %xmm3, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm4, %xmm3
|
||||
; SSE2-NEXT: pmulhw %xmm4, %xmm3
|
||||
; SSE2-NEXT: pmullw %xmm4, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm4, %xmm5
|
||||
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
|
||||
; SSE2-NEXT: paddd %xmm5, %xmm0
|
||||
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
|
||||
; SSE2-NEXT: paddd %xmm4, %xmm2
|
||||
; SSE2-NEXT: addq $8, %rcx
|
||||
; SSE2-NEXT: cmpq %rcx, %rax
|
||||
; SSE2-NEXT: jne .LBB34_1
|
||||
; SSE2-NEXT: # %bb.2: # %middle.block
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm0
|
||||
; SSE2-NEXT: movd %xmm0, %eax
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: sum_of_square_differences:
|
||||
; AVX1: # %bb.0: # %entry
|
||||
; AVX1-NEXT: movl %edx, %eax
|
||||
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX1-NEXT: xorl %ecx, %ecx
|
||||
; AVX1-NEXT: .p2align 4, 0x90
|
||||
; AVX1-NEXT: .LBB34_1: # %vector.body
|
||||
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
|
||||
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
|
||||
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
|
||||
; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
|
||||
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
|
||||
; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
|
||||
; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: addq $8, %rcx
|
||||
; AVX1-NEXT: cmpq %rcx, %rax
|
||||
; AVX1-NEXT: jne .LBB34_1
|
||||
; AVX1-NEXT: # %bb.2: # %middle.block
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovd %xmm0, %eax
|
||||
; AVX1-NEXT: vzeroupper
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX256-LABEL: sum_of_square_differences:
|
||||
; AVX256: # %bb.0: # %entry
|
||||
; AVX256-NEXT: movl %edx, %eax
|
||||
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX256-NEXT: xorl %ecx, %ecx
|
||||
; AVX256-NEXT: .p2align 4, 0x90
|
||||
; AVX256-NEXT: .LBB34_1: # %vector.body
|
||||
; AVX256-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
|
||||
; AVX256-NEXT: vpsubd %ymm1, %ymm2, %ymm1
|
||||
; AVX256-NEXT: vpmulld %ymm1, %ymm1, %ymm1
|
||||
; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0
|
||||
; AVX256-NEXT: addq $8, %rcx
|
||||
; AVX256-NEXT: cmpq %rcx, %rax
|
||||
; AVX256-NEXT: jne .LBB34_1
|
||||
; AVX256-NEXT: # %bb.2: # %middle.block
|
||||
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
|
||||
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX256-NEXT: vmovd %xmm0, %eax
|
||||
; AVX256-NEXT: vzeroupper
|
||||
; AVX256-NEXT: retq
|
||||
entry:
|
||||
; Widen the i32 trip count to i64 so it can be compared with the i64 index.
%0 = zext i32 %n to i64
|
||||
br label %vector.body
|
||||
|
||||
vector.body:
|
||||
; %index is the byte offset (0 on entry, +8 per iteration); %vec.phi carries
; the eight i32 partial sums, starting from zero.
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
|
||||
%vec.phi = phi <8 x i32> [ %9, %vector.body ], [ zeroinitializer, %entry ]
|
||||
; Load 8 bytes from %a at %index (align 1) and zero-extend each lane to i32.
%1 = getelementptr inbounds i8, i8* %a, i64 %index
|
||||
%2 = bitcast i8* %1 to <8 x i8>*
|
||||
%wide.load = load <8 x i8>, <8 x i8>* %2, align 1
|
||||
%3 = zext <8 x i8> %wide.load to <8 x i32>
|
||||
; Same load and zero-extend for %b.
%4 = getelementptr inbounds i8, i8* %b, i64 %index
|
||||
%5 = bitcast i8* %4 to <8 x i8>*
|
||||
%wide.load2 = load <8 x i8>, <8 x i8>* %5, align 1
|
||||
%6 = zext <8 x i8> %wide.load2 to <8 x i32>
|
||||
; Per-lane difference (b - a) in i32, squared, then added to the running sums.
%7 = sub <8 x i32> %6, %3
|
||||
%8 = mul <8 x i32> %7, %7
|
||||
%9 = add nsw <8 x i32> %8, %vec.phi
|
||||
; Advance by 8 bytes; the exit is taken only when %index.next equals %0 exactly.
%index.next = add i64 %index, 8
|
||||
%10 = icmp eq i64 %index.next, %0
|
||||
br i1 %10, label %middle.block, label %vector.body
|
||||
|
||||
middle.block:
|
||||
; Horizontal reduction of %9 by shuffle+add steps: lanes 4-7 are added onto
; lanes 0-3, then lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0.
%rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%bin.rdx = add <8 x i32> %9, %rdx.shuf
|
||||
%rdx.shuf31 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%bin.rdx32 = add <8 x i32> %bin.rdx, %rdx.shuf31
|
||||
%rdx.shuf33 = shufflevector <8 x i32> %bin.rdx32, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%bin.rdx34 = add <8 x i32> %bin.rdx32, %rdx.shuf33
|
||||
; Lane 0 now holds the sum of all eight partial sums; return it.
%11 = extractelement <8 x i32> %bin.rdx34, i32 0
|
||||
ret i32 %11
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user