1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[X86] Add test case for unfolding broadcast load from vpternlog.

This commit is contained in:
Craig Topper 2020-07-02 13:29:30 -07:00
parent 50ec534a2c
commit 64cb7dfab8

View File

@ -4633,3 +4633,51 @@ bb1: ; preds = %bb1, %bb
bb10: ; preds = %bb1
ret void
}
; The or/and pattern here should be turned into vpternlog. The multiply is
; there to increase the use count of the loads so they can't fold. We want to
; unfold the broadcast and pull it out of the loop.
define void @bcast_unfold_vpternlog_v16i32(i32* %arg, i32* %arg1) {
; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB131_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
; CHECK-NEXT: vmovdqu64 4096(%rsi,%rax), %zmm1
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2
; CHECK-NEXT: vpternlogd $216, {{.*}}(%rip){1to16}, %zmm0, %zmm1
; CHECK-NEXT: vpmulld %zmm2, %zmm1, %zmm0
; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB131_1
; CHECK-NEXT: # %bb.2: # %bb20
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
br label %bb2
bb2: ; preds = %bb2, %bb
%tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
%tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
%tmp4 = bitcast i32* %tmp3 to <16 x i32>*
%tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
%tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp
%tmp10 = bitcast i32* %tmp6 to <16 x i32>*
%tmp11 = load <16 x i32>, <16 x i32>* %tmp10, align 4
%tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
%tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
%tmp14 = or <16 x i32> %tmp12, %tmp13
%tmp15 = mul <16 x i32> %tmp14, %tmp5
%tmp16 = mul <16 x i32> %tmp15, %tmp11
%tmp17 = bitcast i32* %tmp3 to <16 x i32>*
store <16 x i32> %tmp16, <16 x i32>* %tmp17, align 4
%tmp18 = add i64 %tmp, 16
%tmp19 = icmp eq i64 %tmp18, 1024
br i1 %tmp19, label %bb20, label %bb2
bb20: ; preds = %bb2
ret void
}