; Mirror metadata: synced from https://github.com/RPCS3/llvm-mirror.git (2025-02-01 05:01:59 +01:00), commit 74c9f1897a.
;
; Summary: The 2 source operands commutable instructions are encoded in the
; VEX.VVVV field and the r/m field of the MODRM byte plus the VEX.B field. The
; VEX.B field is missing from the 2-byte VEX encoding. If the VEX.VVVV source is
; 0-7 and the other register is 8-15 we can swap them to avoid needing the
; VEX.B field. This works as long as the VEX.W, VEX.mmmmm, and VEX.X fields are
; also not needed. Fixes PR36706.
; Reviewers: RKSimon, spatel  Reviewed By: RKSimon
; Subscribers: hiraditya, llvm-commits  Tags: #llvm
; Differential Revision: https://reviews.llvm.org/D68550
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case was trying to use XMM16 and spill it without VLX support for the necessary store instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has now been fixed so we no longer use XMM16.
define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    vmovaps %xmm1, %xmm9
; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
; CHECK-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm14
; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
; CHECK-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm10
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT:    vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT:    vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm5
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm6
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm8
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; CHECK-NEXT:    vaddps %xmm2, %xmm14, %xmm2
; CHECK-NEXT:    vmovaps %xmm13, %xmm1
; CHECK-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vaddps %xmm10, %xmm13, %xmm10
; CHECK-NEXT:    vaddps %xmm13, %xmm13, %xmm3
; CHECK-NEXT:    vaddps %xmm12, %xmm14, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm8, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm13, %xmm0
; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %xmm10, (%rsp)
; CHECK-NEXT:    vmovaps %xmm9, %xmm3
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
; Many cross-lane shuffles of the two <16 x float> inputs; the chosen indices
; drive the vpermi2ps/vinsertps/vblendps selection checked above.
  %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
  %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
  %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
  %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
  %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
  %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
  %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
  %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
  %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
  %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>

  %r1 = fadd <4 x float> %ay10, %ay9
  %r2 = fadd <4 x float> %ay8, %ay7
  %r3 = fadd <4 x float> %ay6, %ay5
  %r4 = fadd <4 x float> %ay2, %ax10
  %r5 = fadd <4 x float> %ay9, %ax8
  %r6 = fadd <4 x float> %r5, %r3
  %r7 = fadd <4 x float> %a9, %r6
  ; 10-arg call keeps many values live, forcing spills/reloads around it.
  %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
  %a12 = fadd <4 x float> %a2, %a1
  %a13 = fadd <4 x float> %a12, %a11

  ret <4 x float> %a13
}