1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 11:33:24 +02:00

Mark FMA4 instructions as commutable and add them to the folding tables.

llvm-svn: 163035
This commit is contained in:
Craig Topper 2012-08-31 23:10:34 +00:00
parent 48ba96b707
commit 2e53378ff6
3 changed files with 93 additions and 0 deletions

View File

@@ -200,6 +200,7 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
PatFrag mem_frag> {
let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -228,6 +229,7 @@ let isCodeGenOnly = 1 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ComplexPattern mem_cpat, Intrinsic Int> {
let isCommutable = 1 in
def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -251,6 +253,7 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256> {
let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -270,6 +273,7 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
let isCommutable = 1 in
def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,

View File

@@ -1110,6 +1110,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 },
{ X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 },
// FIXME: add AVX 256-bit foldable instructions
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_16 },
{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_16 },
{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 },
{ X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 },
{ X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 },
{ X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 },
{ X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_16 },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_16 },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 },
{ X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 },
{ X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 },
{ X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 },
{ X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 },
{ X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 },
{ X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 },
{ X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 },
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1237,6 +1267,36 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_16 },
{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_16 },
{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
{ X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
{ X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
{ X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
{ X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_16 },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_16 },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
{ X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
{ X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
{ X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
{ X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
{ X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
{ X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
{ X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {

View File

@@ -181,3 +181,32 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%res = fsub float %y, %a2
ret float %res
}
; Packed-single multiply-add whose first multiplicand is loaded from memory:
;   res = (load %a0) * %a1 + %a2
; The default-prefix expectations look for a separate vmovaps load followed by
; vfmadd213ps; the FMA4-prefix expectations look for the load folded directly
; into vfmaddps as a memory operand.
; The label checks name the full function so they cannot match a
; shorter-named function by substring.
; CHECK: test_x86_fmadd_ps_load
; CHECK: vmovaps (%rdi), %xmm2
; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
; CHECK: ret
; CHECK_FMA4: test_x86_fmadd_ps_load
; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
%x = load <4 x float>* %a0
%y = fmul <4 x float> %x, %a1
%res = fadd <4 x float> %y, %a2
ret <4 x float> %res
}
; Packed-single multiply-subtract whose first multiplicand is loaded from memory:
;   res = (load %a0) * %a1 - %a2
; The default-prefix expectations look for a separate vmovaps load followed by
; vfmsub213ps; the FMA4-prefix expectations look for the load folded directly
; into vfmsubps as a memory operand.
; The label checks name the full function, and the mnemonic check spells out
; the full "vfmsub213ps" instead of relying on a substring match.
; CHECK: test_x86_fmsub_ps_load
; CHECK: vmovaps (%rdi), %xmm2
; CHECK: vfmsub213ps %xmm1, %xmm0, %xmm2
; CHECK: ret
; CHECK_FMA4: test_x86_fmsub_ps_load
; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
%x = load <4 x float>* %a0
%y = fmul <4 x float> %x, %a1
%res = fsub <4 x float> %y, %a2
ret <4 x float> %res
}