
[X86] Add patterns to make blends with immediate control commutable during isel for load folding.

llvm-svn: 313476
Craig Topper 2017-09-17 05:06:05 +00:00
parent d1020c20b9
commit 2a324415cd
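
Editorial note (not part of the commit): the BlendCommuteImm2/4/8 transforms added below rely on how blend-with-immediate picks lanes: a set immediate bit selects the lane from the second source, a clear bit from the first, so swapping the two sources while XOR-ing the immediate with the all-lanes mask gives the same result. A minimal sketch of that identity with SSE4.1 intrinsics follows; the file name and values are illustrative only.

// blend_commute_demo.cpp -- illustrative sketch, compile with -msse4.1.
// Demonstrates the identity the BlendCommuteImm* transforms depend on:
//   blendps(a, b, imm) == blendps(b, a, imm ^ 0xF)   (four lanes -> mask 0xF)
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);

  // Immediate 0x6 (0b0110) takes lanes 1 and 2 from b, lanes 0 and 3 from a.
  __m128 x = _mm_blend_ps(a, b, 0x6);
  // Swapped operands with the immediate XOR-ed against 0xF select the same lanes.
  __m128 y = _mm_blend_ps(b, a, 0x6 ^ 0xF);

  float xs[4], ys[4];
  _mm_storeu_ps(xs, x);
  _mm_storeu_ps(ys, y);
  for (int i = 0; i < 4; ++i)
    std::printf("%g %g\n", xs[i], ys[i]);  // each pair prints the same value
  return 0;
}
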


@@ -6542,6 +6542,21 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
def BlendCommuteImm2 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue() & 0x03;
return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;
def BlendCommuteImm4 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue() & 0x0f;
return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;
def BlendCommuteImm8 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue() & 0xff;
return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6549,26 +6564,6 @@ let Predicates = [HasAVX] in {
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
}
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0,
DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0,
DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
@@ -6589,9 +6584,6 @@ let Predicates = [HasAVX2] in {
VR256, loadv4i64, i256mem, 0,
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0,
DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6600,17 +6592,7 @@ let Constraints = "$src1 = $dst" in {
VR128, memopv2i64, i128mem,
1, SSE_MPSADBW_ITINS>;
}
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memopv4f32, f128mem,
1, SSE_INTALU_ITINS_FBLEND_P>;
let ExeDomain = SSEPackedDouble in
defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
VR128, memopv2f64, f128mem,
1, SSE_INTALU_ITINS_FBLEND_P>;
defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memopv2i64, i128mem,
1, SSE_INTALU_ITINS_BLEND_P>;
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
@@ -6621,6 +6603,82 @@ let Constraints = "$src1 = $dst" in {
SSE_DPPD_ITINS>;
}
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop, bit Is2Addr, Domain d,
OpndItins itins, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
itins.rr>, Sched<[itins.Sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
RC:$src1, imm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
(commuteXForm imm:$src3))>;
}
let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>,
VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
VEX_4V, VEX_L, VEX_WIG;
defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0, SSEPackedInt,
DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2] in {
defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0, SSEPackedInt,
DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
VEX_4V, VEX_L, VEX_WIG;
}
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
VR128, memopv2i64, i128mem, 1, SSEPackedInt,
SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -7810,10 +7868,10 @@ let Predicates = [HasF16C, NoVLX] in {
// AVX2 Instructions
//===----------------------------------------------------------------------===//
/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
X86MemOperand x86memop> {
X86MemOperand x86memop, SDNodeXForm commuteXForm> {
let isCommutable = 1 in
def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -7829,12 +7887,19 @@ multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
RC:$src1, imm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
(commuteXForm imm:$src3))>;
}
defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
VR128, loadv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
VR256, loadv4i64, i256mem>, VEX_L;
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
VR128, loadv2i64, i128mem, BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
VR256, loadv4i64, i256mem, BlendCommuteImm8>,
VEX_L;
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
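
Editorial note (not part of the commit): the practical effect of the "commute if load is in first source" patterns above is that a blend whose memory operand arrives as the first source can still fold the load, because isel may swap the register and memory operands and rewrite the immediate with the matching BlendCommuteImm transform. A hypothetical source-level shape that benefits, sketched with SSE4.1 intrinsics (function name and constants are illustrative):

// Illustrative sketch, requires SSE4.1 (-msse4.1).  With the commuted
// patterns, the load feeding the *first* blend operand may be folded into the
// blend by swapping operands and inverting the immediate (0x3 -> 0xC here).
#include <immintrin.h>

__m128 blend_load_first(const float *p, __m128 b) {
  __m128 a = _mm_loadu_ps(p);          // memory operand is the first source
  return _mm_blend_ps(a, b, 0x3);      // same result as _mm_blend_ps(b, a, 0xC)
}
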