From 2a324415cd3347aeb5f184a5f1ffa413956d1da2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 17 Sep 2017 05:06:05 +0000 Subject: [PATCH] [X86] Add patterns to make blends with immediate control commutable during isel for load folding. llvm-svn: 313476 --- lib/Target/X86/X86InstrSSE.td | 147 ++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 41 deletions(-) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a4dc64975cc..77a0c433841 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -6542,6 +6542,21 @@ multiclass SS41I_binop_rmi opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +def BlendCommuteImm2 : SDNodeXFormgetZExtValue() & 0x03; + return getI8Imm(Imm ^ 0x03, SDLoc(N)); +}]>; + +def BlendCommuteImm4 : SDNodeXFormgetZExtValue() & 0x0f; + return getI8Imm(Imm ^ 0x0f, SDLoc(N)); +}]>; + +def BlendCommuteImm8 : SDNodeXFormgetZExtValue() & 0xff; + return getI8Imm(Imm ^ 0xff, SDLoc(N)); +}]>; + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -6549,26 +6564,6 @@ let Predicates = [HasAVX] in { DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG; } - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; - defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, - VR256, loadv8f32, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; - defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, - VR256, loadv4f64, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; - } - defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG; - let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -6589,9 +6584,6 @@ let Predicates = [HasAVX2] in { VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG; } - defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -6600,17 +6592,7 @@ let Constraints = "$src1 = $dst" in { VR128, memopv2i64, i128mem, 1, SSE_MPSADBW_ITINS>; } - let ExeDomain = SSEPackedSingle in - defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, - VR128, memopv4f32, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - let ExeDomain = SSEPackedDouble in - defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, - VR128, memopv2f64, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, - VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_BLEND_P>; + let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -6621,6 +6603,82 @@ let Constraints = "$src1 = $dst" in { SSE_DPPD_ITINS>; } +/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate +multiclass SS41I_blend_rmi opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr, Domain d, + OpndItins itins, SDNodeXForm commuteXForm> { +let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { + let isCommutable = 1 in + def rri : SS4AIi8, Sched<[itins.Sched]>; + def rmi : SS4AIi8, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + + // Pattern to commute if load is in first source. + def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; +} + +let Predicates = [HasAVX] in { + defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, SSEPackedSingle, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>, + VEX_4V, VEX_WIG; + defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, SSEPackedSingle, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; + defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, SSEPackedDouble, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>, + VEX_4V, VEX_WIG; + defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, SSEPackedDouble, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>, + VEX_4V, VEX_L, VEX_WIG; + defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, SSEPackedInt, + DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX2] in { + defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, SSEPackedInt, + DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; +} + +defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, 1, SSEPackedSingle, + SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>; +defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, 1, SSEPackedDouble, + SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>; +defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, 1, SSEPackedInt, + SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>; + // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. let Predicates = [HasAVX] in { @@ -7810,10 +7868,10 @@ let Predicates = [HasF16C, NoVLX] in { // AVX2 Instructions //===----------------------------------------------------------------------===// -/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate -multiclass AVX2_binop_rmi opc, string OpcodeStr, SDNode OpNode, +/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate +multiclass AVX2_blend_rmi opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { + X86MemOperand x86memop, SDNodeXForm commuteXForm> { let isCommutable = 1 in def rri : AVX2AIi8 opc, string OpcodeStr, SDNode OpNode, (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; + + // Pattern to commute if load is in first source. + def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; } -defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, - VR128, loadv2i64, i128mem>; -defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, - VR256, loadv4i64, i256mem>, VEX_L; +defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem, BlendCommuteImm4>; +defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem, BlendCommuteImm8>, + VEX_L; // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128.