[X86] Post process the DAG after isel to remove vector moves that were added to zero upper bits.
We previously avoided inserting these moves during isel in a few cases, implemented using a whitelist of opcodes. But it's too difficult to generate a perfect list of opcodes to whitelist, especially with AVX512F without AVX512VL, where 512-bit vectors are used to implement some 128/256-bit operations. Since isel is done bottom up, we'd have to check the VT, the opcode, and the subtarget in order to determine whether an EXTRACT_SUBREG would be generated for some operations.

So instead of doing that, this patch adds a post-processing step that detects when the moves are unnecessary after isel. At that point any EXTRACT_SUBREGs would have already been created and appear in the DAG, so we just need to ensure the input to the move isn't one.

Differential Revision: https://reviews.llvm.org/D44289

llvm-svn: 327724
commit 9690e38cf6
parent dcb1f31f28
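For context, the pattern being targeted looks roughly like the following hypothetical IR (a sketch, not one of the tests changed by this patch; the function name and widening shuffle are illustrative): a 128-bit pmaddwd result is inserted into a zeroed 256-bit vector, so the producing instruction already zeroes the upper ymm bits and a register-to-register move emitted purely to zero them is redundant.

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)

; Hypothetical reduction-style pattern: the upper half of the 256-bit result
; is explicitly zero, and the VEX-encoded vpmaddwd already zeroes the upper
; bits of its destination register, so no extra vmovdqa should survive isel.
define <8 x i32> @pmaddwd_widened(<8 x i16> %a, <8 x i16> %b) {
  %m = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
  %w = shufflevector <4 x i32> %m, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %w
}

The old approach only covered cases like this through the veczeroupper whitelist (VPMADDWD and PSADBW); the post-processing peephole below removes the redundant move whenever the producing node is a machine node other than an EXTRACT_SUBREG.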
@@ -181,6 +181,7 @@ namespace {
    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

    // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -752,6 +753,70 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}

void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)
    return;

  // Attempt to remove vector moves that were inserted to zero upper bits.

  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
  ++Position;

  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure it's one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // The producing instruction is another vector instruction. We can drop
    // the move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));

    // If the move is now dead, delete it.
    if (Move.getNode()->use_empty())
      CurDAG->RemoveDeadNode(Move.getNode());
  }
}


/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
@@ -360,67 +360,6 @@ let Predicates = [HasAVX512, NoVLX] in {
                              v16i32, loadv4i64, sub_ymm>;
}

// List of opcodes that guaranteed to zero the upper elements of vector regs.
// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
// this difficult. So starting with a couple opcodes used by reduction loops
// where we explicitly insert zeros.
class veczeroupper<ValueType vt, RegisterClass RC> :
  PatLeaf<(vt RC:$src), [{
    return N->getOpcode() == X86ISD::VPMADDWD ||
           N->getOpcode() == X86ISD::PSADBW;
  }]>;

def zeroupperv2f64  : veczeroupper<v2f64, VR128>;
def zeroupperv4f32  : veczeroupper<v4f32, VR128>;
def zeroupperv2i64  : veczeroupper<v2i64, VR128>;
def zeroupperv4i32  : veczeroupper<v4i32, VR128>;
def zeroupperv8i16  : veczeroupper<v8i16, VR128>;
def zeroupperv16i8  : veczeroupper<v16i8, VR128>;

def zeroupperv4f64  : veczeroupper<v4f64, VR256>;
def zeroupperv8f32  : veczeroupper<v8f32, VR256>;
def zeroupperv4i64  : veczeroupper<v4i64, VR256>;
def zeroupperv8i32  : veczeroupper<v8i32, VR256>;
def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
def zeroupperv32i8  : veczeroupper<v32i8, VR256>;


// If we can guarantee the upper elements have already been zeroed we can elide
// an explicit zeroing.
multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
                                   ValueType SrcTy, ValueType ZeroTy,
                                   SubRegIndex SubIdx, PatLeaf Zeroupper> {
  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
                                     Zeroupper:$src, (iPTR 0))),
            (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
}

// 128->256
defm: subvector_zero_ellision<VR128, v4f64,  v2f64,  v8i32, sub_xmm, zeroupperv2f64>;
defm: subvector_zero_ellision<VR128, v8f32,  v4f32,  v8i32, sub_xmm, zeroupperv4f32>;
defm: subvector_zero_ellision<VR128, v4i64,  v2i64,  v8i32, sub_xmm, zeroupperv2i64>;
defm: subvector_zero_ellision<VR128, v8i32,  v4i32,  v8i32, sub_xmm, zeroupperv4i32>;
defm: subvector_zero_ellision<VR128, v16i16, v8i16,  v8i32, sub_xmm, zeroupperv8i16>;
defm: subvector_zero_ellision<VR128, v32i8,  v16i8,  v8i32, sub_xmm, zeroupperv16i8>;

// 128->512
defm: subvector_zero_ellision<VR128, v8f64,  v2f64,  v16i32, sub_xmm, zeroupperv2f64>;
defm: subvector_zero_ellision<VR128, v16f32, v4f32,  v16i32, sub_xmm, zeroupperv4f32>;
defm: subvector_zero_ellision<VR128, v8i64,  v2i64,  v16i32, sub_xmm, zeroupperv2i64>;
defm: subvector_zero_ellision<VR128, v16i32, v4i32,  v16i32, sub_xmm, zeroupperv4i32>;
defm: subvector_zero_ellision<VR128, v32i16, v8i16,  v16i32, sub_xmm, zeroupperv8i16>;
defm: subvector_zero_ellision<VR128, v64i8,  v16i8,  v16i32, sub_xmm, zeroupperv16i8>;

// 256->512
defm: subvector_zero_ellision<VR256, v8f64,  v4f64,  v16i32, sub_ymm, zeroupperv4f64>;
defm: subvector_zero_ellision<VR256, v16f32, v8f32,  v16i32, sub_ymm, zeroupperv8f32>;
defm: subvector_zero_ellision<VR256, v8i64,  v4i64,  v16i32, sub_ymm, zeroupperv4i64>;
defm: subvector_zero_ellision<VR256, v16i32, v8i32,  v16i32, sub_ymm, zeroupperv8i32>;
defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
defm: subvector_zero_ellision<VR256, v64i8,  v32i8,  v16i32, sub_ymm, zeroupperv32i8>;


class maskzeroupper<ValueType vt, RegisterClass RC> :
  PatLeaf<(vt RC:$src), [{
    return isMaskZeroExtended(N);
@@ -507,7 +507,6 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vmovdqu %ymm1, (%rax)
@@ -658,7 +658,6 @@ define i64 @v16i8_widened_with_zeroes(<16 x i8> %a, <16 x i8> %b) {
; AVX2-LABEL: v16i8_widened_with_zeroes:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, %xmm0
; AVX2-NEXT:    vpmovmskb %ymm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
@@ -597,7 +597,6 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    vmovapd %xmm0, %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
@@ -605,7 +604,6 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32-AVX-NEXT:    vmovapd %xmm0, %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
@@ -136,7 +136,6 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; ALL-NEXT:    vmovaps %xmm0, %xmm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq