
[PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops

This patch simplifies the pattern (xxswap (vec-op (xxswap a) (xxswap b)))
into (vec-op a b) when vec-op is lane-insensitive. The motivating case is
the ScalarToVector-VecOp-ExtractElement sequence on little-endian targets,
but the peephole itself does not depend on endianness, so big-endian
targets may benefit from it as well.
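
The identity being exploited can be checked on its own: for a purely
element-wise (lane-insensitive) operation, swapping both inputs and then
swapping the result gives the same value as applying the operation
directly. Below is a small standalone C++ sketch (illustrative only, not
part of the patch; swapd and addv are made-up helpers modeling xxswapd and
an element-wise vector add) demonstrating this on a 2 x i64 vector modeled
as a plain array:

#include <array>
#include <cassert>
#include <cstdint>

using V2 = std::array<uint64_t, 2>;

// Model of a doubleword swap (xxswapd, i.e. XXPERMDI with immediate 2).
static V2 swapd(V2 V) { return {V[1], V[0]}; }

// Any element-wise op is lane-insensitive; unsigned add serves as an example.
static V2 addv(V2 A, V2 B) { return {A[0] + B[0], A[1] + B[1]}; }

int main() {
  V2 A = {1, 2}, B = {30, 40};
  // Swap both inputs, operate, then swap the result back...
  V2 Swapped = swapd(addv(swapd(A), swapd(B)));
  // ...and the answer matches operating on the unswapped inputs.
  assert(Swapped == addv(A, B));
  return 0;
}

The peephole below applies this identity to MachineSDNodes after
instruction selection: when the outer swap, the lane-insensitive op, and
both inner swaps are all present, the swaps are bypassed and left for
later dead-code elimination.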

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D97658
Author: Qiu Chaofan
Date: 2021-03-10 15:21:32 +08:00
Parent: f70507366d
Commit: b78bd4345c
2 changed files with 157 additions and 0 deletions

@@ -6717,6 +6717,102 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
  CurDAG->RemoveDeadNodes();
}

static bool isVSXSwap(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();
  // Single-operand XXPERMDI or the regular XXPERMDI/XXSLDWI where the
  // immediate operand is 2.
  if (Opc == PPC::XXPERMDIs) {
    return isa<ConstantSDNode>(N->getOperand(1)) &&
           N->getConstantOperandVal(1) == 2;
  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
    return N->getOperand(0) == N->getOperand(1) &&
           isa<ConstantSDNode>(N->getOperand(2)) &&
           N->getConstantOperandVal(2) == 2;
  }
  return false;
}

// TODO: Make this complete and replace with a table-gen bit.
static bool isLaneInsensitive(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();
  switch (Opc) {
  default:
    return false;
  case PPC::VAVGSB:
  case PPC::VAVGUB:
  case PPC::VAVGSH:
  case PPC::VAVGUH:
  case PPC::VAVGSW:
  case PPC::VAVGUW:
  case PPC::VMAXFP:
  case PPC::VMAXSB:
  case PPC::VMAXUB:
  case PPC::VMAXSH:
  case PPC::VMAXUH:
  case PPC::VMAXSW:
  case PPC::VMAXUW:
  case PPC::VMINFP:
  case PPC::VMINSB:
  case PPC::VMINUB:
  case PPC::VMINSH:
  case PPC::VMINUH:
  case PPC::VMINSW:
  case PPC::VMINUW:
  case PPC::VADDFP:
  case PPC::VADDUBM:
  case PPC::VADDUHM:
  case PPC::VADDUWM:
  case PPC::VSUBFP:
  case PPC::VSUBUBM:
  case PPC::VSUBUHM:
  case PPC::VSUBUWM:
  case PPC::VAND:
  case PPC::VANDC:
  case PPC::VOR:
  case PPC::VORC:
  case PPC::VXOR:
  case PPC::VNOR:
  case PPC::VMULUWM:
    return true;
  }
}

// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
// lane-insensitive.
static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
  // The desired xxswap might be the source of a COPY_TO_REGCLASS.
  // TODO: Can we move this into a common helper for the DAG?
  auto SkipRCCopy = [](SDValue V) {
    while (V->isMachineOpcode() &&
           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
      V = V->getOperand(0);
    return V;
  };
  SDValue VecOp = SkipRCCopy(N->getOperand(0));
  if (!isLaneInsensitive(VecOp))
    return;
  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
          RHS = SkipRCCopy(VecOp.getOperand(1));
  if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
      !isVSXSwap(RHS))
    return;
  // These swaps may still have chain uses here; rely on dead-code elimination
  // in later passes to remove them.
  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
}

void PPCDAGToDAGISel::PeepholePPC64() {
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

@@ -6726,6 +6822,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (isVSXSwap(SDValue(N, 0)))
      reduceVSXSwap(N, CurDAG);

    unsigned FirstOp;
    unsigned StorageOpcode = N->getMachineOpcode();
    bool RequiresMod4Offset = false;

@@ -0,0 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s
define i64 @test1(i64* %a, i64* %b) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mr 5, 3
; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: ld 4, 0(4)
; CHECK-NEXT: mtvsrd 34, 3
; CHECK-NEXT: add 3, 3, 4
; CHECK-NEXT: mtvsrd 35, 4
; CHECK-NEXT: vavgsb 2, 2, 3
; CHECK-NEXT: stxsdx 34, 0, 5
; CHECK-NEXT: blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <16 x i8>
  %rhc = bitcast <2 x i64> %rv to <16 x i8>
  %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
  %cb = bitcast <16 x i8> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

define i64 @test2(i64* %a, i64* %b) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mr 5, 3
; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: ld 4, 0(4)
; CHECK-NEXT: mtvsrd 34, 3
; CHECK-NEXT: add 3, 3, 4
; CHECK-NEXT: mtvsrd 35, 4
; CHECK-NEXT: vadduhm 2, 2, 3
; CHECK-NEXT: stxsdx 34, 0, 5
; CHECK-NEXT: blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <8 x i16>
  %rhc = bitcast <2 x i64> %rv to <8 x i16>
  %add = add <8 x i16> %lhc, %rhc
  %cb = bitcast <8 x i16> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)