[PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops

This patch simplifies the pattern (xxswap (vec-op (xxswap a) (xxswap b)))
into (vec-op a b) when vec-op is lane-insensitive. The motivating case is
the ScalarToVector-VecOp-ExtractElement sequence on little-endian targets,
but the peephole itself is not tied to endianness, so big-endian targets
may benefit as well.

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D97658
parent f70507366d
commit b78bd4345c
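To see what the peephole buys us: on little-endian subtargets, each doubleword insert/extract is lowered with a doubleword swap (xxswapd), so a lane-insensitive vector op ends up bracketed by three swaps. A schematic of the rewrite, with illustrative register numbers not taken from the patch:

    # before the peephole (illustrative):
    xxswapd 34, 34          # swap lanes of the first operand
    xxswapd 35, 35          # swap lanes of the second operand
    vavgsb 2, 2, 3          # lane-insensitive op (vs34 = v2, vs35 = v3)
    xxswapd 34, 34          # swap the result back

    # after the peephole:
    vavgsb 2, 2, 3

All three swaps cancel because vavgsb computes each byte lane independently.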
lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6717,6 +6717,102 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
  CurDAG->RemoveDeadNodes();
}

static bool isVSXSwap(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();

  // Single-operand XXPERMDI or the regular XXPERMDI/XXSLDWI where the
  // immediate operand is 2.
  if (Opc == PPC::XXPERMDIs) {
    return isa<ConstantSDNode>(N->getOperand(1)) &&
           N->getConstantOperandVal(1) == 2;
  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
    return N->getOperand(0) == N->getOperand(1) &&
           isa<ConstantSDNode>(N->getOperand(2)) &&
           N->getConstantOperandVal(2) == 2;
  }

  return false;
}
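For reference, the forms matched above all encode the same doubleword swap. xxswapd is itself an extended mnemonic, so the following equivalence comes from the ISA, not from this patch:

    xxswapd  XT, XB          # equivalent to:
    xxpermdi XT, XB, XB, 2

XXPERMDIs is the single-operand variant, and xxsldwi with both inputs equal and a shift of 2 rotates the vector by two words, which is again a doubleword swap.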

// TODO: Make this complete and replace with a table-gen bit.
static bool isLaneInsensitive(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();

  switch (Opc) {
  default:
    return false;
  case PPC::VAVGSB:
  case PPC::VAVGUB:
  case PPC::VAVGSH:
  case PPC::VAVGUH:
  case PPC::VAVGSW:
  case PPC::VAVGUW:
  case PPC::VMAXFP:
  case PPC::VMAXSB:
  case PPC::VMAXUB:
  case PPC::VMAXSH:
  case PPC::VMAXUH:
  case PPC::VMAXSW:
  case PPC::VMAXUW:
  case PPC::VMINFP:
  case PPC::VMINSB:
  case PPC::VMINUB:
  case PPC::VMINSH:
  case PPC::VMINUH:
  case PPC::VMINSW:
  case PPC::VMINUW:
  case PPC::VADDFP:
  case PPC::VADDUBM:
  case PPC::VADDUHM:
  case PPC::VADDUWM:
  case PPC::VSUBFP:
  case PPC::VSUBUBM:
  case PPC::VSUBUHM:
  case PPC::VSUBUWM:
  case PPC::VAND:
  case PPC::VANDC:
  case PPC::VOR:
  case PPC::VORC:
  case PPC::VXOR:
  case PPC::VNOR:
  case PPC::VMULUWM:
    return true;
  }
}
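Lane-insensitive here means the op computes each output lane from the corresponding input lanes only, so it commutes with any lane permutation p:

    vec-op(p(a), p(b)) = p(vec-op(a, b))

That is why averages, min/max, modulo add/sub, the logical ops and VMULUWM qualify, while anything that moves data across lanes (permutes, packs, splats, cross-lane sums) must stay out of this list.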

// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
// lane-insensitive.
static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
  // Our desired xxswap might be the source of a COPY_TO_REGCLASS.
  // TODO: Can we put this in a common method for DAG?
  auto SkipRCCopy = [](SDValue V) {
    while (V->isMachineOpcode() &&
           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
      V = V->getOperand(0);
    return V;
  };

  SDValue VecOp = SkipRCCopy(N->getOperand(0));
  if (!isLaneInsensitive(VecOp))
    return;

  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
          RHS = SkipRCCopy(VecOp.getOperand(1));
  if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
      !isVSXSwap(RHS))
    return;

  // These swaps may still have chain uses here; count on dead-code
  // elimination in later passes to remove them.
  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
}
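A note on the guards: the hasOneUse() checks keep the rewrite conservative. If one of the input swaps has a second user, say

    t1 = xxswap a
    t2 = vec-op t1, t3
    ... = store t1

then t1 cannot simply be replaced by a without changing the store, so the peephole gives up rather than rewriting t2 alone. Chain-only uses of the removed swaps are tolerated, as the comment above notes, since later dead-code cleanup removes them.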

void PPCDAGToDAGISel::PeepholePPC64() {
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

@@ -6726,6 +6822,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (isVSXSwap(SDValue(N, 0)))
      reduceVSXSwap(N, CurDAG);

    unsigned FirstOp;
    unsigned StorageOpcode = N->getMachineOpcode();
    bool RequiresMod4Offset = false;
test/CodeGen/PowerPC/swap-reduction.ll (new file, 58 lines)
@@ -0,0 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s

define i64 @test1(i64* %a, i64* %b) {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    mr 5, 3
; CHECK-NEXT:    ld 3, 0(3)
; CHECK-NEXT:    ld 4, 0(4)
; CHECK-NEXT:    mtvsrd 34, 3
; CHECK-NEXT:    add 3, 3, 4
; CHECK-NEXT:    mtvsrd 35, 4
; CHECK-NEXT:    vavgsb 2, 2, 3
; CHECK-NEXT:    stxsdx 34, 0, 5
; CHECK-NEXT:    blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <16 x i8>
  %rhc = bitcast <2 x i64> %rv to <16 x i8>
  %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
  %cb = bitcast <16 x i8> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

define i64 @test2(i64* %a, i64* %b) {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    mr 5, 3
; CHECK-NEXT:    ld 3, 0(3)
; CHECK-NEXT:    ld 4, 0(4)
; CHECK-NEXT:    mtvsrd 34, 3
; CHECK-NEXT:    add 3, 3, 4
; CHECK-NEXT:    mtvsrd 35, 4
; CHECK-NEXT:    vadduhm 2, 2, 3
; CHECK-NEXT:    stxsdx 34, 0, 5
; CHECK-NEXT:    blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <8 x i16>
  %rhc = bitcast <2 x i64> %rv to <8 x i16>
  %add = add <8 x i16> %lhc, %rhc
  %cb = bitcast <8 x i16> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)
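The two tests drive the peephole through different entry points: test1 reaches vavgsb via an AltiVec intrinsic, while test2 reaches vadduhm from a plain IR vector add; in both, the CHECK lines contain no xxswapd, confirming the swaps were removed. A further case one might add (hypothetical, not part of this patch) would exercise the byte-add path, which selects VADDUBM from the opcode list above:

define i64 @test3(i64* %a, i64* %b) {
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <16 x i8>
  %rhc = bitcast <2 x i64> %rv to <16 x i8>
  %add = add <16 x i8> %lhc, %rhc
  %cb = bitcast <16 x i8> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}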