mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
[PowerPC] custom lower v2f64 fpext v2f32
Reduces scalarization overhead via custom lowering of fpext from v2f32 to v2f64.

E.g., for the following IR:
  %0 = load <2 x float>, <2 x float>* %Ptr, align 8
  %1 = fpext <2 x float> %0 to <2 x double>
  ret <2 x double> %1

Pre custom lowering:
  ld r3, 0(r3)
  mtvsrd f0, r3
  xxswapd vs34, vs0
  xscvspdpn f0, vs0
  xxsldwi vs1, vs34, vs34, 3
  xscvspdpn f1, vs1
  xxmrghd vs34, vs0, vs1

After custom lowering:
  lfd f0, 0(r3)
  xxmrghw vs0, vs0, vs0
  xvcvspdp vs34, vs0

Differential Revision: https://reviews.llvm.org/D57857

llvm-svn: 360429
This commit is contained in:
parent
b1f6bfbfa9
commit
d48de5e046
@ -877,6 +877,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
|
|||||||
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
|
setOperationAction(ISD::FPOWI, MVT::f128, Expand);
|
||||||
setOperationAction(ISD::FREM, MVT::f128, Expand);
|
setOperationAction(ISD::FREM, MVT::f128, Expand);
|
||||||
}
|
}
|
||||||
|
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1378,6 +1379,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||||||
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
|
case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
|
||||||
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
|
case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
|
||||||
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
|
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
|
||||||
|
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
|
||||||
|
case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH";
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -9608,6 +9611,59 @@ SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
|
return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Custom lowering for fpext vf32 to v2f64
|
||||||
|
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
|
||||||
|
|
||||||
|
assert(Op.getOpcode() == ISD::FP_EXTEND &&
|
||||||
|
"Should only be called for ISD::FP_EXTEND");
|
||||||
|
|
||||||
|
// We only want to custom lower an extend from v2f32 to v2f64.
|
||||||
|
if (Op.getValueType() != MVT::v2f64 ||
|
||||||
|
Op.getOperand(0).getValueType() != MVT::v2f32)
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
SDLoc dl(Op);
|
||||||
|
SDValue Op0 = Op.getOperand(0);
|
||||||
|
|
||||||
|
switch (Op0.getOpcode()) {
|
||||||
|
default:
|
||||||
|
return SDValue();
|
||||||
|
case ISD::FADD:
|
||||||
|
case ISD::FMUL:
|
||||||
|
case ISD::FSUB: {
|
||||||
|
SDValue NewLoad[2];
|
||||||
|
for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
|
||||||
|
// Ensure both input are loads.
|
||||||
|
SDValue LdOp = Op0.getOperand(i);
|
||||||
|
if (LdOp.getOpcode() != ISD::LOAD)
|
||||||
|
return SDValue();
|
||||||
|
// Generate new load node.
|
||||||
|
LoadSDNode *LD = cast<LoadSDNode>(LdOp);
|
||||||
|
SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
|
||||||
|
NewLoad[i] =
|
||||||
|
DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
|
||||||
|
DAG.getVTList(MVT::v4f32, MVT::Other),
|
||||||
|
LoadOps, LD->getMemoryVT(),
|
||||||
|
LD->getMemOperand());
|
||||||
|
}
|
||||||
|
SDValue NewOp = DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32,
|
||||||
|
NewLoad[0], NewLoad[1],
|
||||||
|
Op0.getNode()->getFlags());
|
||||||
|
return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewOp);
|
||||||
|
}
|
||||||
|
case ISD::LOAD: {
|
||||||
|
LoadSDNode *LD = cast<LoadSDNode>(Op0);
|
||||||
|
SDValue LoadOps[] = { LD->getChain(), LD->getBasePtr() };
|
||||||
|
SDValue NewLd =
|
||||||
|
DAG.getMemIntrinsicNode(PPCISD::LD_VSX_LH, dl,
|
||||||
|
DAG.getVTList(MVT::v4f32, MVT::Other),
|
||||||
|
LoadOps, LD->getMemoryVT(), LD->getMemOperand());
|
||||||
|
return DAG.getNode(PPCISD::FP_EXTEND_LH, dl, MVT::v2f64, NewLd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
llvm_unreachable("ERROR:Should return for all cases within swtich.");
|
||||||
|
}
|
||||||
|
|
||||||
/// LowerOperation - Provide custom lowering hooks for some operations.
|
/// LowerOperation - Provide custom lowering hooks for some operations.
|
||||||
///
|
///
|
||||||
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||||
@ -9661,6 +9717,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
|
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
|
||||||
case ISD::MUL: return LowerMUL(Op, DAG);
|
case ISD::MUL: return LowerMUL(Op, DAG);
|
||||||
case ISD::ABS: return LowerABS(Op, DAG);
|
case ISD::ABS: return LowerABS(Op, DAG);
|
||||||
|
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
|
||||||
|
|
||||||
// For counter-based loop handling.
|
// For counter-based loop handling.
|
||||||
case ISD::INTRINSIC_W_CHAIN: return SDValue();
|
case ISD::INTRINSIC_W_CHAIN: return SDValue();
|
||||||
|
@ -404,6 +404,9 @@ namespace llvm {
|
|||||||
/// representation.
|
/// representation.
|
||||||
QBFLT,
|
QBFLT,
|
||||||
|
|
||||||
|
/// Custom extend v4f32 to v2f64.
|
||||||
|
FP_EXTEND_LH,
|
||||||
|
|
||||||
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
|
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
|
||||||
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
|
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
|
||||||
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
|
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
|
||||||
@ -445,6 +448,10 @@ namespace llvm {
|
|||||||
/// an xxswapd.
|
/// an xxswapd.
|
||||||
LXVD2X,
|
LXVD2X,
|
||||||
|
|
||||||
|
/// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
|
||||||
|
/// v2f32 value into the lower half of a VSR register.
|
||||||
|
LD_VSX_LH,
|
||||||
|
|
||||||
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
|
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
|
||||||
/// Maps directly to an stxvd2x instruction that will be preceded by
|
/// Maps directly to an stxvd2x instruction that will be preceded by
|
||||||
/// an xxswapd.
|
/// an xxswapd.
|
||||||
@ -1021,6 +1028,7 @@ namespace llvm {
|
|||||||
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
|
||||||
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
|
||||||
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
|
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
|
||||||
|
@ -53,6 +53,15 @@ def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
|
|||||||
def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
|
def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
|
||||||
let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
|
let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Type profile used by PPCldvsxlh: one result constrained to v4f32 and one
// operand constrained to the pointer type.
def SDT_PPCldvsxlh : SDTypeProfile<1, 1, [
|
||||||
|
SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
|
||||||
|
]>;
|
||||||
|
|
||||||
|
// Type profile used by PPCfpextlh: one v2f64 result produced from one v4f32
// operand.
def SDT_PPCfpextlh : SDTypeProfile<1, 1, [
|
||||||
|
SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>
|
||||||
|
]>;
|
||||||
|
|
||||||
// Little-endian-specific nodes.
|
// Little-endian-specific nodes.
|
||||||
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
|
def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
|
||||||
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
|
SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
|
||||||
@ -84,6 +93,10 @@ def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
|
|||||||
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
|
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
|
||||||
def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
|
def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
|
||||||
|
|
||||||
|
// Selection DAG node for PPCISD::FP_EXTEND_LH; declared with an empty
// property list.
def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>;
|
||||||
|
// Selection DAG node for PPCISD::LD_VSX_LH. It is declared as having a chain,
// possibly loading, and carrying a memory operand.
def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
|
||||||
|
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
|
||||||
|
|
||||||
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
|
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
|
||||||
string asmstr, InstrItinClass itin, Intrinsic Int,
|
string asmstr, InstrItinClass itin, Intrinsic Int,
|
||||||
ValueType OutTy, ValueType InTy> {
|
ValueType OutTy, ValueType InTy> {
|
||||||
@ -1060,6 +1073,8 @@ def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
|
|||||||
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
|
def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
|
||||||
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
|
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
|
||||||
|
|
||||||
|
// Select PPCfpextlh on a v4f32 value as XXMRGHW of the value with itself,
// fed into XVCVSPDP.
def : Pat<(v2f64 (PPCfpextlh v4f32:$C)), (XVCVSPDP (XXMRGHW $C, $C))>;
|
||||||
|
|
||||||
// Loads.
|
// Loads.
|
||||||
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
|
let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
|
||||||
def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
|
def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
|
||||||
@ -3266,6 +3281,10 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
|
|||||||
def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
|
def : Pat<(f32 (fpround (f64 (extloadf32 ixaddr:$src)))),
|
||||||
(f32 (DFLOADf32 ixaddr:$src))>;
|
(f32 (DFLOADf32 ixaddr:$src))>;
|
||||||
|
|
||||||
|
// Select PPCldvsxlh as XFLOADf64 for xaddr operands and as DFLOADf64 for
// ixaddr operands; in both cases the result is copied to the VSRC class.
def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
|
||||||
|
(COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
|
||||||
|
def : Pat<(v4f32 (PPCldvsxlh ixaddr:$src)),
|
||||||
|
(COPY_TO_REGCLASS (DFLOADf64 ixaddr:$src), VSRC)>;
|
||||||
|
|
||||||
let AddedComplexity = 400 in {
|
let AddedComplexity = 400 in {
|
||||||
// The following pseudoinstructions are used to ensure the utilization
|
// The following pseudoinstructions are used to ensure the utilization
|
||||||
|
77
test/CodeGen/PowerPC/reduce_scalarization.ll
Normal file
77
test/CodeGen/PowerPC/reduce_scalarization.ll
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
|
||||||
|
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \
|
||||||
|
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
|
||||||
|
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
|
||||||
|
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names \
|
||||||
|
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
define dso_local <2 x double> @test1(<2 x float>* nocapture readonly %Ptr) {
|
||||||
|
; CHECK-LABEL: test1:
|
||||||
|
; CHECK: # %bb.0: # %entry
|
||||||
|
; CHECK-NEXT: lfd f0, 0(r3)
|
||||||
|
; CHECK-NEXT: xxmrghw vs0, vs0, vs0
|
||||||
|
; CHECK-NEXT: xvcvspdp v2, vs0
|
||||||
|
; CHECK-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%0 = load <2 x float>, <2 x float>* %Ptr, align 8
|
||||||
|
%1 = fpext <2 x float> %0 to <2 x double>
|
||||||
|
ret <2 x double> %1
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
define dso_local <2 x double> @test2(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
|
||||||
|
; CHECK-LABEL: test2:
|
||||||
|
; CHECK: # %bb.0: # %entry
|
||||||
|
; CHECK-NEXT: lfd f0, 0(r4)
|
||||||
|
; CHECK-NEXT: lfd f1, 0(r3)
|
||||||
|
; CHECK-NEXT: xvsubsp vs0, vs1, vs0
|
||||||
|
; CHECK-NEXT: xxmrghw vs0, vs0, vs0
|
||||||
|
; CHECK-NEXT: xvcvspdp v2, vs0
|
||||||
|
; CHECK-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%0 = load <2 x float>, <2 x float>* %a, align 8
|
||||||
|
%1 = load <2 x float>, <2 x float>* %b, align 8
|
||||||
|
%sub = fsub <2 x float> %0, %1
|
||||||
|
%2 = fpext <2 x float> %sub to <2 x double>
|
||||||
|
ret <2 x double> %2
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
define dso_local <2 x double> @test3(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
|
||||||
|
; CHECK-LABEL: test3:
|
||||||
|
; CHECK: # %bb.0: # %entry
|
||||||
|
; CHECK-NEXT: lfd f0, 0(r4)
|
||||||
|
; CHECK-NEXT: lfd f1, 0(r3)
|
||||||
|
; CHECK-NEXT: xvaddsp vs0, vs1, vs0
|
||||||
|
; CHECK-NEXT: xxmrghw vs0, vs0, vs0
|
||||||
|
; CHECK-NEXT: xvcvspdp v2, vs0
|
||||||
|
; CHECK-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%0 = load <2 x float>, <2 x float>* %a, align 8
|
||||||
|
%1 = load <2 x float>, <2 x float>* %b, align 8
|
||||||
|
%sub = fadd <2 x float> %0, %1
|
||||||
|
%2 = fpext <2 x float> %sub to <2 x double>
|
||||||
|
ret <2 x double> %2
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
; Function Attrs: norecurse nounwind readonly
|
||||||
|
define dso_local <2 x double> @test4(<2 x float>* nocapture readonly %a, <2 x float>* nocapture readonly %b) {
|
||||||
|
; CHECK-LABEL: test4:
|
||||||
|
; CHECK: # %bb.0: # %entry
|
||||||
|
; CHECK-NEXT: lfd f0, 0(r4)
|
||||||
|
; CHECK-NEXT: lfd f1, 0(r3)
|
||||||
|
; CHECK-NEXT: xvmulsp vs0, vs1, vs0
|
||||||
|
; CHECK-NEXT: xxmrghw vs0, vs0, vs0
|
||||||
|
; CHECK-NEXT: xvcvspdp v2, vs0
|
||||||
|
; CHECK-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%0 = load <2 x float>, <2 x float>* %a, align 8
|
||||||
|
%1 = load <2 x float>, <2 x float>* %b, align 8
|
||||||
|
%sub = fmul <2 x float> %0, %1
|
||||||
|
%2 = fpext <2 x float> %sub to <2 x double>
|
||||||
|
ret <2 x double> %2
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user