mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[P10] [Power PC] Exploiting new load rightmost vector element instructions.
This pull request implements patterns to exploit the load rightmost vector element instructions for loading element 0 on little endian PowerPC subtargets into v8i16 and v16i8 vector registers for i16 and i8 data types. Differential Revision: https://reviews.llvm.org/D94816#inline-921403
This commit is contained in:
parent
0ac1e53c7e
commit
b9ad035427
@ -2563,6 +2563,11 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
|
||||
(STXVRDX $src, xoaddr:$dst)>;
|
||||
def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst),
|
||||
(STXVRDX $src, xoaddr:$dst)>;
|
||||
// Load element 0 of a VSX register from memory
|
||||
def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 xoaddr:$src)))),
|
||||
(v8i16 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VSRC))>;
|
||||
def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 xoaddr:$src)))),
|
||||
(v16i8 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VSRC))>;
|
||||
}
|
||||
|
||||
// FIXME: The swap is overkill when the shift amount is a constant.
|
||||
|
@ -152,6 +152,7 @@ def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">;
|
||||
def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">;
|
||||
def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
|
||||
def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
|
||||
def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
|
||||
|
||||
//--------------------- VSX-specific instruction formats ---------------------//
|
||||
// By default, all VSX instructions are to be selected over their Altivec
|
||||
@ -2437,6 +2438,8 @@ def MrgWords {
|
||||
// [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
|
||||
// [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
|
||||
// [HasVSX, HasP9Vector]
|
||||
// [HasVSX, HasP9Vector, NoP10Vector]
|
||||
// [HasVSX, HasP9Vector, IsBigEndian]
|
||||
// [HasVSX, HasP9Vector, IsBigEndian, IsPPC64]
|
||||
// [HasVSX, HasP9Vector, IsLittleEndian]
|
||||
// [HasVSX, HasP9Altivec]
|
||||
@ -3735,9 +3738,6 @@ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
|
||||
(STXVX $rS, xoaddr:$dst)>;
|
||||
|
||||
// Build vectors from i8 loads
|
||||
defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src)),
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
|
||||
defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
|
||||
(VSPLTHs 3, (LXSIBZX xoaddr:$src)),
|
||||
(VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
|
||||
@ -3755,9 +3755,6 @@ defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
|
||||
(XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
|
||||
|
||||
// Build vectors from i16 loads
|
||||
defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src)),
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
|
||||
defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
|
||||
(XXSPLTWs (LXSIHZX xoaddr:$src), 1),
|
||||
(XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
|
||||
@ -3955,6 +3952,38 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
|
||||
(v4i32 (LXVWSX xoaddr:$A))>;
|
||||
} // HasVSX, HasP9Vector
|
||||
|
||||
// Any Power9 VSX subtarget with equivalent length but better Power10 VSX
|
||||
// patterns.
|
||||
// Two identical blocks are required due to the slightly different predicates:
|
||||
// One excludes subtargets with P10 instructions, the other is big endian only and also covers subtargets with P10 instructions.
|
||||
let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in {
|
||||
// Little endian Power10 subtargets produce a shorter pattern but require a
|
||||
// COPY_TO_REGCLASS. The COPY_TO_REGCLASS makes it appear to need two instructions
|
||||
// to perform the operation, when only one instruction is produced in practice.
|
||||
// The NoP10Vector predicate excludes these patterns from Power10 VSX subtargets.
|
||||
defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src)),
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
|
||||
// Build vectors from i16 loads
|
||||
defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src)),
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
|
||||
} // HasVSX, HasP9Vector, NoP10Vector
|
||||
|
||||
// Any big endian Power9 VSX subtarget
|
||||
let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
|
||||
// Power10 VSX subtargets produce a shorter pattern for little endian targets
|
||||
// but this is still the best pattern for Power9 and Power10 VSX big endian subtargets.
|
||||
// Build vectors from i8 loads
|
||||
defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src)),
|
||||
(VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
|
||||
// Build vectors from i16 loads
|
||||
defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src)),
|
||||
(VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
|
||||
} // HasVSX, HasP9Vector, IsBigEndian
|
||||
|
||||
// Big endian 64Bit Power9 subtarget.
|
||||
let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
|
||||
def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
|
||||
|
63
test/CodeGen/PowerPC/load-rightmost-vector-elt.ll
Normal file
63
test/CodeGen/PowerPC/load-rightmost-vector-elt.ll
Normal file
@ -0,0 +1,63 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
|
||||
; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
|
||||
; RUN: < %s | FileCheck %s --check-prefix=CHECK-P10LE
|
||||
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
|
||||
; RUN: -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
|
||||
; RUN: < %s | FileCheck %s --check-prefix=CHECK-P10BE
|
||||
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
|
||||
; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
|
||||
; RUN: < %s | FileCheck %s --check-prefix=CHECK-P9
|
||||
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
|
||||
; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
|
||||
; RUN: < %s | FileCheck %s --check-prefix=CHECK-P9
|
||||
|
||||
define <8 x i16> @test1(i16* %a) {
|
||||
; CHECK-P10LE-LABEL: test1:
|
||||
; CHECK-P10LE: # %bb.0: # %entry
|
||||
; CHECK-P10LE-NEXT: lxvrhx v2, 0, r3
|
||||
; CHECK-P10LE-NEXT: blr
|
||||
;
|
||||
; CHECK-P10BE-LABEL: test1:
|
||||
; CHECK-P10BE: # %bb.0: # %entry
|
||||
; CHECK-P10BE-NEXT: lxsihzx v2, 0, r3
|
||||
; CHECK-P10BE-NEXT: vsplth v2, v2, 3
|
||||
; CHECK-P10BE-NEXT: blr
|
||||
;
|
||||
; CHECK-P9-LABEL: test1:
|
||||
; CHECK-P9: # %bb.0: # %entry
|
||||
; CHECK-P9-NEXT: lxsihzx v2, 0, r3
|
||||
; CHECK-P9-NEXT: vsplth v2, v2, 3
|
||||
; CHECK-P9-NEXT: blr
|
||||
entry:
|
||||
%0 = load i16, i16* %a, align 2
|
||||
%vecinit = insertelement <8 x i16> undef, i16 %0, i32 0
|
||||
ret <8 x i16> %vecinit
|
||||
}
|
||||
|
||||
define <16 x i8> @test2(i8* %a) {
|
||||
; CHECK-P10LE-LABEL: test2:
|
||||
; CHECK-P10LE: # %bb.0: # %entry
|
||||
; CHECK-P10LE-NEXT: lxvrbx v2, 0, r3
|
||||
; CHECK-P10LE-NEXT: blr
|
||||
;
|
||||
; CHECK-P10BE-LABEL: test2:
|
||||
; CHECK-P10BE: # %bb.0: # %entry
|
||||
; CHECK-P10BE-NEXT: lxsibzx v2, 0, r3
|
||||
; CHECK-P10BE-NEXT: vspltb v2, v2, 7
|
||||
; CHECK-P10BE-NEXT: blr
|
||||
;
|
||||
; CHECK-P9-LABEL: test2:
|
||||
; CHECK-P9: # %bb.0: # %entry
|
||||
; CHECK-P9-NEXT: lxsibzx v2, 0, r3
|
||||
; CHECK-P9-NEXT: vspltb v2, v2, 7
|
||||
; CHECK-P9-NEXT: blr
|
||||
entry:
|
||||
%0 = load i8, i8* %a, align 1
|
||||
%vecins = insertelement <16 x i8> undef, i8 %0, i32 0
|
||||
ret <16 x i8> %vecins
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user