mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 11:42:57 +01:00
Enabling the generation of dependency breakers for partial updates on Cortex-A15. Also fixing a small bug in getting the update clearance for VLD1LNd32.
llvm-svn: 178134
This commit is contained in:
parent
d52d76ad2e
commit
bd61af84a7
@ -3734,9 +3734,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
|
||||
if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
|
||||
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
|
||||
|
||||
// A9-like cores are particularly picky about mixing the two and want these
|
||||
// CortexA9 is particularly picky about mixing the two and wants these
|
||||
// converted.
|
||||
if (Subtarget.isLikeA9() && !isPredicated(MI) &&
|
||||
if (Subtarget.isCortexA9() && !isPredicated(MI) &&
|
||||
(MI->getOpcode() == ARM::VMOVRS ||
|
||||
MI->getOpcode() == ARM::VMOVSR ||
|
||||
MI->getOpcode() == ARM::VMOVS))
|
||||
@ -4023,14 +4023,12 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
|
||||
// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
|
||||
//
|
||||
// FCONSTD can be used as a dependency-breaking instruction.
|
||||
|
||||
|
||||
unsigned ARMBaseInstrInfo::
|
||||
getPartialRegUpdateClearance(const MachineInstr *MI,
|
||||
unsigned OpNum,
|
||||
const TargetRegisterInfo *TRI) const {
|
||||
// Only Swift and Cortex-A15 have partial register update problems.
|
||||
if (!SwiftPartialUpdateClearance || !Subtarget.isSwift())
|
||||
if (!SwiftPartialUpdateClearance ||
|
||||
!(Subtarget.isSwift() || Subtarget.isCortexA15()))
|
||||
return 0;
|
||||
|
||||
assert(TRI && "Need TRI instance");
|
||||
@ -4056,7 +4054,7 @@ getPartialRegUpdateClearance(const MachineInstr *MI,
|
||||
|
||||
// Explicitly reads the dependency.
|
||||
case ARM::VLD1LNd32:
|
||||
UseOp = 1;
|
||||
UseOp = 3;
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
|
@ -185,8 +185,7 @@ bool ARMPassConfig::addPreSched2() {
|
||||
addPass(createARMLoadStoreOptimizationPass());
|
||||
printAndVerify("After ARM load / store optimizer");
|
||||
}
|
||||
if ((DisableA15SDOptimization || !getARMSubtarget().isCortexA15()) &&
|
||||
getARMSubtarget().hasNEON())
|
||||
if (getARMSubtarget().hasNEON())
|
||||
addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
|
||||
}
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
; CHECK-DISABLED: t1:
|
||||
define <2 x float> @t1(float %f) {
|
||||
; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0]
|
||||
; CHECK-DISABLED: vmov.32 d0[1], r{{.}}
|
||||
; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0]
|
||||
%i1 = insertelement <2 x float> undef, float %f, i32 1
|
||||
%i2 = fadd <2 x float> %i1, %i1
|
||||
ret <2 x float> %i2
|
||||
@ -15,7 +15,7 @@ define <2 x float> @t1(float %f) {
|
||||
; CHECK-DISABLED: t2:
|
||||
define <4 x float> @t2(float %g, float %f) {
|
||||
; CHECK-ENABLED: vdup.32 q{{[0-9]*}}, d0[0]
|
||||
; CHECK-DISABLED: vmov.32 d0[1], r{{.}}
|
||||
; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0]
|
||||
%i1 = insertelement <4 x float> undef, float %f, i32 1
|
||||
%i2 = fadd <4 x float> %i1, %i1
|
||||
ret <4 x float> %i2
|
||||
@ -25,6 +25,7 @@ define <4 x float> @t2(float %g, float %f) {
|
||||
; CHECK-DISABLED: t3:
|
||||
define arm_aapcs_vfpcc <2 x float> @t3(float %f) {
|
||||
; CHECK-ENABLED: vdup.32 d{{[0-9]*}}, d0[0]
|
||||
; CHECK-DISABLED-NOT: vdup.32 d{{[0-9]*}}, d0[0]
|
||||
%i1 = insertelement <2 x float> undef, float %f, i32 1
|
||||
%i2 = fadd <2 x float> %i1, %i1
|
||||
ret <2 x float> %i2
|
||||
|
38
test/CodeGen/ARM/a15-partial-update.ll
Normal file
38
test/CodeGen/ARM/a15-partial-update.ll
Normal file
@ -0,0 +1,38 @@
|
||||
; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK: t1:
|
||||
define <2 x float> @t1(float* %A, <2 x float> %B) {
|
||||
; The generated code for this test uses a vld1.32 instruction
|
||||
; to write the lane 1 of a D register containing the value of
|
||||
; <2 x float> %B. Since the D register is defined, it would
|
||||
; be incorrect to fully write it (with a vmov.f64) before the
|
||||
; vld1.32 instruction. The test checks that a vmov.f64 was not
|
||||
; generated.
|
||||
|
||||
; CHECK-NOT: vmov.{{.*}} d{{[0-9]+}},
|
||||
%tmp2 = load float* %A, align 4
|
||||
%tmp3 = insertelement <2 x float> %B, float %tmp2, i32 1
|
||||
ret <2 x float> %tmp3
|
||||
}
|
||||
|
||||
; CHECK: t2:
|
||||
define void @t2(<4 x i8> *%in, <4 x i8> *%out, i32 %n) {
|
||||
entry:
|
||||
br label %loop
|
||||
loop:
|
||||
; The code generated by this test uses a vld1.32 instruction.
|
||||
; We check that a dependency breaking vmov* instruction was
|
||||
; generated.
|
||||
|
||||
; CHECK: vmov.{{.*}} d{{[0-9]+}},
|
||||
%oldcount = phi i32 [0, %entry], [%newcount, %loop]
|
||||
%newcount = add i32 %oldcount, 1
|
||||
%p1 = getelementptr <4 x i8> *%in, i32 %newcount
|
||||
%p2 = getelementptr <4 x i8> *%out, i32 %newcount
|
||||
%tmp1 = load <4 x i8> *%p1, align 4
|
||||
store <4 x i8> %tmp1, <4 x i8> *%p2
|
||||
%cmp = icmp eq i32 %newcount, %n
|
||||
br i1 %cmp, label %loop, label %ret
|
||||
ret:
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user