
Reland "[PowerPC] Implement instruction clustering for stores"

Commit 3c0b3250 introduced store fusion for the PowerPC target, but it
caused failures under the undefined-behavior sanitizer and was reverted.
This patch fixes those failures and relands the change.
Qiu Chaofan 2020-09-13 19:39:49 +08:00
parent dcb7b4b709
commit 28702b5551
8 changed files with 406 additions and 6 deletions
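
At a glance, the patch follows the standard machine-scheduler clustering
recipe: override two TargetInstrInfo hooks (getMemOperandsWithOffsetWidth and
shouldClusterMemOps) so the scheduler can describe and approve candidate
pairs, then register the generic store-cluster DAG mutation. A minimal sketch
of that wiring, with a hypothetical createMyMachineScheduler standing in for
the PowerPC code in the hunks below (ScheduleDAGMILive, GenericScheduler, and
createStoreClusterDAGMutation are existing LLVM APIs):

#include "llvm/CodeGen/MachineScheduler.h"
#include <memory>

using namespace llvm;

// Hypothetical target hook: build a pre-RA scheduler that clusters stores.
static ScheduleDAGInstrs *createMyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
  // The mutation asks TargetInstrInfo::getMemOperandsWithOffsetWidth for the
  // (base, offset, width) of each store, groups stores that share a base,
  // and calls TargetInstrInfo::shouldClusterMemOps before gluing a pair.
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}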


@@ -174,6 +174,9 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
                                               "HasAddisLoadFusion", "true",
                                               "Power8 Addis-Load fusion",
                                               [FeatureFusion]>;
+def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
+                                          "Target supports store clustering",
+                                          [FeatureFusion]>;
 def FeatureUnalignedFloats :
   SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
                    "true", "CPU does not trap on unaligned FP access">;
@@ -345,10 +348,12 @@ def ProcessorFeatures {
   // Power10
   // For P10 CPU we assume that all of the existing features from Power9
   // still exist with the exception of those we know are Power9 specific.
+  list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
   list<SubtargetFeature> P10AdditionalFeatures =
-    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+    !listconcat(FusionFeatures, [
+    DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
     FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
-    FeaturePairedVectorMemops];
+    FeaturePairedVectorMemops]);
   list<SubtargetFeature> P10SpecificFeatures = [];
   list<SubtargetFeature> P10InheritableFeatures =
     !listconcat(P9InheritableFeatures, P10AdditionalFeatures);


@@ -2222,6 +2222,112 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   return true;
 }
 
+bool PPCInstrInfo::getMemOperandsWithOffsetWidth(
+    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
+  const MachineOperand *BaseOp;
+  OffsetIsScalable = false;
+  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
+    return false;
+  BaseOps.push_back(BaseOp);
+  return true;
+}
+
+static bool isLdStSafeToCluster(const MachineInstr &LdSt,
+                                const TargetRegisterInfo *TRI) {
+  // If this is a volatile load/store, don't mess with it.
+  if (LdSt.hasOrderedMemoryRef() || LdSt.getNumExplicitOperands() != 3)
+    return false;
+  if (LdSt.getOperand(2).isFI())
+    return true;
+  assert(LdSt.getOperand(2).isReg() && "Expected a reg operand.");
+  // Can't cluster if the instruction modifies the base register
+  // or it is an update form, e.g. ld r2, 3(r2).
+  if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI))
+    return false;
+  return true;
+}
+
+// Only cluster instruction pairs that share an opcode and are clusterable
+// according to the PowerPC specification.
+static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
+                                     const PPCSubtarget &Subtarget) {
+  switch (FirstOpc) {
+  default:
+    return false;
+  case PPC::STD:
+  case PPC::STFD:
+  case PPC::STXSD:
+  case PPC::DFSTOREf64:
+    return FirstOpc == SecondOpc;
+  // The PowerPC backend has two opcodes, STW and STW8, for the "stw"
+  // instruction, to handle 32-bit and 64-bit instruction selection. They
+  // form a clusterable pair even though the opcodes differ.
+  case PPC::STW:
+  case PPC::STW8:
+    return SecondOpc == PPC::STW || SecondOpc == PPC::STW8;
+  }
+}
+
+bool PPCInstrInfo::shouldClusterMemOps(
+    ArrayRef<const MachineOperand *> BaseOps1,
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
+  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+  const MachineOperand &BaseOp1 = *BaseOps1.front();
+  const MachineOperand &BaseOp2 = *BaseOps2.front();
+  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
+         "Only base registers and frame indices are supported.");
+
+  // NumLoads is the number of loads/stores that have already been clustered.
+  // Don't cluster this memory op if at least two ops are already clustered.
+  if (NumLoads > 2)
+    return false;
+
+  // Cluster the load/store only when the two ops have the same base
+  // register or frame index.
+  if ((BaseOp1.isReg() != BaseOp2.isReg()) ||
+      (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) ||
+      (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex()))
+    return false;
+
+  // Check whether the pair is clusterable according to the PowerPC
+  // specification: the same opcode (or the STW/STW8 pair) only.
+  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
+  unsigned FirstOpc = FirstLdSt.getOpcode();
+  unsigned SecondOpc = SecondLdSt.getOpcode();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget))
+    return false;
+
+  // Can't cluster loads/stores that carry an ordered or volatile memory
+  // reference.
+  if (!isLdStSafeToCluster(FirstLdSt, TRI) ||
+      !isLdStSafeToCluster(SecondLdSt, TRI))
+    return false;
+
+  int64_t Offset1 = 0, Offset2 = 0;
+  unsigned Width1 = 0, Width2 = 0;
+  const MachineOperand *Base1 = nullptr, *Base2 = nullptr;
+  if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) ||
+      !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) ||
+      Width1 != Width2)
+    return false;
+
+  assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 &&
+         "getMemOperandWithOffsetWidth returned an incorrect base op");
+
+  // The caller should already have ordered FirstMemOp/SecondMemOp by offset.
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + Width1 == Offset2;
+}
+
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
 ///
@@ -4664,7 +4770,8 @@ bool PPCInstrInfo::getMemOperandWithOffsetWidth(
     return false;
 
   // Handle only loads/stores with base register followed by immediate offset.
   if (LdSt.getNumExplicitOperands() != 3)
     return false;
-  if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg())
+  if (!LdSt.getOperand(1).isImm() ||
+      (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
     return false;
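
The final test in shouldClusterMemOps above is a pure adjacency check on a
pair the mutation has already sorted by offset. A standalone illustration of
just that predicate (adjacent is a made-up helper, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors the tail of shouldClusterMemOps: widths must match, and the first
// access must end exactly where the second begins.
static bool adjacent(int64_t Offset1, unsigned Width1,
                     int64_t Offset2, unsigned Width2) {
  return Width1 == Width2 && Offset1 + Width1 == Offset2;
}

int main() {
  assert(adjacent(8, 8, 16, 8));  // two STDs at 8(r3) and 16(r3): cluster
  assert(!adjacent(8, 8, 24, 8)); // a one-doubleword gap: no cluster
  assert(!adjacent(0, 8, 8, 4));  // width mismatch: no cluster
  return 0;
}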


@@ -494,6 +494,19 @@ public:
                                      int64_t &Offset, unsigned &Width,
                                      const TargetRegisterInfo *TRI) const;
 
+  /// Get the base operand and byte offset of an instruction that
+  /// reads/writes memory.
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
+
+  /// Returns true if the two given memory operations should be scheduled
+  /// adjacent.
+  bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           ArrayRef<const MachineOperand *> BaseOps2,
+                           unsigned NumLoads, unsigned NumBytes) const override;
+
   /// Return true if two MIs access different memory addresses and false
   /// otherwise
   bool


@@ -120,6 +120,7 @@ void PPCSubtarget::initializeEnvironment() {
   HasHTM = false;
   HasFloat128 = false;
   HasFusion = false;
+  HasStoreFusion = false;
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;


@@ -140,6 +140,7 @@ protected:
   bool HasHTM;
   bool HasFloat128;
   bool HasFusion;
+  bool HasStoreFusion;
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
 
@@ -317,6 +318,7 @@ public:
   bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
+  bool hasStoreFusion() const { return HasStoreFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
   bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
   bool needsSwapsForVSXMemOps() const {


@@ -278,6 +278,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
                            std::make_unique<GenericScheduler>(C));
   // add DAG Mutations here.
   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.hasStoreFusion())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
 
@@ -292,6 +294,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
       std::make_unique<PPCPostRASchedStrategy>(C) :
       std::make_unique<PostGenericScheduler>(C), true);
   // add DAG Mutations here.
+  if (ST.hasStoreFusion())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
   return DAG;


@@ -0,0 +1,268 @@
; Test whether several consecutive loads/stores can be clustered (fused) by the
; scheduler. The scheduler prints "Cluster ld/st SU(x) - SU(y)" when SU(x) and
; SU(y) are fused. Each function is checked in both the pre-RA and post-RA
; machine-scheduler runs, hence the two "MI Scheduling" blocks per function.
; REQUIRES: asserts
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 \
; RUN: -mattr=-paired-vector-memops,-pcrelative-memops -verify-misched \
; RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s
define i64 @store_i64(i64* nocapture %P, i64 %v) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i64:%bb.0
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24
; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16
; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8
; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i64:%bb.0
; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 16
; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 8
; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 24
; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32
%arrayidx = getelementptr inbounds i64, i64* %P, i64 3
store i64 %v, i64* %arrayidx
%arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
store i64 %v, i64* %arrayidx1
%arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
store i64 %v, i64* %arrayidx2
%arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
store i64 %v, i64* %arrayidx3
ret i64 %v
}
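; Illustrative note (not part of the committed test): the mutation sorts the
; four stores by offset before pairing, and a cluster holds at most two ops,
; so the program-order offsets 24, 16, 8, 32 (SU(2)..SU(5)) sort to
; 8, 16, 24, 32 and pair up as (8, 16) = SU(4)+SU(3) and
; (24, 32) = SU(2)+SU(5), matching the Cluster lines checked above.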
define i32 @store_i32(i32* nocapture %P, i32 %v) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32:%bb.0
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, 52
; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, 48
; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, 44
; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, 56
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32:%bb.0
; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], 48
; CHECK: SU([[SU1]]): STW renamable $r[[REG]], 44
; CHECK: SU([[SU2]]): STW renamable $r[[REG]], 52
; CHECK: SU([[SU3]]): STW renamable $r[[REG]], 56
%arrayidx = getelementptr inbounds i32, i32* %P, i32 13
store i32 %v, i32* %arrayidx
%arrayidx1 = getelementptr inbounds i32, i32* %P, i32 12
store i32 %v, i32* %arrayidx1
%arrayidx2 = getelementptr inbounds i32, i32* %P, i32 11
store i32 %v, i32* %arrayidx2
%arrayidx3 = getelementptr inbounds i32, i32* %P, i32 14
store i32 %v, i32* %arrayidx3
ret i32 %v
}
define void @store_i64_neg(i64* nocapture %P, i64 %v) #0 {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i64_neg:%bb.0
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, -24
; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, -8
; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, -16
; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, -32
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i64_neg:%bb.0
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], -8
; CHECK: SU([[SU1]]): STD renamable $x[[REG]], -16
; CHECK: SU([[SU2]]): STD renamable $x[[REG]], -24
; CHECK: SU([[SU3]]): STD renamable $x[[REG]], -32
%arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
store i64 %v, i64* %arrayidx
%arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
store i64 %v, i64* %arrayidx1
%arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
store i64 %v, i64* %arrayidx2
%arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
store i64 %v, i64* %arrayidx3
ret void
}
define void @store_i32_neg(i32* nocapture %P, i32 %v) #0 {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_neg:%bb.0
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, -12
; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, -4
; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, -8
; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, -16
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_neg:%bb.0
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], -4
; CHECK: SU([[SU1]]): STW renamable $r[[REG]], -8
; CHECK: SU([[SU2]]): STW renamable $r[[REG]], -12
; CHECK: SU([[SU3]]): STW renamable $r[[REG]], -16
%arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
store i32 %v, i32* %arrayidx
%arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
store i32 %v, i32* %arrayidx1
%arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
store i32 %v, i32* %arrayidx2
%arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
store i32 %v, i32* %arrayidx3
ret void
}
define void @store_double(double* nocapture %P, double %v) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_double:%bb.0
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: SU([[SU2]]): DFSTOREf64 %[[REG:[0-9]+]]:vsfrc, 24
; CHECK: SU([[SU3]]): DFSTOREf64 %[[REG]]:vsfrc, 8
; CHECK: SU([[SU4]]): DFSTOREf64 %[[REG]]:vsfrc, 16
; CHECK: SU([[SU5]]): DFSTOREf64 %[[REG]]:vsfrc, 32
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_double:%bb.0
; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: SU([[SU0]]): STFD renamable $f[[REG:[0-9]+]], 8
; CHECK: SU([[SU1]]): STFD renamable $f[[REG]], 16
; CHECK: SU([[SU2]]): STFD renamable $f[[REG]], 24
; CHECK: SU([[SU3]]): STFD renamable $f[[REG]], 32
%arrayidx = getelementptr inbounds double, double* %P, i64 3
store double %v, double* %arrayidx
%arrayidx1 = getelementptr inbounds double, double* %P, i64 1
store double %v, double* %arrayidx1
%arrayidx2 = getelementptr inbounds double, double* %P, i64 2
store double %v, double* %arrayidx2
%arrayidx3 = getelementptr inbounds double, double* %P, i64 4
store double %v, double* %arrayidx3
ret void
}
define void @store_float(float* nocapture %P, float %v) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_float:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK-NOT: Cluster ld/st
; CHECK: SU([[SU2]]): DFSTOREf32 %[[REG:[0-9]+]]:vssrc, 12
; CHECK: SU([[SU3]]): DFSTOREf32 %[[REG]]:vssrc, 4
; CHECK: SU([[SU4]]): DFSTOREf32 %[[REG]]:vssrc, 8
; CHECK: SU([[SU5]]): DFSTOREf32 %[[REG]]:vssrc, 16
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_float:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK-NOT: Cluster ld/st
; CHECK: SU([[SU0]]): STFS renamable $f[[REG:[0-9]+]], 12
; CHECK: SU([[SU1]]): STFS renamable $f[[REG]], 4
; CHECK: SU([[SU2]]): STFS renamable $f[[REG]], 8
; CHECK: SU([[SU3]]): STFS renamable $f[[REG]], 16
%arrayidx = getelementptr inbounds float, float* %P, i64 3
store float %v, float* %arrayidx
%arrayidx1 = getelementptr inbounds float, float* %P, i64 1
store float %v, float* %arrayidx1
%arrayidx2 = getelementptr inbounds float, float* %P, i64 2
store float %v, float* %arrayidx2
%arrayidx3 = getelementptr inbounds float, float* %P, i64 4
store float %v, float* %arrayidx3
ret void
}
; Stores/loads cannot be fused if there is a volatile access in between.
define i64 @store_volatile(i64* nocapture %P, i64 %v) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_volatile:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24
; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16
; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8
; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_volatile:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 24
; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 16
; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 8
; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32
%arrayidx = getelementptr inbounds i64, i64* %P, i64 3
store volatile i64 %v, i64* %arrayidx
%arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
store volatile i64 %v, i64* %arrayidx1
%arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
store volatile i64 %v, i64* %arrayidx2
%arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
store volatile i64 %v, i64* %arrayidx3
ret i64 %v
}
@p = common local_unnamed_addr global [100 x i32] zeroinitializer, align 4
define void @store_i32_stw_stw8(i32 signext %m, i32 signext %n) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_stw_stw8:%bb.0
; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU8:[0-9]+]])
; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 24
; CHECK: SU([[SU8]]): STW %{{[0-9]+}}:gprc, 20
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_stw_stw8:%bb.0
; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU6:[0-9]+]])
; CHECK: SU([[SU5]]): STW8 renamable $x{{[0-9]+}}, 24
; CHECK: SU([[SU6]]): STW renamable $r{{[0-9]+}}, 20
store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
%add = add nsw i32 %n, %m
store i32 %add, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 5), align 4
ret void
}
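; Illustrative note (not part of the committed test): the cluster above pairs
; STW8 (the store to p[6], byte offset 24) with STW (the store to p[5],
; offset 20), two adjacent 4-byte slots, exercising the mixed STW/STW8 rule
; in isClusterableLdStOpcPair. The store to p[7] (offset 28) stays unpaired
; because each cluster is capped at two ops.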
define void @store_i32_stw8(i32 signext %m, i32 signext %n) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_stw8:%bb.0
; CHECK: Cluster ld/st SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]])
; CHECK: SU([[SU4]]): STW8 %{{[0-9]+}}:g8rc, 24
; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 28
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_i32_stw8:%bb.0
; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
; CHECK: SU([[SU3]]): STW8 renamable $x{{[0-9]+}}, 24
; CHECK: SU([[SU4]]): STW8 renamable $x{{[0-9]+}}, 28
store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
ret void
}
declare void @bar(i64*)
define void @store_frame_index(i32 %a, i32 %b) {
entry:
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: store_frame_index:%bb.0
; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
; CHECK: SU([[SU2]]): STD %{{[0-9]+}}:g8rc, 0, %stack.0.buf
; CHECK: SU([[SU3]]): STD %{{[0-9]+}}:g8rc, 8, %stack.0.buf
%buf = alloca [8 x i64], align 8
%0 = bitcast [8 x i64]* %buf to i8*
%conv = zext i32 %a to i64
%arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 0
store i64 %conv, i64* %arrayidx, align 8
%conv1 = zext i32 %b to i64
%arrayidx2 = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 1
store i64 %conv1, i64* %arrayidx2, align 8
call void @bar(i64* nonnull %arrayidx)
ret void
}


@@ -104,15 +104,15 @@ define dso_local signext i32 @X2IsCallerSaved(i32 signext %a, i32 signext %b, i3
 ; CHECK-P9-NOT: .localentry
 ; CHECK-ALL: # %bb.0: # %entry
 ; CHECK-S-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-S-NEXT: add r11, r4, r3
 ; CHECK-S-NEXT: sub r29, r8, r9
 ; CHECK-S-NEXT: add r9, r10, r9
 ; CHECK-S-NEXT: sub r10, r10, r3
+; CHECK-S-NEXT: mullw r3, r4, r3
 ; CHECK-S-NEXT: sub r12, r4, r5
 ; CHECK-S-NEXT: add r0, r6, r5
 ; CHECK-S-NEXT: sub r2, r6, r7
-; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-S-NEXT: mullw r3, r4, r3
 ; CHECK-S-NEXT: add r30, r8, r7
 ; CHECK-S-NEXT: mullw r3, r3, r11
 ; CHECK-S-NEXT: mullw r3, r3, r5