1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

[ARM, AArch64] Match additional patterns to ldN instructions

When matching an interleaved load to an ldN pattern, the interleaved access
pass checks that all users of the load are shuffles. If the load is used by an
instruction other than a shuffle, the pass gives up and an ldN is not
generated. This patch considers users of the load that are extractelement
instructions. It attempts to modify the extracts to use one of the available
shuffles rather than the load. After the transformation, the load is only used
by shuffles and will then be matched with an ldN pattern.

Differential Revision: http://reviews.llvm.org/D20250

llvm-svn: 270142
This commit is contained in:
Matthew Simpson 2016-05-19 21:39:00 +00:00
parent e86632bcd4
commit f1715d1306
5 changed files with 303 additions and 5 deletions

View File

@ -40,6 +40,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@ -65,7 +66,7 @@ class InterleavedAccess : public FunctionPass {
public:
static char ID;
InterleavedAccess(const TargetMachine *TM = nullptr)
: FunctionPass(ID), TM(TM), TLI(nullptr) {
: FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
}
@ -73,7 +74,13 @@ public:
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
}
private:
DominatorTree *DT;
const TargetMachine *TM;
const TargetLowering *TLI;
@ -84,13 +91,26 @@ private:
/// \brief Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI,
SmallVector<Instruction *, 32> &DeadInsts);
/// \brief Returns true if the uses of an interleaved load by the
/// extractelement instructions in \p Extracts can be replaced by uses of the
/// shufflevector instructions in \p Shuffles instead. If so, the necessary
/// replacements are also performed.
bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
ArrayRef<ShuffleVectorInst *> Shuffles);
};
} // end anonymous namespace.
char InterleavedAccess::ID = 0;
INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
"Lower interleaved memory accesses to target specific intrinsics",
false, false)
INITIALIZE_TM_PASS_BEGIN(
InterleavedAccess, "interleaved-access",
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_TM_PASS_END(
InterleavedAccess, "interleaved-access",
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
return new InterleavedAccess(TM);
@ -179,9 +199,18 @@ bool InterleavedAccess::lowerInterleavedLoad(
return false;
SmallVector<ShuffleVectorInst *, 4> Shuffles;
SmallVector<ExtractElementInst *, 4> Extracts;
// Check if all users of this load are shufflevectors.
// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions, we save them to later check if
// they can be modified to extract from one of the shufflevectors instead of
// the load.
for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
auto *Extract = dyn_cast<ExtractElementInst>(*UI);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
continue;
}
ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
return false;
@ -217,6 +246,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
Indices.push_back(Index);
}
// Try and modify users of the load that are extractelement instructions to
// use the shufflevector instructions instead of the load.
if (!tryReplaceExtracts(Extracts, Shuffles))
return false;
DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
@ -230,6 +264,73 @@ bool InterleavedAccess::lowerInterleavedLoad(
return true;
}
// Redirects every extractelement in \p Extracts so that it reads from one of
// the shufflevector instructions in \p Shuffles rather than from the vector it
// currently extracts from. Returns true when all extracts were redirected (or
// when there were none), and false when at least one extract has no usable
// shuffle. No IR is modified unless a replacement exists for every extract.
bool InterleavedAccess::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {

  // With no extractelement users there is nothing to rewrite.
  if (Extracts.empty())
    return true;

  // For each extractelement, the vector and the element position it will read
  // from once it has been redirected.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {

    // Lane read by this extractelement. The index operand is cast<> to a
    // ConstantInt, so it has to be a constant integer at this point.
    auto Index =
        cast<ConstantInt>(Extract->getIndexOperand())->getSExtValue();

    // Search the shuffles for one that can stand in for the original vector.
    for (auto *Shuffle : Shuffles) {

      // Only a shuffle that dominates the extract may become its operand.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);

      // Find the first mask element that selects the extracted lane.
      unsigned I = 0;
      while (I < Indices.size() && Indices[I] != Index)
        ++I;

      // This shuffle does not produce the lane; try the next one.
      if (I == Indices.size())
        continue;

      assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
             "Vector operations do not match");
      ReplacementMap[Extract] = std::make_pair(Shuffle, I);
      break;
    }

    // An extract without a usable shuffle makes the whole rewrite impossible.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Every extract has a replacement: emit the new extractelement right before
  // the old one, forward all uses to it, and delete the old instruction.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    ExtractElementInst *OldExtract = Replacement.first;
    const auto &Source = Replacement.second;
    Builder.SetInsertPoint(OldExtract);
    Value *NewExtract =
        Builder.CreateExtractElement(Source.first, Source.second);
    OldExtract->replaceAllUsesWith(NewExtract);
    OldExtract->eraseFromParent();
  }

  return true;
}
bool InterleavedAccess::lowerInterleavedStore(
StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
if (!SI->isSimple())
@ -262,6 +363,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TLI = TM->getSubtargetImpl(F)->getTargetLowering();
MaxFactor = TLI->getMaxSupportedInterleaveFactor();

View File

@ -0,0 +1,86 @@
; RUN: opt < %s -mtriple=aarch64 -interleaved-access -S | FileCheck %s
; CHECK-LABEL: @extract_user_basic(
; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
; CHECK: extractelement <4 x i32> %[[R]], i64 1
; Positive case: the wide load %L has one shuffle user and one extractelement
; user. The shuffle %S sits in the entry block, which is the only way into
; %if.then, so it dominates the extract there.
define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
; Even lanes of %L; lane 2 is element 1 of this shuffle.
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
br i1 %C, label %if.then, label %if.merge
if.then:
; Reads lane 2 of %L directly; expected to be redirected to element 1.
%E = extractelement <8 x i32> %L, i32 2
br label %if.merge
if.merge:
ret void
}
; CHECK-LABEL: @extract_user_multi(
; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
; CHECK: extractelement <4 x i32> %[[R]], i64 0
; CHECK: extractelement <4 x i32> %[[R]], i64 1
; Positive case with two extractelement users in different blocks. Both are
; reached only through the entry block, where the shuffle %S is defined, and
; both read lanes that %S selects (lane 0 -> element 0, lane 2 -> element 1).
define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
br i1 %C, label %if.then, label %if.merge
if.then:
; Lane 0 of %L, i.e. element 0 of %S.
%E1 = extractelement <8 x i32> %L, i32 0
br label %if.merge
if.merge:
; Lane 2 of %L, i.e. element 1 of %S.
%E2 = extractelement <8 x i32> %L, i32 2
ret void
}
; CHECK-LABEL: @extract_user_multi_no_dom(
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; Negative case: the only shuffle %S is defined in %if.then, while %E1 lives in
; the entry block ahead of the branch. %S therefore does not dominate %E1, so
; %E1 cannot be made to use it and no interleaved-load intrinsic is expected.
define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
; Executes before %S exists.
%E1 = extractelement <8 x i32> %L, i32 0
br i1 %C, label %if.then, label %if.merge
if.then:
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; This extract alone would be replaceable (it follows %S in the same block).
%E2 = extractelement <8 x i32> %L, i32 2
br label %if.merge
if.merge:
ret void
}
; CHECK-LABEL: @extract_user_wrong_const_index(
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; Negative case: the extract reads lane 1, but the shuffle mask only selects
; the even lanes 0, 2, 4 and 6. No shuffle provides the extracted lane, so no
; interleaved-load intrinsic is expected.
define void @extract_user_wrong_const_index(<8 x i32>* %A) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lane: not present in the mask of %S.
%E = extractelement <8 x i32> %L, i32 1
ret void
}
; CHECK-LABEL: @extract_user_undef_index(
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; Negative case: the extract index is undef rather than a constant integer, so
; the extract is an unsupported user of the load and no interleaved-load
; intrinsic is expected.
define void @extract_user_undef_index(<8 x i32>* %A) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Index operand is undef.
%E = extractelement <8 x i32> %L, i32 undef
ret void
}
; CHECK-LABEL: @extract_user_var_index(
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
; Negative case: the extract index is the function argument %I, i.e. not a
; compile-time constant, so the extract is an unsupported user of the load and
; no interleaved-load intrinsic is expected.
define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Index operand is a run-time value.
%E = extractelement <8 x i32> %L, i32 %I
ret void
}

View File

@ -268,3 +268,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
; NEON-LABEL: load_factor2_with_extract_user:
; NEON: ld2 { v0.4s, v1.4s }, [x0]
; NEON: mov w0, v0.s[1]
; NONEON-LABEL: load_factor2_with_extract_user:
; NONEON-NOT: ld2
; Code-generation counterpart of the extract-user tests: a wide load used by
; an even-lane shuffle (%2) and by a direct extract of lane 2 (%3) in the same
; block. Lane 2 of the load is element 1 of the shuffle, and the function
; returns the extracted value.
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
%1 = load <8 x i32>, <8 x i32>* %a, align 8
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%3 = extractelement <8 x i32> %1, i32 2
ret i32 %3
}

View File

@ -0,0 +1,86 @@
; RUN: opt < %s -mtriple=arm-eabi -mattr=+neon -interleaved-access -S | FileCheck %s
; CHECK-LABEL: @extract_user_basic(
; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
; CHECK: extractelement <4 x i32> %[[R]], i64 1
; Positive case: the wide load %L has one shuffle user and one extractelement
; user. The shuffle %S sits in the entry block, which is the only way into
; %if.then, so it dominates the extract there.
define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
; Even lanes of %L; lane 2 is element 1 of this shuffle.
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
br i1 %C, label %if.then, label %if.merge
if.then:
; Reads lane 2 of %L directly; expected to be redirected to element 1.
%E = extractelement <8 x i32> %L, i32 2
br label %if.merge
if.merge:
ret void
}
; CHECK-LABEL: @extract_user_multi(
; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
; CHECK: extractelement <4 x i32> %[[R]], i64 0
; CHECK: extractelement <4 x i32> %[[R]], i64 1
; Positive case with two extractelement users in different blocks. Both are
; reached only through the entry block, where the shuffle %S is defined, and
; both read lanes that %S selects (lane 0 -> element 0, lane 2 -> element 1).
define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
br i1 %C, label %if.then, label %if.merge
if.then:
; Lane 0 of %L, i.e. element 0 of %S.
%E1 = extractelement <8 x i32> %L, i32 0
br label %if.merge
if.merge:
; Lane 2 of %L, i.e. element 1 of %S.
%E2 = extractelement <8 x i32> %L, i32 2
ret void
}
; CHECK-LABEL: @extract_user_multi_no_dom(
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; Negative case: the only shuffle %S is defined in %if.then, while %E1 lives in
; the entry block ahead of the branch. %S therefore does not dominate %E1, so
; %E1 cannot be made to use it and no interleaved-load intrinsic is expected.
define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
; Executes before %S exists.
%E1 = extractelement <8 x i32> %L, i32 0
br i1 %C, label %if.then, label %if.merge
if.then:
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; This extract alone would be replaceable (it follows %S in the same block).
%E2 = extractelement <8 x i32> %L, i32 2
br label %if.merge
if.merge:
ret void
}
; CHECK-LABEL: @extract_user_wrong_const_index(
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; Negative case: the extract reads lane 1, but the shuffle mask only selects
; the even lanes 0, 2, 4 and 6. No shuffle provides the extracted lane, so no
; interleaved-load intrinsic is expected.
define void @extract_user_wrong_const_index(<8 x i32>* %A) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lane: not present in the mask of %S.
%E = extractelement <8 x i32> %L, i32 1
ret void
}
; CHECK-LABEL: @extract_user_undef_index(
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; Negative case: the extract index is undef rather than a constant integer, so
; the extract is an unsupported user of the load and no interleaved-load
; intrinsic is expected.
define void @extract_user_undef_index(<8 x i32>* %A) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Index operand is undef.
%E = extractelement <8 x i32> %L, i32 undef
ret void
}
; CHECK-LABEL: @extract_user_var_index(
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
; Negative case: the extract index is the function argument %I, i.e. not a
; compile-time constant, so the extract is an unsupported user of the load and
; no interleaved-load intrinsic is expected.
define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
entry:
%L = load <8 x i32>, <8 x i32>* %A, align 8
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Index operand is a run-time value.
%E = extractelement <8 x i32> %L, i32 %I
ret void
}

View File

@ -304,3 +304,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
store <3 x float> %tmp1, <3 x float>* %p, align 16
ret void
}
; NEON-LABEL: load_factor2_with_extract_user:
; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
; NEON: vmov.32 r0, d16[1]
; NONEON-LABEL: load_factor2_with_extract_user:
; NONEON-NOT: vld2
; Code-generation counterpart of the extract-user tests: a wide load used by
; an even-lane shuffle (%2) and by a direct extract of lane 2 (%3) in the same
; block. Lane 2 of the load is element 1 of the shuffle, and the function
; returns the extracted value.
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
%1 = load <8 x i32>, <8 x i32>* %a, align 8
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%3 = extractelement <8 x i32> %1, i32 2
ret i32 %3
}