mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 02:52:53 +02:00
[ARM, AArch64] Match additional patterns to ldN instructions
When matching an interleaved load to an ldN pattern, the interleaved access pass checks that all users of the load are shuffles. If the load is used by an instruction other than a shuffle, the pass gives up and an ldN is not generated. This patch also considers users of the load that are extractelement instructions. It attempts to modify the extracts to use one of the available shuffles rather than the load. After the transformation, the load is only used by shuffles and will then be matched with an ldN pattern.
Differential Revision: http://reviews.llvm.org/D20250
llvm-svn: 270142
This commit is contained in:
parent
e86632bcd4
commit
f1715d1306
@ -40,6 +40,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/IR/InstIterator.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
@ -65,7 +66,7 @@ class InterleavedAccess : public FunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
InterleavedAccess(const TargetMachine *TM = nullptr)
|
||||
: FunctionPass(ID), TM(TM), TLI(nullptr) {
|
||||
: FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
|
||||
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
@ -73,7 +74,13 @@ public:
|
||||
|
||||
bool runOnFunction(Function &F) override;
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
}
|
||||
|
||||
private:
|
||||
DominatorTree *DT;
|
||||
const TargetMachine *TM;
|
||||
const TargetLowering *TLI;
|
||||
|
||||
@ -84,13 +91,26 @@ private:
|
||||
/// \brief Transform an interleaved store into target specific intrinsics.
|
||||
bool lowerInterleavedStore(StoreInst *SI,
|
||||
SmallVector<Instruction *, 32> &DeadInsts);
|
||||
|
||||
/// \brief Returns true if the uses of an interleaved load by the
|
||||
/// extractelement instructions in \p Extracts can be replaced by uses of the
|
||||
/// shufflevector instructions in \p Shuffles instead. If so, the necessary
|
||||
/// replacements are also performed.
|
||||
bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
|
||||
ArrayRef<ShuffleVectorInst *> Shuffles);
|
||||
};
|
||||
} // end anonymous namespace.
|
||||
|
||||
char InterleavedAccess::ID = 0;
|
||||
INITIALIZE_TM_PASS(InterleavedAccess, "interleaved-access",
|
||||
"Lower interleaved memory accesses to target specific intrinsics",
|
||||
false, false)
|
||||
INITIALIZE_TM_PASS_BEGIN(
|
||||
InterleavedAccess, "interleaved-access",
|
||||
"Lower interleaved memory accesses to target specific intrinsics", false,
|
||||
false)
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||
INITIALIZE_TM_PASS_END(
|
||||
InterleavedAccess, "interleaved-access",
|
||||
"Lower interleaved memory accesses to target specific intrinsics", false,
|
||||
false)
|
||||
|
||||
FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
|
||||
return new InterleavedAccess(TM);
|
||||
@ -179,9 +199,18 @@ bool InterleavedAccess::lowerInterleavedLoad(
|
||||
return false;
|
||||
|
||||
SmallVector<ShuffleVectorInst *, 4> Shuffles;
|
||||
SmallVector<ExtractElementInst *, 4> Extracts;
|
||||
|
||||
// Check if all users of this load are shufflevectors.
|
||||
// Check if all users of this load are shufflevectors. If we encounter any
|
||||
// users that are extractelement instructions, we save them to later check if
|
||||
// they can be modified to extract from one of the shufflevectors instead of
|
||||
// the load.
|
||||
for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
|
||||
auto *Extract = dyn_cast<ExtractElementInst>(*UI);
|
||||
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
|
||||
Extracts.push_back(Extract);
|
||||
continue;
|
||||
}
|
||||
ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
|
||||
if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
|
||||
return false;
|
||||
@ -217,6 +246,11 @@ bool InterleavedAccess::lowerInterleavedLoad(
|
||||
Indices.push_back(Index);
|
||||
}
|
||||
|
||||
// Try and modify users of the load that are extractelement instructions to
|
||||
// use the shufflevector instructions instead of the load.
|
||||
if (!tryReplaceExtracts(Extracts, Shuffles))
|
||||
return false;
|
||||
|
||||
DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
|
||||
|
||||
// Try to create target specific intrinsics to replace the load and shuffles.
|
||||
@ -230,6 +264,73 @@ bool InterleavedAccess::lowerInterleavedLoad(
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Redirect extractelement users of an interleaved load to shuffles.
///
/// Every instruction in \p Extracts is expected to be a constant-index
/// extractelement (the index operand is cast to ConstantInt below). For each
/// one, look for a shufflevector in \p Shuffles that dominates the extract and
/// whose mask selects the lane being extracted. The transformation is
/// all-or-nothing: the IR is only modified once a replacement has been found
/// for every extract.
///
/// \returns true if \p Extracts is empty or if every extract was rewritten to
/// read from a shuffle; false, with the IR left untouched, if at least one
/// extract has no suitable shuffle.
bool InterleavedAccess::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {

  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Vector-index pairs, one per extractelement instruction and stored in the
  // same order as Extracts. The extractelement instructions will be replaced
  // by extracts that use these vector and index operands. A sequence is used
  // rather than a pointer-keyed map so that the order in which replacement
  // instructions are created does not depend on pointer values.
  // NOTE(review): this relies on each extract appearing only once in
  // Extracts - confirm against the caller that collects them.
  SmallVector<std::pair<Value *, int>, 4> Replacements;

  for (auto *Extract : Extracts) {

    // The vector index that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    Value *Vector = nullptr;
    int Lane = 0;
    for (auto *Shuffle : Shuffles) {

      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          Vector = Shuffle;
          Lane = static_cast<int>(I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (Vector)
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    // Nothing has been changed in the IR at this point.
    if (!Vector)
      return false;

    Replacements.push_back(std::make_pair(Vector, Lane));
  }

  // Finally, perform the replacements, visiting the extracts in the order in
  // which they were handed to us. Each new extract is inserted right before
  // the instruction it replaces.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (unsigned I = 0, E = Extracts.size(); I != E; ++I) {
    auto *Extract = Extracts[I];
    auto *Vector = Replacements[I].first;
    auto Index = Replacements[I].second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}
|
||||
|
||||
bool InterleavedAccess::lowerInterleavedStore(
|
||||
StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
|
||||
if (!SI->isSimple())
|
||||
@ -262,6 +363,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
|
||||
|
||||
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
|
||||
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
TLI = TM->getSubtargetImpl(F)->getTargetLowering();
|
||||
MaxFactor = TLI->getMaxSupportedInterleaveFactor();
|
||||
|
||||
|
@ -0,0 +1,86 @@
|
||||
; RUN: opt < %s -mtriple=aarch64 -interleaved-access -S | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: @extract_user_basic(
|
||||
; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 1
|
||||
define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%E = extractelement <8 x i32> %L, i32 2
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_multi(
|
||||
; CHECK: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %ldN, 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 1
|
||||
define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%E1 = extractelement <8 x i32> %L, i32 0
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
%E2 = extractelement <8 x i32> %L, i32 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_multi_no_dom(
|
||||
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%E1 = extractelement <8 x i32> %L, i32 0
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E2 = extractelement <8 x i32> %L, i32 2
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_wrong_const_index(
|
||||
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
define void @extract_user_wrong_const_index(<8 x i32>* %A) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 1
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_undef_index(
|
||||
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
define void @extract_user_undef_index(<8 x i32>* %A) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_var_index(
|
||||
; CHECK-NOT: %ldN = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32
|
||||
define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 %I
|
||||
ret void
|
||||
}
|
@ -268,3 +268,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
|
||||
store <3 x float> %tmp1, <3 x float>* %p, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; NEON-LABEL: load_factor2_with_extract_user:
|
||||
; NEON: ld2 { v0.4s, v1.4s }, [x0]
|
||||
; NEON: mov w0, v0.s[1]
|
||||
; NONEON-LABEL: load_factor2_with_extract_user:
|
||||
; NONEON-NOT: ld2
|
||||
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
|
||||
%1 = load <8 x i32>, <8 x i32>* %a, align 8
|
||||
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%3 = extractelement <8 x i32> %1, i32 2
|
||||
ret i32 %3
|
||||
}
|
||||
|
86
test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
Normal file
86
test/CodeGen/ARM/arm-interleaved-accesses-extract-user.ll
Normal file
@ -0,0 +1,86 @@
|
||||
; RUN: opt < %s -mtriple=arm-eabi -mattr=+neon -interleaved-access -S | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: @extract_user_basic(
|
||||
; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 1
|
||||
define void @extract_user_basic(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%E = extractelement <8 x i32> %L, i32 2
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_multi(
|
||||
; CHECK: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
; CHECK: %[[R:.+]] = extractvalue { <4 x i32>, <4 x i32> } %vldN, 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 0
|
||||
; CHECK: extractelement <4 x i32> %[[R]], i64 1
|
||||
define void @extract_user_multi(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%E1 = extractelement <8 x i32> %L, i32 0
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
%E2 = extractelement <8 x i32> %L, i32 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_multi_no_dom(
|
||||
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
define void @extract_user_multi_no_dom(<8 x i32>* %A, i1 %C) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%E1 = extractelement <8 x i32> %L, i32 0
|
||||
br i1 %C, label %if.then, label %if.merge
|
||||
|
||||
if.then:
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E2 = extractelement <8 x i32> %L, i32 2
|
||||
br label %if.merge
|
||||
|
||||
if.merge:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_wrong_const_index(
|
||||
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
define void @extract_user_wrong_const_index(<8 x i32>* %A) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 1
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_undef_index(
|
||||
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
define void @extract_user_undef_index(<8 x i32>* %A) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_user_var_index(
|
||||
; CHECK-NOT: %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8
|
||||
define void @extract_user_var_index(<8 x i32>* %A, i32 %I) {
|
||||
entry:
|
||||
%L = load <8 x i32>, <8 x i32>* %A, align 8
|
||||
%S = shufflevector <8 x i32> %L, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%E = extractelement <8 x i32> %L, i32 %I
|
||||
ret void
|
||||
}
|
@ -304,3 +304,15 @@ define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
|
||||
store <3 x float> %tmp1, <3 x float>* %p, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; NEON-LABEL: load_factor2_with_extract_user:
|
||||
; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
|
||||
; NEON: vmov.32 r0, d16[1]
|
||||
; NONEON-LABEL: load_factor2_with_extract_user:
|
||||
; NONEON-NOT: vld2
|
||||
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
|
||||
%1 = load <8 x i32>, <8 x i32>* %a, align 8
|
||||
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
%3 = extractelement <8 x i32> %1, i32 2
|
||||
ret i32 %3
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user