1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-18 18:42:46 +02:00

[Matrix] Add optimization remarks for matrix expression.

Generate remarks for matrix operations in a function. To generate remarks
for matrix expressions, the following approach is used:
1. Collect leaves of matrix expressions (done in
   RemarkGenerator::getExpressionLeaves).  Leaves are lowered matrix
   instructions without other matrix users (like stores).

2. For each leaf, create a remark containing a linearized version of the
   matrix expression.

The following improvements will be submitted as follow-ups:
* Summarize number of vector instructions generated for each expression.
* Account for shared sub-expressions.
* Propagate matrix remarks up the inlining chain.

The information provided by the matrix remarks helps users to spot cases
where matrix expressions got split up, e.g. due to inlining not
happening. The remarks allow users to address those issues, ensuring
best performance.

Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D72453
This commit is contained in:
Florian Hahn 2020-01-27 16:22:04 -08:00
parent d65797733b
commit 47a33e71d6
2 changed files with 502 additions and 7 deletions

View File

@ -10,7 +10,8 @@
// //
// TODO: // TODO:
// * Implement multiply & add fusion // * Implement multiply & add fusion
// * Add remark, summarizing the available matrix optimization opportunities. // * Add remark, summarizing the available matrix optimization opportunities
// (WIP).
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
@ -18,7 +19,9 @@
#include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h" #include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CFG.h" #include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h" #include "llvm/IR/DataLayout.h"
@ -136,6 +139,7 @@ class LowerMatrixIntrinsics {
Function &Func; Function &Func;
const DataLayout &DL; const DataLayout &DL;
const TargetTransformInfo &TTI; const TargetTransformInfo &TTI;
OptimizationRemarkEmitter &ORE;
/// Wrapper class representing a matrix as a set of column vectors. /// Wrapper class representing a matrix as a set of column vectors.
/// All column vectors must have the same vector type. /// All column vectors must have the same vector type.
@ -213,11 +217,12 @@ class LowerMatrixIntrinsics {
SmallVector<Instruction *, 16> ToRemove; SmallVector<Instruction *, 16> ToRemove;
/// Map from instructions to their produced column matrix. /// Map from instructions to their produced column matrix.
DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix; MapVector<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
public: public:
LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI) LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {} OptimizationRemarkEmitter &ORE)
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
/// Return the set of column vectors that a matrix value is lowered to. /// Return the set of column vectors that a matrix value is lowered to.
/// ///
@ -509,6 +514,9 @@ public:
} }
} }
RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
RemarkGen.emitRemarks();
for (Instruction *Inst : reverse(ToRemove)) for (Instruction *Inst : reverse(ToRemove))
Inst->eraseFromParent(); Inst->eraseFromParent();
@ -599,6 +607,7 @@ public:
Shape.NumRows, VType->getElementType(), Builder); Shape.NumRows, VType->getElementType(), Builder);
createColumnStore(C.value(), GEP, VType->getElementType(), Builder); createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
} }
Inst2ColumnMatrix[Inst] = ColumnMatrixTy();
ToRemove.push_back(Inst); ToRemove.push_back(Inst);
} }
@ -844,13 +853,301 @@ public:
finalizeLowering(Inst, Result, Builder); finalizeLowering(Inst, Result, Builder);
return true; return true;
} }
/// Helper to linearize a matrix expression tree into a string. Currently
/// matrix expressions are linarized by starting at an expression leaf and
/// linearizing bottom up.
struct ExprLinearizer {
unsigned LengthToBreak = 100;
std::string Str;
raw_string_ostream Stream;
unsigned LineLength = 0;
const DataLayout &DL;
/// Mapping from instructions to column matrixes. It is used to identify
/// matrix instructions.
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
/// Used to keep track of sub-expressions that get reused while linearizing
/// the expression. Re-used sub-expressions are marked as (reused).
SmallPtrSet<Value *, 8> ReusedExprs;
ExprLinearizer(const DataLayout &DL,
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix)
: Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix) {}
void indent(unsigned N) {
LineLength += N;
for (unsigned i = 0; i < N; i++)
Stream << " ";
}
void lineBreak() {
Stream << "\n";
LineLength = 0;
}
void maybeIndent(unsigned Indent) {
if (LineLength >= LengthToBreak)
lineBreak();
if (LineLength == 0)
indent(Indent);
}
void write(const std::string &S) {
LineLength += S.size();
Stream << S;
}
Value *getUnderlyingObjectThroughLoads(Value *V) {
if (Value *Ptr = getPointerOperand(V))
return getUnderlyingObjectThroughLoads(Ptr);
else if (V->getType()->isPointerTy())
return GetUnderlyingObject(V, DL);
return V;
}
/// Returns true if \p V is a matrix value.
bool isMatrix(Value *V) const {
return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
}
/// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
/// \p SS.
void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
auto M = Inst2ColumnMatrix.find(V);
if (M == Inst2ColumnMatrix.end())
SS << "unknown";
else {
SS << M->second.getNumRows();
SS << "x";
SS << M->second.getNumColumns();
}
}
/// Write the called function name. Handles calls to llvm.matrix.*
/// specially: we write the name, followed by the dimensions of the input
/// matrixes, followed by the scalar type name.
void writeFnName(CallInst *CI) {
if (!CI->getCalledFunction())
write("<no called fn>");
else {
StringRef Name = CI->getCalledFunction()->getName();
if (!Name.startswith("llvm.matrix")) {
write(Name);
return;
}
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
.drop_front(StringRef("llvm.matrix.").size()));
write(".");
std::string Tmp = "";
raw_string_ostream SS(Tmp);
switch (II->getIntrinsicID()) {
case Intrinsic::matrix_multiply:
prettyPrintMatrixType(II->getOperand(0), SS);
SS << ".";
prettyPrintMatrixType(II->getOperand(1), SS);
SS << "." << *II->getType()->getScalarType();
break;
case Intrinsic::matrix_transpose:
prettyPrintMatrixType(II->getOperand(0), SS);
SS << "." << *II->getType()->getScalarType();
break;
case Intrinsic::matrix_columnwise_load:
prettyPrintMatrixType(II, SS);
SS << "." << *II->getType()->getScalarType();
break;
case Intrinsic::matrix_columnwise_store:
prettyPrintMatrixType(II->getOperand(0), SS);
SS << "." << *II->getOperand(0)->getType()->getScalarType();
break;
default:
llvm_unreachable("Unhandled case");
}
SS.flush();
write(Tmp);
}
}
unsigned getNumShapeArgs(CallInst *CI) const {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
switch (II->getIntrinsicID()) {
case Intrinsic::matrix_multiply:
return 3;
case Intrinsic::matrix_transpose:
case Intrinsic::matrix_columnwise_load:
case Intrinsic::matrix_columnwise_store:
return 2;
default:
return 0;
}
}
return 0;
}
/// Special printing for values: for pointers, we print if they refer to an
/// (function) external address or a stack address, for other values we
/// either print the constant or "scalar"/"matrix" for other values.
void write(Value *V) {
V = getUnderlyingObjectThroughLoads(V);
if (V->getType()->isPointerTy()) {
if (isa<AllocaInst>(V)) {
Stream << "stack addr";
LineLength += StringRef("stack addr").size();
} else {
Stream << "addr";
LineLength += StringRef("addr").size();
}
if (!V->getName().empty()) {
Stream << " %" << V->getName() << "";
LineLength += V->getName().size() + 2;
}
return;
}
std::string Tmp;
raw_string_ostream TmpStream(Tmp);
if (auto *CI = dyn_cast<ConstantInt>(V))
TmpStream << CI->getValue();
else if (isa<Constant>(V))
TmpStream << "constant";
else {
if (isMatrix(V))
TmpStream << "matrix";
else
TmpStream << "scalar";
}
TmpStream.flush();
Tmp = StringRef(Tmp).trim();
LineLength += Tmp.size();
Stream << Tmp;
}
/// Linearize expression \p Expr starting at an indentation of \p Indent.
/// Expressions that are re-used multiple times are prefixed with (reused)
/// at the re-used root instruction.
void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused) {
auto *I = cast<Instruction>(Expr);
maybeIndent(Indent);
SmallVector<Value *, 8> Ops;
bool Reused = !ReusedExprs.insert(Expr).second;
if (Reused && !ParentReused)
write("(reused) ");
if (auto *CI = dyn_cast<CallInst>(I)) {
writeFnName(CI);
Ops.append(CallSite(CI).arg_begin(),
CallSite(CI).arg_end() - getNumShapeArgs(CI));
} else if (isa<BitCastInst>(Expr)) {
// Special case bitcasts, which are used to materialize matrixes from
// non-matrix ops.
write("matrix");
return;
} else {
Ops.append(I->value_op_begin(), I->value_op_end());
write(std::string(I->getOpcodeName()));
}
write(std::string("("));
unsigned NumOpsToBreak = 1;
if (match(Expr, m_Intrinsic<Intrinsic::matrix_columnwise_load>()))
NumOpsToBreak = 2;
for (Value *Op : Ops) {
if (Ops.size() > NumOpsToBreak)
lineBreak();
maybeIndent(Indent + 1);
if (isMatrix(Op))
linearizeExpr(Op, Indent + 1, Reused);
else
write(Op);
if (Op != Ops.back())
write(", ");
}
write(")");
}
const std::string &getResult() {
Stream.flush();
return Str;
}
};
/// Generate remarks for matrix operations in a function. To generate remarks
/// for matrix expressions, the following approach is used:
/// 1. Collect leafs of matrix expressions (done in
/// RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
/// instructions without other matrix users (like stores).
///
/// 2. For each leaf, create a remark containing a linearizied version of the
/// matrix expression.
///
/// TODO:
/// * Summarize number of vector instructions generated for each expression.
/// * Account for shared sub-expressions.
/// * Propagate matrix remarks up the inlining chain.
struct RemarkGenerator {
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
OptimizationRemarkEmitter &ORE;
const DataLayout &DL;
RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
OptimizationRemarkEmitter &ORE, const DataLayout &DL)
: Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
/// Return all leafs of matrix expressions. Those are instructions in
/// Inst2ColumnMatrix returing void. Currently that should only include
/// stores.
SmallVector<Value *, 4> getExpressionLeaves() {
SmallVector<Value *, 4> Leaves;
for (auto &KV : Inst2ColumnMatrix)
if (KV.first->getType()->isVoidTy())
Leaves.push_back(KV.first);
return Leaves;
}
void emitRemarks() {
if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
return;
// Find leafs of matrix expressions.
auto Leaves = getExpressionLeaves();
// Generate remarks for each leaf.
for (auto *L : Leaves) {
OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
cast<Instruction>(L)->getDebugLoc(),
cast<Instruction>(L)->getParent());
Rem << "Lowered matrix expression ";
Rem << ("\n" + linearize(L, DL));
ORE.emit(Rem);
}
}
std::string linearize(Value *L, const DataLayout &DL) {
ExprLinearizer Lin(DL, Inst2ColumnMatrix);
Lin.linearizeExpr(L, 0, false);
return Lin.getResult();
}
};
}; };
} // namespace } // namespace
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F, PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
FunctionAnalysisManager &AM) { FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F);
LowerMatrixIntrinsics LMT(F, TTI); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
LowerMatrixIntrinsics LMT(F, TTI, ORE);
if (LMT.Visit()) { if (LMT.Visit()) {
PreservedAnalyses PA; PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>(); PA.preserveSet<CFGAnalyses>();
@ -871,14 +1168,16 @@ public:
} }
bool runOnFunction(Function &F) override { bool runOnFunction(Function &F) override {
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LowerMatrixIntrinsics LMT(F, *TTI); auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
LowerMatrixIntrinsics LMT(F, TTI, ORE);
bool C = LMT.Visit(); bool C = LMT.Visit();
return C; return C;
} }
void getAnalysisUsage(AnalysisUsage &AU) const override { void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.setPreservesCFG(); AU.setPreservesCFG();
} }
}; };
@ -888,6 +1187,7 @@ static const char pass_name[] = "Lower the matrix intrinsics";
char LowerMatrixIntrinsicsLegacyPass::ID = 0; char LowerMatrixIntrinsicsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
false, false) false, false)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
false, false) false, false)

View File

@ -0,0 +1,195 @@
; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics < %s 2>&1 | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "aarch64-apple-ios"
; CHECK-LABEL: remark: test.h:40:20: Lowered matrix expression
; CHECK-NEXT: store(
; CHECK-NEXT: transpose.2x6.double(load(addr %A)),
; CHECK-NEXT: addr %B)
; Test input: load a <12 x double> from %A, pass it to llvm.matrix.transpose
; with shape arguments 2 and 6, and store the result to %B with a plain store.
; All three instructions carry debug location !24.
define void @transpose(<12 x double>* %A, <12 x double>* %B) !dbg !23 {
%load = load <12 x double>, <12 x double>* %A, !dbg !24
%t = call <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double> %load, i32 2, i32 6), !dbg !24
store <12 x double> %t, <12 x double>* %B, !dbg !24
ret void
}
declare <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double>, i32, i32)
; CHECK-LABEL: remark: test.h:50:20: Lowered matrix expression
; CHECK-NEXT: store(
; CHECK-NEXT: multiply.2x6.6x2.double(
; CHECK-NEXT: load(addr %A),
; CHECK-NEXT: load(addr %B)),
; CHECK-NEXT: addr %C)
; Test input: load two <12 x double> values from %A and %B, multiply them with
; llvm.matrix.multiply (shape arguments 2, 6, 2) and store the <4 x double>
; result to %C with a plain store.
define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !dbg !25 {
%A.matrix = load <12 x double>, <12 x double>* %A, !dbg !26
%B.matrix = load <12 x double>, <12 x double>* %B, !dbg !26
%t = call <4 x double> @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
store <4 x double> %t, <4 x double>* %C, !dbg !26
ret void
}
declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
; CHECK-LABEL: remark: test.h:60:20: Lowered matrix expression
; CHECK-NEXT: store(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
; CHECK-NEXT: addr %B)
; Test input: read a matrix from %A with llvm.matrix.columnwise.load (shape
; arguments 3, 3; the preceding i32 5 is presumably the stride) and write it to
; %B with a plain store.
define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !28
store <9 x double> %A.matrix, <9 x double>* %B, !dbg !28
ret void
}
declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32)
; CHECK-LABEL: remark: test.h:70:20: Lowered matrix expression
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
; CHECK-NEXT: addr %B,
; CHECK-NEXT: 10)
; Test input: read a matrix from %A with llvm.matrix.columnwise.load and write
; it to %B with llvm.matrix.columnwise.store (i32 10 before the 3, 3 shape
; arguments), so the expression leaf is an intrinsic call rather than a store
; instruction.
define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !30
call void @llvm.matrix.columnwise.store(<9 x double> %A.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !30
ret void
}
declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32)
; CHECK-LABEL: remark: test.h:80:20: Lowered matrix expression
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: addr %B,
; CHECK-NEXT: 10)
; Test input: a single columnwise load feeds both operands of an fadd and one
; operand of the following fmul, so the same sub-expression is visited three
; times before the result is written with llvm.matrix.columnwise.store. Only
; the load and the store carry a debug location.
define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
ret void
}
; CHECK-LABEL: remark: test.h:90:20: Lowered matrix expression
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: addr %B,
; CHECK-NEXT: 10)
; CHECK-NEXT: remark: test.h:90:20: Lowered matrix expression
; CHECK-NEXT: store(
; CHECK-NEXT: multiply.2x6.6x2.double(
; CHECK-NEXT: load(addr %C),
; CHECK-NEXT: load(addr %D)),
; CHECK-NEXT: addr %E)
; Test input: two independent matrix expressions in one function. The first is
; the load/fadd/fmul chain written with llvm.matrix.columnwise.store to %B; the
; second multiplies the values loaded from %C and %D and stores to %E. Both
; leaves use debug location !34.
define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
%C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
%D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34
%Mult.matrix = call <4 x double> @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
store <4 x double> %Mult.matrix, <4 x double>* %E, !dbg !34
ret void
}
; CHECK-LABEL: remark: test.h:100:20: Lowered matrix expression
; CHECK-NEXT: columnwise.store.3x3.double(
; CHECK-NEXT: fmul(
; CHECK-NEXT: fadd(
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
; CHECK-NEXT: stack addr %B,
; CHECK-NEXT: 10)
; Test input: same load/fadd/fmul chain as above, but the destination %B of the
; columnwise store is an alloca in this function instead of an argument.
define void @stackaddresses(<9 x double>* %A) !dbg !35 {
%B = alloca <9 x double>
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
ret void
}
; CHECK-LABEL: remark: test.h:30:20: Lowered matrix expression
; CHECK-NEXT: store(
; CHECK-NEXT: transpose.5x3.double(load(addr %A)),
; CHECK-NEXT: stack addr %s1)
%S1 = type {<15 x double>*}
; Test input: the source matrix address is itself loaded from the first field
; of %A (GEP followed by a load), and the destination pointer %s3 reaches the
; alloca %s1 only through two bitcasts. The transpose call has no debug
; location; the final store uses !22.
define void @get_underlying_object(%S1* %A) !dbg !21 {
entry:
%s1 = alloca <15 x double>, !dbg !22
%a1 = getelementptr %S1, %S1* %A, i32 0, i32 0, !dbg !22
%a2 = load <15 x double>*, <15 x double>** %a1, !dbg !22
%av = load <15 x double>, <15 x double>* %a2, !dbg !22
%s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
%s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
%t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
store <15 x double> %t, <15 x double>* %s3, !dbg !22
ret void
}
declare <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double>, i32, i32)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test.h", directory: "/test")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8, !8, !11}
!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!12 = !{!13}
!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
!14 = !DILocation(line: 1, column: 27, scope: !5)
!5 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!19 = !DILocation(line: 10, column: 20, scope: !5)
!20 = !DILocation(line: 10, column: 10, scope: !5)
!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!22 = !DILocation(line: 30, column: 20, scope: !21)
!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!24 = !DILocation(line: 40, column: 20, scope: !23)
!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!26 = !DILocation(line: 50, column: 20, scope: !25)
!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!28 = !DILocation(line: 60, column: 20, scope: !27)
!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!30 = !DILocation(line: 70, column: 20, scope: !29)
!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!32 = !DILocation(line: 80, column: 20, scope: !31)
!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!34 = !DILocation(line: 90, column: 20, scope: !33)
!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
!36 = !DILocation(line: 100, column: 20, scope: !35)