1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00
llvm-mirror/utils/PerfectShuffle/PerfectShuffle.cpp
Chandler Carruth ae65e281f3 Update the file headers across all of the LLVM projects in the monorepo
to reflect the new license.

We understand that people may be surprised that we're moving the header
entirely to discuss the new license. We checked this carefully with the
Foundation's lawyer and we believe this is the correct approach.

Essentially, all code in the project is now made available by the LLVM
project under our new license, so you will see that the license headers
include that license only. Some of our contributors have contributed
code under our old license, and accordingly, we have retained a copy of
our old license notice in the top-level files in each project and
repository.

llvm-svn: 351636
2019-01-19 08:50:56 +00:00

572 lines
18 KiB
C++

//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file computes an optimal sequence of instructions for doing all shuffles
// of two 4-element vectors. With a release build and when configured to emit
// an altivec instruction table, this takes about 30s to run on a 2.7Ghz
// PowerPC G5.
//
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>
struct Operator;
// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes
// an element from that value of the input vectors. A value of 8 means the
// entry is undefined.
// Mask manipulation functions.
static inline unsigned short MakeMask(unsigned V0, unsigned V1,
unsigned V2, unsigned V3) {
return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
}
/// getMaskElt - Return element N of the specified mask.
static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
return (Mask >> ((3-Elt)*4)) & 0xF;
}
static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
unsigned FieldShift = ((3-Elt)*4);
return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
}
// Reject elements where the values are 9-15.
static bool isValidMask(unsigned short Mask) {
unsigned short UndefBits = Mask & 0x8888;
return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
}
/// hasUndefElements - Return true if any of the elements in the mask are undefs
///
static bool hasUndefElements(unsigned short Mask) {
return (Mask & 0x8888) != 0;
}
/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
/// including undef values..
static bool isOnlyLHSMask(unsigned short Mask) {
return (Mask & 0x4444) == 0;
}
/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
/// refer to the LHS only (for when one argument value is passed into the same
/// function twice).
#if 0
static unsigned short getLHSOnlyMask(unsigned short Mask) {
return Mask & 0xBBBB; // Keep only LHS and Undefs.
}
#endif
/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
static unsigned getCompressedMask(unsigned short Mask) {
return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3);
}
static void PrintMask(unsigned i, std::ostream &OS) {
OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
<< "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
<< "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
<< "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
<< ">";
}
/// ShuffleVal - This represents a shufflevector operation.
struct ShuffleVal {
Operator *Op; // The Operation used to generate this value.
unsigned Cost; // Number of instrs used to generate this value.
unsigned short Arg0, Arg1; // Input operands for this value.
ShuffleVal() : Cost(1000000) {}
};
/// ShufTab - This is the actual shuffle table that we are trying to generate.
///
static ShuffleVal ShufTab[65536];
/// TheOperators - All of the operators that this target supports.
static std::vector<Operator*> TheOperators;
/// Operator - This is a vector operation that is available for use.
struct Operator {
const char *Name;
unsigned short ShuffleMask;
unsigned short OpNum;
unsigned Cost;
Operator(unsigned short shufflemask, const char *name, unsigned opnum,
unsigned cost = 1)
: Name(name), ShuffleMask(shufflemask), OpNum(opnum),Cost(cost) {
TheOperators.push_back(this);
}
~Operator() {
assert(TheOperators.back() == this);
TheOperators.pop_back();
}
bool isOnlyLHSOperator() const {
return isOnlyLHSMask(ShuffleMask);
}
const char *getName() const { return Name; }
unsigned getCost() const { return Cost; }
unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
// Extract the elements from LHSMask and RHSMask, as appropriate.
unsigned Result = 0;
for (unsigned i = 0; i != 4; ++i) {
unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
unsigned ResElt;
if (SrcElt < 4)
ResElt = getMaskElt(LHSMask, SrcElt);
else if (SrcElt < 8)
ResElt = getMaskElt(RHSMask, SrcElt-4);
else {
assert(SrcElt == 8 && "Bad src elt!");
ResElt = 8;
}
Result |= ResElt << (4*i);
}
return Result;
}
};
static const char *getZeroCostOpName(unsigned short Op) {
if (ShufTab[Op].Arg0 == 0x0123)
return "LHS";
else if (ShufTab[Op].Arg0 == 0x4567)
return "RHS";
else {
assert(0 && "bad zero cost operation");
abort();
}
}
static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
unsigned short ThisOp = Vals[ValNo];
std::cerr << "t" << ValNo;
PrintMask(ThisOp, std::cerr);
std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
} else {
// Figure out what tmp # it is.
for (unsigned i = 0; ; ++i)
if (Vals[i] == ShufTab[ThisOp].Arg0) {
std::cerr << "t" << i;
break;
}
}
if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
std::cerr << ", ";
if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
} else {
// Figure out what tmp # it is.
for (unsigned i = 0; ; ++i)
if (Vals[i] == ShufTab[ThisOp].Arg1) {
std::cerr << "t" << i;
break;
}
}
}
std::cerr << ") ";
}
static unsigned getNumEntered() {
unsigned Count = 0;
for (unsigned i = 0; i != 65536; ++i)
Count += ShufTab[i].Cost < 100;
return Count;
}
static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
unsigned &NumVals) {
if (ShufTab[Elt].Cost == 0) return;
// If this value has already been evaluated, it is free. FIXME: match undefs.
for (unsigned i = 0, e = NumVals; i != e; ++i)
if (Vals[i] == Elt) return;
// Otherwise, get the operands of the value, then add it.
unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
if (ShufTab[Arg0].Cost)
EvaluateOps(Arg0, Vals, NumVals);
if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
EvaluateOps(Arg1, Vals, NumVals);
Vals[NumVals++] = Elt;
}
int main() {
// Seed the table with accesses to the LHS and RHS.
ShufTab[0x0123].Cost = 0;
ShufTab[0x0123].Op = nullptr;
ShufTab[0x0123].Arg0 = 0x0123;
ShufTab[0x4567].Cost = 0;
ShufTab[0x4567].Op = nullptr;
ShufTab[0x4567].Arg0 = 0x4567;
// Seed the first-level of shuffles, shuffles whose inputs are the input to
// the vectorshuffle operation.
bool MadeChange = true;
unsigned OpCount = 0;
while (MadeChange) {
MadeChange = false;
++OpCount;
std::cerr << "Starting iteration #" << OpCount << " with "
<< getNumEntered() << " entries established.\n";
// Scan the table for two reasons: First, compute the maximum cost of any
// operation left in the table. Second, make sure that values with undefs
// have the cheapest alternative that they match.
unsigned MaxCost = ShufTab[0].Cost;
for (unsigned i = 1; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost > MaxCost)
MaxCost = ShufTab[i].Cost;
// If this value has an undef, make it be computed the cheapest possible
// way of any of the things that it matches.
if (hasUndefElements(i)) {
// This code is a little bit tricky, so here's the idea: consider some
// permutation, like 7u4u. To compute the lowest cost for 7u4u, we
// need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If
// there are 3 undefs, the number rises to 729 entries we have to scan,
// and for the 4 undef case, we have to scan the whole table.
//
// Instead of doing this huge amount of scanning, we process the table
// entries *in order*, and use the fact that 'u' is 8, larger than any
// valid index. Given an entry like 7u4u then, we only need to scan
// 7[0-7]4u - 8 entries. We can get away with this, because we already
// know that each of 704u, 714u, 724u, etc contain the minimum value of
// all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
unsigned UndefIdx;
if (i & 0x8000)
UndefIdx = 0;
else if (i & 0x0800)
UndefIdx = 1;
else if (i & 0x0080)
UndefIdx = 2;
else if (i & 0x0008)
UndefIdx = 3;
else
abort();
unsigned MinVal = i;
unsigned MinCost = ShufTab[i].Cost;
// Scan the 8 entries.
for (unsigned j = 0; j != 8; ++j) {
unsigned NewElt = setMaskElt(i, UndefIdx, j);
if (ShufTab[NewElt].Cost < MinCost) {
MinCost = ShufTab[NewElt].Cost;
MinVal = NewElt;
}
}
// If we found something cheaper than what was here before, use it.
if (i != MinVal) {
MadeChange = true;
ShufTab[i] = ShufTab[MinVal];
}
}
}
for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
if (!isValidMask(LHS)) continue;
if (ShufTab[LHS].Cost > 1000) continue;
// If nothing involving this operand could possibly be cheaper than what
// we already have, don't consider it.
if (ShufTab[LHS].Cost + 1 >= MaxCost)
continue;
for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
Operator *Op = TheOperators[opnum];
// Evaluate op(LHS,LHS)
unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
unsigned Cost = ShufTab[LHS].Cost + Op->getCost();
if (Cost < ShufTab[ResultMask].Cost) {
ShufTab[ResultMask].Cost = Cost;
ShufTab[ResultMask].Op = Op;
ShufTab[ResultMask].Arg0 = LHS;
ShufTab[ResultMask].Arg1 = LHS;
MadeChange = true;
}
// If this is a two input instruction, include the op(x,y) cases. If
// this is a one input instruction, skip this.
if (Op->isOnlyLHSOperator()) continue;
for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
if (!isValidMask(RHS)) continue;
if (ShufTab[RHS].Cost > 1000) continue;
// If nothing involving this operand could possibly be cheaper than
// what we already have, don't consider it.
if (ShufTab[RHS].Cost + 1 >= MaxCost)
continue;
// Evaluate op(LHS,RHS)
unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
if (ShufTab[ResultMask].Cost <= OpCount ||
ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
continue;
// Figure out the cost to evaluate this, knowing that CSE's only need
// to be evaluated once.
unsigned short Vals[30];
unsigned NumVals = 0;
EvaluateOps(LHS, Vals, NumVals);
EvaluateOps(RHS, Vals, NumVals);
unsigned Cost = NumVals + Op->getCost();
if (Cost < ShufTab[ResultMask].Cost) {
ShufTab[ResultMask].Cost = Cost;
ShufTab[ResultMask].Op = Op;
ShufTab[ResultMask].Arg0 = LHS;
ShufTab[ResultMask].Arg1 = RHS;
MadeChange = true;
}
}
}
}
}
std::cerr << "Finished Table has " << getNumEntered()
<< " entries established.\n";
unsigned CostArray[10] = { 0 };
// Compute a cost histogram.
for (unsigned i = 0; i != 65536; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost > 9)
++CostArray[9];
else
++CostArray[ShufTab[i].Cost];
}
for (unsigned i = 0; i != 9; ++i)
if (CostArray[i])
std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
if (CostArray[9])
std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
// Build up the table to emit.
std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
for (unsigned i = 0; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
// CostSat - The cost of this operation saturated to two bits.
unsigned CostSat = ShufTab[i].Cost;
if (CostSat > 4) CostSat = 4;
if (CostSat == 0) CostSat = 1;
--CostSat; // Cost is now between 0-3.
unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
assert(OpNum < 16 && "Too few bits to encode operation!");
unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
// Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
// LHS, and 13 bits of RHS = 32 bits.
unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
std::cout << " " << std::setw(10) << Val << "U, // ";
PrintMask(i, std::cout);
std::cout << ": Cost " << ShufTab[i].Cost;
std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
std::cout << " ";
if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
std::cout << getZeroCostOpName(ShufTab[i].Arg0);
} else {
PrintMask(ShufTab[i].Arg0, std::cout);
}
if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
std::cout << ", ";
if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
std::cout << getZeroCostOpName(ShufTab[i].Arg1);
} else {
PrintMask(ShufTab[i].Arg1, std::cout);
}
}
std::cout << "\n";
}
std::cout << " 0\n};\n";
if (0) {
// Print out the table.
for (unsigned i = 0; i != 0x8889; ++i) {
if (!isValidMask(i)) continue;
if (ShufTab[i].Cost < 1000) {
PrintMask(i, std::cerr);
std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
unsigned short Vals[30];
unsigned NumVals = 0;
EvaluateOps(i, Vals, NumVals);
for (unsigned j = 0, e = NumVals; j != e; ++j)
PrintOperation(j, Vals);
std::cerr << "\n";
}
}
}
}
#ifdef GENERATE_ALTIVEC
///===---------------------------------------------------------------------===//
/// The altivec instruction definitions. This is the altivec-specific part of
/// this file.
///===---------------------------------------------------------------------===//
// Note that the opcode numbers here must match those in the PPC backend.
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VMRGHW,
OP_VMRGLW,
OP_VSPLTISW0,
OP_VSPLTISW1,
OP_VSPLTISW2,
OP_VSPLTISW3,
OP_VSLDOI4,
OP_VSLDOI8,
OP_VSLDOI12
};
struct vmrghw : public Operator {
vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
} the_vmrghw;
struct vmrglw : public Operator {
vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
} the_vmrglw;
template<unsigned Elt>
struct vspltisw : public Operator {
vspltisw(const char *N, unsigned Opc)
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
};
vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
template<unsigned N>
struct vsldoi : public Operator {
vsldoi(const char *Name, unsigned Opc)
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
}
};
vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
#endif
#define GENERATE_NEON
#ifdef GENERATE_NEON
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VREV,
OP_VDUP0,
OP_VDUP1,
OP_VDUP2,
OP_VDUP3,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
};
struct vrev : public Operator {
vrev() : Operator(0x1032, "vrev", OP_VREV) {}
} the_vrev;
template<unsigned Elt>
struct vdup : public Operator {
vdup(const char *N, unsigned Opc)
: Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
};
vdup<0> the_vdup0("vdup0", OP_VDUP0);
vdup<1> the_vdup1("vdup1", OP_VDUP1);
vdup<2> the_vdup2("vdup2", OP_VDUP2);
vdup<3> the_vdup3("vdup3", OP_VDUP3);
template<unsigned N>
struct vext : public Operator {
vext(const char *Name, unsigned Opc)
: Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
}
};
vext<1> the_vext1("vext1", OP_VEXT1);
vext<2> the_vext2("vext2", OP_VEXT2);
vext<3> the_vext3("vext3", OP_VEXT3);
struct vuzpl : public Operator {
vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {}
} the_vuzpl;
struct vuzpr : public Operator {
vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {}
} the_vuzpr;
struct vzipl : public Operator {
vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {}
} the_vzipl;
struct vzipr : public Operator {
vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {}
} the_vzipr;
struct vtrnl : public Operator {
vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {}
} the_vtrnl;
struct vtrnr : public Operator {
vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {}
} the_vtrnr;
#endif