//===-------------- BPFMIPeephole.cpp - MI Peephole Cleanups -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs peephole optimizations to clean up ugly code sequences at
// the MachineInstruction layer.
//
// Currently, there are two optimizations implemented:
//  - One pre-RA MachineSSA pass to eliminate type promotion sequences that
//    zero extend 32-bit subregisters to 64-bit registers, if the compiler
//    can prove the subregister is defined by 32-bit operations, in which
//    case the upper half of the underlying 64-bit register is zeroed
//    implicitly.
//
//  - One post-RA PreEmit pass to do final cleanup on some redundant
//    instructions generated due to bad RA on subregisters.
//===----------------------------------------------------------------------===//
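
// For illustration (a sketch, not tied to any particular input), the pre-RA
// pass rewrites a 32-to-64-bit zero extension emitted as
//   w1 = w0        // MOV_32_64
//   r1 <<= 32      // SLL_ri
//   r1 >>= 32      // SRL_ri
// into a single SUBREG_TO_REG pseudo; the register allocator can usually
// coalesce that away, so the shifts disappear from the final object code.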

#include "BPF.h"
#include "BPFInstrInfo.h"
#include "BPFTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include <set>

using namespace llvm;

#define DEBUG_TYPE "bpf-mi-zext-elim"

STATISTIC(ZExtElemNum, "Number of zero extension shifts eliminated");

namespace {

struct BPFMIPeephole : public MachineFunctionPass {

  static char ID;
  const BPFInstrInfo *TII;
  MachineFunction *MF;
  MachineRegisterInfo *MRI;

  BPFMIPeephole() : MachineFunctionPass(ID) {
    initializeBPFMIPeepholePass(*PassRegistry::getPassRegistry());
  }

private:
  // Initialize class variables.
  void initialize(MachineFunction &MFParm);

  bool isCopyFrom32Def(MachineInstr *CopyMI);
  bool isInsnFrom32Def(MachineInstr *DefInsn);
  bool isPhiFrom32Def(MachineInstr *PhiMI);
  bool isMovFrom32Def(MachineInstr *MovMI);
  bool eliminateZExtSeq(void);
  bool eliminateZExt(void);

  std::set<MachineInstr *> PhiInsns;

public:

  // Main entry point for this pass.
  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    initialize(MF);

    // First try to eliminate (zext, lshift, rshift) and then
    // try to eliminate zext.
    bool ZExtSeqExist, ZExtExist;
    ZExtSeqExist = eliminateZExtSeq();
    ZExtExist = eliminateZExt();
    return ZExtSeqExist || ZExtExist;
  }
};

// Initialize class variables.
void BPFMIPeephole::initialize(MachineFunction &MFParm) {
  MF = &MFParm;
  MRI = &MF->getRegInfo();
  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n");
}
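
// For illustration (a sketch drawn from the qualifying rules below): given
//   %0:gpr32 = COPY $w0
//   %4:gpr   = MOV_32_64 %0:gpr32
// the physical register $w0 may carry the return value of a previous function
// call, and its upper 32-bit value might contain non-zero bits, so a COPY
// that reads a physical register must not qualify the shift sequence for
// elimination.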

bool BPFMIPeephole::isCopyFrom32Def(MachineInstr *CopyMI)
{
  MachineOperand &opnd = CopyMI->getOperand(1);

  if (!opnd.isReg())
    return false;

  // Return false if the value comes from a 32-bit physical register.
  // Most likely, this physical register is aliased to a function call
  // return value or a current function parameter.
  Register Reg = opnd.getReg();
  if (!Register::isVirtualRegister(Reg))
    return false;

  if (MRI->getRegClass(Reg) == &BPF::GPRRegClass)
    return false;

  MachineInstr *DefInsn = MRI->getVRegDef(Reg);
  if (!isInsnFrom32Def(DefInsn))
    return false;

  return true;
}
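
// A PHI qualifies only if every incoming value traces back to a 32-bit
// definition, e.g. (sketch):
//   bb.3: %3:gpr32 = PHI %1:gpr32, %bb.1, %2:gpr32, %bb.2
// Each incoming def is visited recursively; already-visited PHIs are kept in
// PhiInsns so that a cycle created by a loop back-edge conservatively fails
// the check instead of recursing forever.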

bool BPFMIPeephole::isPhiFrom32Def(MachineInstr *PhiMI)
{
  for (unsigned i = 1, e = PhiMI->getNumOperands(); i < e; i += 2) {
    MachineOperand &opnd = PhiMI->getOperand(i);

    if (!opnd.isReg())
      return false;

    MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
    if (!PhiDef)
      return false;
    if (PhiDef->isPHI()) {
      if (PhiInsns.find(PhiDef) != PhiInsns.end())
        return false;
      PhiInsns.insert(PhiDef);
      if (!isPhiFrom32Def(PhiDef))
        return false;
    }
    if (PhiDef->getOpcode() == BPF::COPY && !isCopyFrom32Def(PhiDef))
      return false;
  }

  return true;
}
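
// Intermediate 32-bit-to-32-bit COPYs are followed along the def-use chain,
// e.g. (sketch):
//   %0:gpr32 = COPY $w0
//   %1:gpr32 = COPY %0:gpr32
//   %4:gpr   = MOV_32_64 %1:gpr32
// isInsnFrom32Def recursively visits COPY and PHI defs, so %1 is traced back
// to %0 and ultimately rejected because $w0 is a physical register.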

// The \p DefInsn instruction defines a virtual register.
bool BPFMIPeephole::isInsnFrom32Def(MachineInstr *DefInsn)
{
  if (!DefInsn)
    return false;

  if (DefInsn->isPHI()) {
    if (PhiInsns.find(DefInsn) != PhiInsns.end())
      return false;
    PhiInsns.insert(DefInsn);
    if (!isPhiFrom32Def(DefInsn))
      return false;
  } else if (DefInsn->getOpcode() == BPF::COPY) {
    if (!isCopyFrom32Def(DefInsn))
      return false;
  }

  return true;
}

bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
{
  MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());

  LLVM_DEBUG(dbgs() << "  Def of Mov Src:");
  LLVM_DEBUG(DefInsn->dump());

  PhiInsns.clear();
  if (!isInsnFrom32Def(DefInsn))
    return false;

  LLVM_DEBUG(dbgs() << "  One ZExt elim sequence identified.\n");

  return true;
}
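
// The rewrite performed by eliminateZExtSeq, sketched at the MIR level:
//   %4:gpr = MOV_32_64 %0:gpr32
//   %5:gpr = SLL_ri %4:gpr(tied-def 0), 32
//   %6:gpr = SRL_ri %5:gpr(tied-def 0), 32
// becomes
//   %6:gpr = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
// once %0 is proven to come from a 32-bit definition.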

bool BPFMIPeephole::eliminateZExtSeq(void) {
  MachineInstr* ToErase = nullptr;
  bool Eliminated = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      // If the previous instruction was marked for elimination, remove it now.
      if (ToErase) {
        ToErase->eraseFromParent();
        ToErase = nullptr;
      }

      // Eliminate the 32-bit to 64-bit zero extension sequence when possible.
      //
      //   MOV_32_64 rB, wA
      //   SLL_ri rB, rB, 32
      //   SRL_ri rB, rB, 32
      if (MI.getOpcode() == BPF::SRL_ri &&
          MI.getOperand(2).getImm() == 32) {
        Register DstReg = MI.getOperand(0).getReg();
        Register ShfReg = MI.getOperand(1).getReg();
        MachineInstr *SllMI = MRI->getVRegDef(ShfReg);

        LLVM_DEBUG(dbgs() << "Starting SRL found:");
        LLVM_DEBUG(MI.dump());

        if (!SllMI ||
            SllMI->isPHI() ||
            SllMI->getOpcode() != BPF::SLL_ri ||
            SllMI->getOperand(2).getImm() != 32)
          continue;

        LLVM_DEBUG(dbgs() << "  SLL found:");
        LLVM_DEBUG(SllMI->dump());

        MachineInstr *MovMI = MRI->getVRegDef(SllMI->getOperand(1).getReg());
        if (!MovMI ||
            MovMI->isPHI() ||
            MovMI->getOpcode() != BPF::MOV_32_64)
          continue;

        LLVM_DEBUG(dbgs() << "  Type cast Mov found:");
        LLVM_DEBUG(MovMI->dump());

        Register SubReg = MovMI->getOperand(1).getReg();
        if (!isMovFrom32Def(MovMI)) {
          LLVM_DEBUG(dbgs()
                     << "  One ZExt elim sequence failed qualifying elim.\n");
          continue;
        }

        BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), DstReg)
          .addImm(0).addReg(SubReg).addImm(BPF::sub_32);

        SllMI->eraseFromParent();
        MovMI->eraseFromParent();
        // MI is the right shift; we can't erase it in its own iteration.
        // Mark it in ToErase, and erase it in the next iteration.
        ToErase = &MI;
        ZExtElemNum++;
        Eliminated = true;
      }
    }
  }

  return Eliminated;
}
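
// A motivating case for eliminating a lone MOV_32_64 (sketched from a kernel
// selftest failure): the verifier populates __sk_buff->data, declared u32,
// with a 64-bit address, so in
//   w2 = *(u32 *)(r1 + 76)
//   r2 = w2        // MOV_32_64: would clear the top 32 bits
// the move must not survive; replacing it with SUBREG_TO_REG keeps the full
// 64-bit value and the verifier accepts the program.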

bool BPFMIPeephole::eliminateZExt(void) {
  MachineInstr* ToErase = nullptr;
  bool Eliminated = false;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      // If the previous instruction was marked for elimination, remove it now.
      if (ToErase) {
        ToErase->eraseFromParent();
        ToErase = nullptr;
      }

      if (MI.getOpcode() != BPF::MOV_32_64)
        continue;

      // Eliminate MOV_32_64 if possible.
      //   MOV_32_64 rA, wB
      //
      // If wB has been zero extended, replace it with a SUBREG_TO_REG.
      // This works around BPF programs where pkt->{data, data_end} is
      // declared as u32, but the verifier actually populates them as
      // 64-bit pointers. The MOV_32_64 would zero out the top 32 bits.
      LLVM_DEBUG(dbgs() << "Candidate MOV_32_64 instruction:");
      LLVM_DEBUG(MI.dump());

      if (!isMovFrom32Def(&MI))
        continue;

      LLVM_DEBUG(dbgs() << "Removing the MOV_32_64 instruction\n");

      Register dst = MI.getOperand(0).getReg();
      Register src = MI.getOperand(1).getReg();

      // Build a SUBREG_TO_REG instruction.
      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), dst)
        .addImm(0).addReg(src).addImm(BPF::sub_32);

      ToErase = &MI;
      Eliminated = true;
    }
  }

  return Eliminated;
}

} // end anonymous namespace

INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
                "BPF MachineSSA Peephole Optimization For ZEXT Elimination",
                false, false)

char BPFMIPeephole::ID = 0;
FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); }

STATISTIC(RedundantMovElemNum, "Number of redundant moves eliminated");

namespace {

struct BPFMIPreEmitPeephole : public MachineFunctionPass {

  static char ID;
  MachineFunction *MF;
  const TargetRegisterInfo *TRI;

  BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

private:
  // Initialize class variables.
  void initialize(MachineFunction &MFParm);

  bool eliminateRedundantMov(void);

public:

  // Main entry point for this pass.
  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;

    initialize(MF);

    return eliminateRedundantMov();
  }
};

// Initialize class variables.
void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
  MF = &MFParm;
  TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
  LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
}
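
// After register allocation, subregister assignments can collapse into
// identical 64-bit moves, e.g. (sketch)
//   r3 = r3        // MOV_rr with dst == src
// which carry no semantics and are deleted below. The 32-bit forms are kept
// because they zero the upper 32 bits (see the comment in the loop).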
|
|
|
|
|
|
|
|
bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
|
|
|
|
MachineInstr* ToErase = nullptr;
|
|
|
|
bool Eliminated = false;
|
|
|
|
|
|
|
|
for (MachineBasicBlock &MBB : *MF) {
|
|
|
|
for (MachineInstr &MI : MBB) {
|
|
|
|
// If the previous instruction was marked for elimination, remove it now.
|
|
|
|
if (ToErase) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " Redundant Mov Eliminated:");
|
|
|
|
LLVM_DEBUG(ToErase->dump());
|
2018-03-13 07:47:06 +01:00
|
|
|
ToErase->eraseFromParent();
|
|
|
|
ToErase = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Eliminate identical move:
|
|
|
|
//
|
|
|
|
// MOV rA, rA
|
|
|
|
//
|
[BPF] simplify zero extension with MOV_32_64
The current pattern matching for zext results in the following code snippet
being produced,
w1 = w0
r1 <<= 32
r1 >>= 32
Because BPF implementations require zero extension on 32bit loads this
both adds a few extra unneeded instructions but also makes it a bit
harder for the verifier to track the r1 register bounds. For example in
this verifier trace we see at the end of the snippet R2 offset is unknown.
However, if we track this correctly we see w1 should have the same bounds
as r8. R8 smax is less than U32 max value so a zero extend load should keep
the same value. Adding a max value of 800 (R8=inv(id=0,smax_value=800)) to
an off=0, as seen in R7 should create a max offset of 800. However at the
end of the snippet we note the R2 max offset is 0xffffFFFF.
R0=inv(id=0,smax_value=800)
R1_w=inv(id=0,umax_value=2147483647,var_off=(0x0; 0x7fffffff))
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R9=inv800 R10=fp0 fp-8=mmmm????
58: (1c) w9 -= w8
59: (bc) w1 = w8
60: (67) r1 <<= 32
61: (77) r1 >>= 32
62: (bf) r2 = r7
63: (0f) r2 += r1
64: (bf) r1 = r6
65: (bc) w3 = w9
66: (b7) r4 = 0
67: (85) call bpf_get_stack#67
R0=inv(id=0,smax_value=800)
R1_w=ctx(id=0,off=0,imm=0)
R2_w=map_value(id=0,off=0,ks=4,vs=1600,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R3_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
R4_w=inv0 R6=ctx(id=0,off=0,imm=0)
R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R9_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
R10=fp0 fp-8=mmmm????
After this patch R1 bounds are not smashed by the <<=32 >>=32 shift and we
get correct bounds on R2 umax_value=800.
Further it reduces 3 insns to 1.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Differential Revision: https://reviews.llvm.org/D73985
2020-05-27 20:18:16 +02:00
|
|
|
// Note that we cannot remove
|
|
|
|
// MOV_32_64 rA, wA
|
|
|
|
// MOV_rr_32 wA, wA
|
|
|
|
// as these two instructions having side effects, zeroing out
|
|
|
|
// top 32 bits of rA.
|
bpf: fix wrong truncation elimination when there is back-edge/loop
Currently, BPF backend is doing truncation elimination. If one truncation
is performed on a value defined by narrow loads, then it could be redundant
given BPF loads zero extend the destination register implicitly.
When the definition of the truncated value is a merging value (PHI node)
that could come from different code paths, then checks need to be done on
all possible code paths.
Above described optimization was introduced as r306685, however it doesn't
work when there is back-edge, for example when loop is used inside BPF
code.
For example for the following code, a zero-extended value should be stored
into b[i], but the "and reg, 0xffff" is wrongly eliminated which then
generates corrupted data.
void cal1(unsigned short *a, unsigned long *b, unsigned int k)
{
unsigned short e;
e = *a;
for (unsigned int i = 0; i < k; i++) {
b[i] = e;
e = ~e;
}
}
The reason is r306685 was trying to do the PHI node checks inside isel
DAG2DAG phase, and the checks are done on MachineInstr. This is actually
wrong, because MachineInstr is being built during isel phase and the
associated information is not completed yet. A quick search shows none
target other than BPF is access MachineInstr info during isel phase.
For an PHI node, when you reached it during isel phase, it may have all
predecessors linked, but not successors. It seems successors are linked to
PHI node only when doing SelectionDAGISel::FinishBasicBlock and this
happens later than PreprocessISelDAG hook.
Previously, BPF program doesn't allow loop, there is probably the reason
why this bug was not exposed.
This patch therefore fixes the bug by the following approach:
- The existing truncation elimination code and the associated
"load_to_vreg_" records are removed.
- Instead, implement truncation elimination using MachineSSA pass, this
is where all information are built, and keep the pass together with other
similar peephole optimizations inside BPFMIPeephole.cpp. Redundant move
elimination logic is updated accordingly.
- Unit testcase included + no compilation errors for kernel BPF selftest.
Patch Review
===
Patch was sent to and reviewed by BPF community at:
https://lore.kernel.org/bpf
Reported-by: David Beckett <david.beckett@netronome.com>
Reviewed-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
llvm-svn: 375007
2019-10-16 17:27:59 +02:00
|
|
|
unsigned Opcode = MI.getOpcode();
|
[BPF] simplify zero extension with MOV_32_64
The current pattern matching for zext results in the following code snippet
being produced,
w1 = w0
r1 <<= 32
r1 >>= 32
Because BPF implementations require zero extension on 32bit loads this
both adds a few extra unneeded instructions but also makes it a bit
harder for the verifier to track the r1 register bounds. For example in
this verifier trace we see at the end of the snippet R2 offset is unknown.
However, if we track this correctly we see w1 should have the same bounds
as r8. R8 smax is less than U32 max value so a zero extend load should keep
the same value. Adding a max value of 800 (R8=inv(id=0,smax_value=800)) to
an off=0, as seen in R7 should create a max offset of 800. However at the
end of the snippet we note the R2 max offset is 0xffffFFFF.
R0=inv(id=0,smax_value=800)
R1_w=inv(id=0,umax_value=2147483647,var_off=(0x0; 0x7fffffff))
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R9=inv800 R10=fp0 fp-8=mmmm????
58: (1c) w9 -= w8
59: (bc) w1 = w8
60: (67) r1 <<= 32
61: (77) r1 >>= 32
62: (bf) r2 = r7
63: (0f) r2 += r1
64: (bf) r1 = r6
65: (bc) w3 = w9
66: (b7) r4 = 0
67: (85) call bpf_get_stack#67
R0=inv(id=0,smax_value=800)
R1_w=ctx(id=0,off=0,imm=0)
R2_w=map_value(id=0,off=0,ks=4,vs=1600,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R3_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
R4_w=inv0 R6=ctx(id=0,off=0,imm=0)
R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
R9_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
R10=fp0 fp-8=mmmm????
After this patch R1 bounds are not smashed by the <<=32 >>=32 shift and we
get correct bounds on R2 umax_value=800.
Further it reduces 3 insns to 1.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Differential Revision: https://reviews.llvm.org/D73985
2020-05-27 20:18:16 +02:00
|
|
|
if (Opcode == BPF::MOV_rr) {
|
Apply llvm-prefer-register-over-unsigned from clang-tidy to LLVM
Summary:
This clang-tidy check is looking for unsigned integer variables whose initializer
starts with an implicit cast from llvm::Register and changes the type of the
variable to llvm::Register (dropping the llvm:: where possible).
Partial reverts in:
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
X86FixupLEAs.cpp - Some functions return unsigned and arguably should be MCRegister
X86FrameLowering.cpp - Some functions return unsigned and arguably should be MCRegister
HexagonBitSimplify.cpp - Function takes BitTracker::RegisterRef which appears to be unsigned&
MachineVerifier.cpp - Ambiguous operator==() given MCRegister and const Register
PPCFastISel.cpp - No Register::operator-=()
PeepholeOptimizer.cpp - TargetInstrInfo::optimizeLoadInstr() takes an unsigned&
MachineTraceMetrics.cpp - MachineTraceMetrics lacks a suitable constructor
Manual fixups in:
ARMFastISel.cpp - ARMEmitLoad() now takes a Register& instead of unsigned&
HexagonSplitDouble.cpp - Ternary operator was ambiguous between unsigned/Register
HexagonConstExtenders.cpp - Has a local class named Register, used llvm::Register instead of Register.
PPCFastISel.cpp - PPCEmitLoad() now takes a Register& instead of unsigned&
Depends on D65919
Reviewers: arsenm, bogner, craig.topper, RKSimon
Reviewed By: arsenm
Subscribers: RKSimon, craig.topper, lenary, aemerson, wuzish, jholewinski, MatzeB, qcolombet, dschuff, jyknight, dylanmckay, sdardis, nemanjai, jvesely, wdng, nhaehnle, sbc100, jgravelle-google, kristof.beyls, hiraditya, aheejin, kbarton, fedor.sergeev, javed.absar, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, atanasyan, rogfer01, MartinMosbeck, brucehoult, the_o, tpr, PkmX, jocewei, jsji, Petar.Avramovic, asbirlea, Jim, s.egerton, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65962
llvm-svn: 369041
2019-08-15 21:22:08 +02:00
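A hedged before/after illustration of the mechanical rewrite (the variable
is hypothetical):
    // Before: the initializer implicitly converts llvm::Register to unsigned.
    unsigned DstReg = MI.getOperand(0).getReg();
    // After: the variable keeps the stronger type; Register still converts
    // to unsigned implicitly where older interfaces require it.
    Register DstReg = MI.getOperand(0).getReg();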
|
|
|
Register dst = MI.getOperand(0).getReg();
|
|
|
|
Register src = MI.getOperand(1).getReg();
|
2018-03-13 07:47:06 +01:00
|
|
|
|
2019-10-16 17:27:59 +02:00
|
|
|
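// A MOV_rr is redundant only when it copies a register onto itself;
// any other move changes machine state and must be kept.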
if (dst != src)
|
2018-03-13 07:47:06 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
ToErase = &MI;
|
|
|
|
RedundantMovElemNum++;
|
|
|
|
Eliminated = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Eliminated;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
|
|
|
INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
|
|
|
|
"BPF PreEmit Peephole Optimization", false, false)
|
|
|
|
|
|
|
|
char BPFMIPreEmitPeephole::ID = 0;
|
|
|
|
FunctionPass* llvm::createBPFMIPreEmitPeepholePass()
|
|
|
|
{
|
|
|
|
return new BPFMIPreEmitPeephole();
|
|
|
|
}
|
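For context, a hedged sketch of how such a factory function is typically
hooked into the BPF pass pipeline (the exact hook and surrounding code are
assumptions, not quoted from BPFTargetMachine.cpp):
    void BPFPassConfig::addPreEmitPass() {
      // Runs after register allocation, just before machine code emission.
      addPass(createBPFMIPreEmitPeepholePass());
    }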
2019-10-16 17:27:59 +02:00
|
|
|
|
|
|
|
STATISTIC(TruncElemNum, "Number of truncations eliminated");
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
struct BPFMIPeepholeTruncElim : public MachineFunctionPass {
|
|
|
|
|
|
|
|
static char ID;
|
|
|
|
const BPFInstrInfo *TII;
|
|
|
|
MachineFunction *MF;
|
|
|
|
MachineRegisterInfo *MRI;
|
|
|
|
|
|
|
|
BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) {
|
|
|
|
initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Initialize class variables.
|
|
|
|
void initialize(MachineFunction &MFParm);
|
|
|
|
|
|
|
|
bool eliminateTruncSeq();
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
// Main entry point for this pass.
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
|
|
if (skipFunction(MF.getFunction()))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
initialize(MF);
|
|
|
|
|
|
|
|
return eliminateTruncSeq();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
static bool TruncSizeCompatible(int TruncSize, unsigned opcode)
|
|
|
|
{
|
|
|
|
if (TruncSize == 1)
|
|
|
|
return opcode == BPF::LDB || opcode == BPF::LDB32;
|
|
|
|
|
|
|
|
if (TruncSize == 2)
|
|
|
|
return opcode == BPF::LDH || opcode == BPF::LDH32;
|
|
|
|
|
|
|
|
if (TruncSize == 4)
|
|
|
|
return opcode == BPF::LDW || opcode == BPF::LDW32;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
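Illustrative calls (not from the file) showing how the width check behaves:
    TruncSizeCompatible(2, BPF::LDH);  // true:  LDH zero-extends 16 bits,
                                       //        so "and reg, 0xffff" is a no-op
    TruncSizeCompatible(2, BPF::LDB);  // false: widths must match exactly; the
                                       //        (harmless) byte-load case is
                                       //        conservatively left alone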
|
|
|
|
|
|
|
|
// Initialize class variables.
|
|
|
|
void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
|
|
|
|
MF = &MFParm;
|
|
|
|
MRI = &MF->getRegInfo();
|
|
|
|
TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
|
|
|
|
LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reg truncating is often the result of 8/16/32bit->64bit or
|
|
|
|
// 8/16bit->32bit conversions. If the reg value is loaded with the
|
|
|
|
// masked byte width, the AND operation can be removed since
|
|
|
|
// BPF LOAD already has zero extension.
|
|
|
|
//
|
|
|
|
// This also solves a correctness issue.
|
|
|
|
// In BPF socket-related programs, e.g., __sk_buff->{data, data_end}
|
|
|
|
// are 32-bit registers, but later on, the kernel verifier will rewrite
|
|
|
|
// them with 64-bit values. Therefore, truncating the value after the
|
|
|
|
// load will result in incorrect code.
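// For example (an illustrative sketch, not code from this patch): the
// common packet-access idiom
//   void *data = (void *)(long)skb->data;
// must keep all 64 bits after the verifier's rewrite; masking the value
// back to 32 bits would produce a wrong pointer.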
|
|
|
|
bool BPFMIPeepholeTruncElim::eliminateTruncSeq() {
|
|
|
|
MachineInstr* ToErase = nullptr;
|
|
|
|
bool Eliminated = false;
|
|
|
|
|
|
|
|
for (MachineBasicBlock &MBB : *MF) {
|
|
|
|
for (MachineInstr &MI : MBB) {
|
|
|
|
// The second insn to remove if the elimination candidate is a pair.
|
|
|
|
MachineInstr *MI2 = nullptr;
|
|
|
|
Register DstReg, SrcReg;
|
|
|
|
MachineInstr *DefMI;
|
|
|
|
int TruncSize = -1;
|
|
|
|
|
|
|
|
// If the previous instruction was marked for elimination, remove it now.
|
|
|
|
if (ToErase) {
|
|
|
|
ToErase->eraseFromParent();
|
|
|
|
ToErase = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// AND A, 0xFFFFFFFF will be turned into an SLL/SRL pair because the
|
|
|
|
// immediate for BPF ANDI is i32; this case only happens on ALU64.
|
|
|
|
if (MI.getOpcode() == BPF::SRL_ri &&
|
|
|
|
MI.getOperand(2).getImm() == 32) {
|
|
|
|
SrcReg = MI.getOperand(1).getReg();
|
BPF: Fix a bug in peephole TRUNC elimination optimization
Andrei Matei reported an llvm11 core dump for his bpf program
https://bugs.llvm.org/show_bug.cgi?id=48578
The core dump happens in LiveVariables analysis phase.
#4 0x00007fce54356bb0 __restore_rt
#5 0x00007fce4d51785e llvm::LiveVariables::HandleVirtRegUse(unsigned int,
llvm::MachineBasicBlock*, llvm::MachineInstr&)
#6 0x00007fce4d519abe llvm::LiveVariables::runOnInstr(llvm::MachineInstr&,
llvm::SmallVectorImpl<unsigned int>&)
#7 0x00007fce4d519ec6 llvm::LiveVariables::runOnBlock(llvm::MachineBasicBlock*, unsigned int)
#8 0x00007fce4d51a4bf llvm::LiveVariables::runOnMachineFunction(llvm::MachineFunction&)
The bug can be reproduced with llvm12 and the latest trunk as well.
Further analysis shows that there is a bug in the BPF peephole
TRUNC elimination optimization, which tries to remove
unnecessary TRUNC operations (a <<= 32; a >>= 32).
Specifically, the compiler did wrong transformation for the
following patterns:
%1 = LDW ...
%2 = SLL_ri %1, 32
%3 = SRL_ri %2, 32
... %3 ...
%4 = SRA_ri %2, 32
... %4 ...
The current transformation did not check how many uses %2 has
and did a transformation like
%1 = LDW ...
... %1 ...
%4 = SRA_ri %2, 32
... %4 ...
and pseudo register %2 is used but not defined, which
caused the LiveVariables analysis core dump.
To fix the issue, when traversing back from SRL_ri to SLL_ri,
check to ensure the SLL_ri result has only one use. Otherwise, don't
do the transformation.
Differential Revision: https://reviews.llvm.org/D97792
2021-03-02 18:35:21 +01:00
|
|
|
if (!MRI->hasOneNonDBGUse(SrcReg))
|
|
|
|
continue;
|
|
|
|
|
2019-10-16 17:27:59 +02:00
|
|
|
MI2 = MRI->getVRegDef(SrcReg);
|
|
|
|
DstReg = MI.getOperand(0).getReg();
|
|
|
|
|
|
|
|
if (!MI2 ||
|
|
|
|
MI2->getOpcode() != BPF::SLL_ri ||
|
|
|
|
MI2->getOperand(2).getImm() != 32)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Update SrcReg.
|
|
|
|
SrcReg = MI2->getOperand(1).getReg();
|
|
|
|
DefMI = MRI->getVRegDef(SrcReg);
|
|
|
|
if (DefMI)
|
|
|
|
TruncSize = 4;
|
|
|
|
} else if (MI.getOpcode() == BPF::AND_ri ||
|
|
|
|
MI.getOpcode() == BPF::AND_ri_32) {
|
|
|
|
SrcReg = MI.getOperand(1).getReg();
|
|
|
|
DstReg = MI.getOperand(0).getReg();
|
|
|
|
DefMI = MRI->getVRegDef(SrcReg);
|
|
|
|
|
|
|
|
if (!DefMI)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
int64_t imm = MI.getOperand(2).getImm();
|
|
|
|
if (imm == 0xff)
|
|
|
|
TruncSize = 1;
|
|
|
|
else if (imm == 0xffff)
|
|
|
|
TruncSize = 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (TruncSize == -1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// If the definition is a PHI node, check all its inputs.
|
|
|
|
if (DefMI->isPHI()) {
|
|
|
|
bool CheckFail = false;
|
|
|
|
|
|
|
|
for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
|
|
|
|
MachineOperand &opnd = DefMI->getOperand(i);
|
|
|
|
if (!opnd.isReg()) {
|
|
|
|
CheckFail = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
|
|
|
|
if (!PhiDef || PhiDef->isPHI() ||
|
|
|
|
!TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) {
|
|
|
|
CheckFail = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CheckFail)
|
|
|
|
continue;
|
|
|
|
} else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg)
|
|
|
|
.addReg(SrcReg);
|
|
|
|
|
|
|
|
if (MI2)
|
|
|
|
MI2->eraseFromParent();
|
|
|
|
|
|
|
|
// Record it in ToErase, and erase it on the next iteration.
|
|
|
|
ToErase = &MI;
|
|
|
|
TruncElemNum++;
|
|
|
|
Eliminated = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Eliminated;
|
|
|
|
}
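Putting it together, the net rewrite this function performs looks like the
following (virtual register numbers are illustrative, in the same style as
the patterns above):
    Before:                       After:
      %1 = LDH ...                  %1 = LDH ...
      %2 = AND_ri %1, 0xffff        %2 = MOV_rr %1
      ... %2 ...                    ... %2 ...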
|
|
|
|
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
|
|
|
INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim",
|
|
|
|
"BPF MachineSSA Peephole Optimization For TRUNC Eliminate",
|
|
|
|
false, false)
|
|
|
|
|
|
|
|
char BPFMIPeepholeTruncElim::ID = 0;
|
|
|
|
FunctionPass* llvm::createBPFMIPeepholeTruncElimPass()
|
|
|
|
{
|
|
|
|
return new BPFMIPeepholeTruncElim();
|
|
|
|
}
|
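And, as with the pre-emit pass earlier, a hedged sketch of where a
MachineSSA peephole like this one would typically be added (hook name per
the usual TargetPassConfig convention; treat the exact placement as an
assumption):
    void BPFPassConfig::addMachineSSAOptimization() {
      // Run while the function is still in SSA form, before register
      // allocation, so MRI->getVRegDef() sees the full def-use web.
      addPass(createBPFMIPeepholeTruncElimPass());
      TargetPassConfig::addMachineSSAOptimization();
    }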