mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
1770a4b5f8
Currently, the BPF backend performs truncation elimination. If a truncation is performed on a value defined by a narrow load, it may be redundant, given that BPF loads implicitly zero-extend the destination register. When the definition of the truncated value is a merging value (PHI node) that could come from different code paths, the check needs to be done on all possible code paths. The optimization described above was introduced in r306685; however, it doesn't work when there is a back-edge, for example when a loop is used inside BPF code. For example, for the following code, a zero-extended value should be stored into b[i], but the "and reg, 0xffff" is wrongly eliminated, which then generates corrupted data. void cal1(unsigned short *a, unsigned long *b, unsigned int k) { unsigned short e; e = *a; for (unsigned int i = 0; i < k; i++) { b[i] = e; e = ~e; } } The reason is that r306685 was trying to do the PHI node checks inside the isel DAG2DAG phase, and the checks are done on MachineInstr. This is actually wrong, because MachineInstr is still being built during the isel phase and the associated information is not complete yet. A quick search shows that no target other than BPF accesses MachineInstr info during the isel phase. For a PHI node, when you reach it during the isel phase, it may have all predecessors linked, but not successors. It seems successors are linked to the PHI node only when doing SelectionDAGISel::FinishBasicBlock, and this happens later than the PreprocessISelDAG hook. Previously, BPF programs did not allow loops; that is probably the reason why this bug was not exposed. This patch therefore fixes the bug with the following approach: - The existing truncation elimination code and the associated "load_to_vreg_" records are removed. - Instead, truncation elimination is implemented as a MachineSSA pass, which is where all the information has been built, and the pass is kept together with other similar peephole optimizations inside BPFMIPeephole.cpp. The redundant move elimination logic is updated accordingly.
- Unit testcase included + no compilation errors for kernel BPF selftest. Patch Review === Patch was sent to and reviewed by BPF community at: https://lore.kernel.org/bpf Reported-by: David Beckett <david.beckett@netronome.com> Reviewed-by: Yonghong Song <yhs@fb.com> Signed-off-by: Jiong Wang <jiong.wang@netronome.com> llvm-svn: 375007
81 lines
2.8 KiB
LLVM
81 lines
2.8 KiB
LLVM
; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -march=bpf -mattr=+alu32 -verify-machineinstrs | FileCheck --check-prefix=CHECK-32 %s
;
; void cal1(unsigned short *a, unsigned long *b, unsigned int k)
; {
;   unsigned short e;
;
;   e = *a;
;   for (unsigned int i = 0; i < k; i++) {
;     b[i] = e;
;     e = ~e;
;   }
; }
;
; void cal2(unsigned short *a, unsigned int *b, unsigned int k)
; {
;   unsigned short e;
;
;   e = *a;
;   for (unsigned int i = 0; i < k; i++) {
;     b[i] = e;
;     e = ~e;
;   }
; }
|
|
; Function Attrs: nofree norecurse nounwind optsize
;
; cal1: on every loop iteration, zero-extends the 16-bit value %e.09 to i64 and
; stores it into b[i]. %e.09 is a PHI that merges the initial i16 load (%0)
; with the loop-carried %neg (the xor result) arriving over the back-edge, so
; the test expects the 16-bit mask ("&= 65535") to be present in the generated
; code for both RUN configurations.
;
; The LABEL directives anchor the checks below to this function's output so
; that a mask emitted for another function cannot satisfy them.
; CHECK-LABEL: cal1:
; CHECK-32-LABEL: cal1:
define dso_local void @cal1(i16* nocapture readonly %a, i64* nocapture %b, i32 %k) local_unnamed_addr #0 {
entry:
  ; Skip the loop entirely when k == 0.
  %cmp8 = icmp eq i32 %k, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; e = *a (narrow 16-bit load), and the trip count widened to 64 bits.
  %0 = load i16, i16* %a, align 2
  %wide.trip.count = zext i32 %k to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Merging value: the loaded i16 on loop entry, the inverted value on the back-edge.
  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
  %conv = zext i16 %e.09 to i64
  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv
; CHECK: r{{[0-9]+}} &= 65535
; CHECK-32: r{{[0-9]+}} &= 65535
  store i64 %conv, i64* %arrayidx, align 8
  ; e = ~e, computed in i16.
  %neg = xor i16 %e.09, -1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
|
|
|
|
; Function Attrs: nofree norecurse nounwind optsize
;
; cal2: same loop as cal1, but the 16-bit value %e.09 is zero-extended to i32
; and stored into a 32-bit b[i]. %e.09 is a PHI that merges the initial i16
; load (%0) with the loop-carried %neg (the xor result) arriving over the
; back-edge, so the test expects the 16-bit mask ("&= 65535") to be present.
; With -mattr=+alu32 (CHECK-32 prefix) the mask is expected on a wN register
; rather than an rN register.
;
; The LABEL directives anchor the checks below to this function's output so
; that a mask emitted for another function cannot satisfy them.
; CHECK-LABEL: cal2:
; CHECK-32-LABEL: cal2:
define dso_local void @cal2(i16* nocapture readonly %a, i32* nocapture %b, i32 %k) local_unnamed_addr #0 {
entry:
  ; Skip the loop entirely when k == 0.
  %cmp8 = icmp eq i32 %k, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; e = *a (narrow 16-bit load), and the trip count widened to 64 bits.
  %0 = load i16, i16* %a, align 2
  %wide.trip.count = zext i32 %k to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Merging value: the loaded i16 on loop entry, the inverted value on the back-edge.
  %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ]
  %conv = zext i16 %e.09 to i32
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
; CHECK: r{{[0-9]+}} &= 65535
; CHECK-32: w{{[0-9]+}} &= 65535
  store i32 %conv, i32* %arrayidx, align 4
  ; e = ~e, computed in i16.
  %neg = xor i16 %e.09, -1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
|