From 11c2f5c876c3d406b4284a3a168b200d4b63a311 Mon Sep 17 00:00:00 2001 From: Joel Jones Date: Sat, 9 Dec 2017 23:59:55 +0000 Subject: [PATCH] [AArch64] Improve loop unrolling performance on Cavium T99 This patch improves performance on Cavium T99 as shown here (libquantum 0.2.4): https://docs.google.com/spreadsheets/d/1Lo1o2E1NjrpkwS7DvYYWsiVvPdd93h7KBaqeptMrZPY/edit?usp=sharing By increasing the LoopMicroOpBufferSize in the Cavium T99 Scheduler file, loop unrolling becomes more aggressive. This helps performance on T99. Test case included. Patch by Stefan Teleman Differential Revision: https://reviews.llvm.org/D40695 llvm-svn: 320272 --- .../AArch64/AArch64SchedThunderX2T99.td | 2 +- .../AArch64/loop-micro-op-buffer-size-t99.ll | 124 ++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll diff --git a/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td index fd60459382a..22f272edd68 100644 --- a/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -22,7 +22,7 @@ def ThunderX2T99Model : SchedMachineModel { let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 12; // Extra cycles for mispredicted branch. // Determined via a mix of micro-arch details and experimentation. - let LoopMicroOpBufferSize = 32; + let LoopMicroOpBufferSize = 128; let PostRAScheduler = 1; // Using PostRA sched. 
let CompleteModel = 1;
diff --git a/test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll b/test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll
new file mode 100644
index 00000000000..d64b51509e1
--- /dev/null
+++ b/test/CodeGen/AArch64/loop-micro-op-buffer-size-t99.ll
@@ -0,0 +1,124 @@
+; REQUIRES: asserts
+; RUN: opt -mcpu=thunderx2t99 -loop-unroll --debug-only=loop-unroll -S -unroll-allow-partial < %s 2>&1 | FileCheck %s
+; Expects both loops of @foo to be partially unrolled by 4 for thunderx2t99.
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: Loop Unroll: F[foo] Loop %loop.2.header
+; CHECK: Loop Size = 19
+; CHECK: Trip Count = 512
+; CHECK: Trip Multiple = 512
+; CHECK: UNROLLING loop %loop.2.header by 4 with a breakout at trip 0
+; CHECK: Merging:
+; CHECK: Loop Unroll: F[foo] Loop %loop.header
+; CHECK: Loop Size = 18
+; CHECK: Trip Count = 512
+; CHECK: Trip Multiple = 512
+; CHECK: UNROLLING loop %loop.header by 4 with a breakout at trip 0
+; CHECK: Merging:
+; CHECK: %counter = phi i32 [ 0, %entry ], [ %inc.3, %loop.inc.3 ]
+; CHECK: %val = add nuw nsw i32 %counter, 5
+; CHECK: %val1 = add nuw nsw i32 %counter, 6
+; CHECK: %val2 = add nuw nsw i32 %counter, 7
+; CHECK: %val3 = add nuw nsw i32 %counter, 8
+; CHECK: %val4 = add nuw nsw i32 %counter, 9
+; CHECK: %val5 = add nuw nsw i32 %counter, 10
+; CHECK-NOT: %val = add i32 %counter, 5
+; CHECK-NOT: %val1 = add i32 %counter, 6
+; CHECK-NOT: %val2 = add i32 %counter, 7
+; CHECK-NOT: %val3 = add i32 %counter, 8
+; CHECK-NOT: %val4 = add i32 %counter, 9
+; CHECK-NOT: %val5 = add i32 %counter, 10
+; CHECK: %counter.2 = phi i32 [ 0, %exit.0 ], [ %inc.2.3, %loop.2.inc.3 ]
+
+define void @foo(i32 * %out) {
+entry:
+  %0 = alloca [1024 x i32]
+  %x0 = alloca [1024 x i32]
+  %x01 = alloca [1024 x i32]
+  %x02 = alloca [1024 x i32]
+  %x03 = alloca [1024 x i32]
+  %x04 = alloca [1024 x i32]
+  %x05 = alloca [1024 x i32]
+  %x06 = alloca [1024 x i32]
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter
+  store i32 %counter, i32* %ptr
+  %val = add i32 %counter, 5
+  %xptr = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter
+  store i32 %val, i32* %xptr
+  %val1 = add i32 %counter, 6
+  %xptr1 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter
+  store i32 %val1, i32* %xptr1
+  %val2 = add i32 %counter, 7
+  %xptr2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter
+  store i32 %val2, i32* %xptr2
+  %val3 = add i32 %counter, 8
+  %xptr3 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter
+  store i32 %val3, i32* %xptr3
+  %val4 = add i32 %counter, 9
+  %xptr4 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter
+  store i32 %val4, i32* %xptr4
+  %val5 = add i32 %counter, 10
+  %xptr5 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter
+  store i32 %val5, i32* %xptr5
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 2 ; the induction variable advances by 2 each iteration
+  %1 = icmp sge i32 %inc, 1023
+  br i1 %1, label %exit.0, label %loop.header
+
+exit.0:
+  %2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 5
+  %3 = load i32, i32* %2
+  store i32 %3, i32 * %out
+  br label %loop.2.header
+
+; Second loop: same body as the first plus one extra store (into %x06).
+loop.2.header:
+  %counter.2 = phi i32 [0, %exit.0], [%inc.2, %loop.2.inc]
+  br label %loop.2.body
+
+loop.2.body:
+  %ptr.2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter.2
+  store i32 %counter.2, i32* %ptr.2
+  %val.2 = add i32 %counter.2, 5
+  %xptr.2 = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter.2
+  store i32 %val.2, i32* %xptr.2
+  %val1.2 = add i32 %counter.2, 6
+  %xptr1.2 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter.2
+  store i32 %val1.2, i32* %xptr1.2 ; stores the value computed in this loop
+  %val2.2 = add i32 %counter.2, 7
+  %xptr2.2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter.2
+  store i32 %val2.2, i32* %xptr2.2 ; stores the value computed in this loop
+  %val3.2 = add i32 %counter.2, 8
+  %xptr3.2 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter.2
+  store i32 %val3.2, i32* %xptr3.2
+  %val4.2 = add i32 %counter.2, 9
+  %xptr4.2 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter.2
+  store i32 %val4.2, i32* %xptr4.2
+  %val5.2 = add i32 %counter.2, 10
+  %xptr5.2 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter.2
+  store i32 %val5.2, i32* %xptr5.2
+  %xptr6.2 = getelementptr [1024 x i32], [1024 x i32]* %x06, i32 0, i32 %counter.2
+  store i32 %val5.2, i32* %xptr6.2
+  br label %loop.2.inc
+
+loop.2.inc:
+  %inc.2 = add i32 %counter.2, 2
+  %4 = icmp sge i32 %inc.2, 1023
+  br i1 %4, label %exit.2, label %loop.2.header
+
+exit.2:
+  %x2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 6
+  %x3 = load i32, i32* %x2
+  %out2 = getelementptr i32, i32 * %out, i32 1
+  store i32 %x3, i32 * %out2 ; writes the element just loaded into the second slot of %out
+  ret void
+}