From 0c2a21406a21c31d3abc872d725f2eadff9a4360 Mon Sep 17 00:00:00 2001
From: Vang Thao <vangth1995@gmail.com>
Date: Thu, 6 Aug 2020 20:46:27 -0700
Subject: [PATCH] [AMDGPU] Fix not rescheduling without clustering

Regions that should be rescheduled without memory op clustering are
sometimes skipped. RegionIdx is not incremented when the loop skips a
region that is not flagged for rescheduling, so the index is wrong for
every region that follows it.

Thanks to Vang Thao for discovering this bug!

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D85498
---
 lib/Target/AMDGPU/GCNSchedStrategy.cpp                |  4 +++-
 .../AMDGPU/schedule-regpressure-limit-clustering.ll   | 11 ++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index deed50b6db7..c2feb0ce25f 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -567,8 +567,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
       SavedMutations.swap(Mutations);
 
     for (auto Region : Regions) {
-      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+      if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) {
+        ++RegionIdx;
         continue;
+      }
 
       RegionBegin = Region.first;
       RegionEnd = Region.second;
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
index 884d0cbd4db..139669bbe6d 100644
--- a/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -3,6 +3,9 @@
 ; Interleave loads and stores to fit into 9 VGPR limit.
 ; This requires to avoid load/store clustering.
 
+; Reschedule the second scheduling region without clustering while
+; the first region is skipped.
+
 ; GCN: global_load_dwordx4
 ; GCN: global_store_dwordx4
 ; GCN: global_load_dwordx4
@@ -12,10 +15,13 @@
 ; GCN: NumVgprs: {{[0-9]$}}
 ; GCN: ScratchSize: 0{{$}}
 
-define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
+define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1, i1 %cnd) #1 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
+  br i1 %cnd, label %bb1, label %bb2
+
+bb1:
   %tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
   %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
   %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
@@ -27,6 +33,9 @@ bb:
   store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
   %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
   store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
+  br label %bb2
+
+bb2:
   ret void
 }