From a33973f40346d2fab0edf0dc1e9403d376100c5d Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Mon, 7 Jan 2019 15:52:28 +0000
Subject: [PATCH] AMDGPU: test for uniformity of branch instruction, not its condition

Summary:
If a divergent branch instruction is marked as divergent by propagation
rule 2 in DivergencePropagator::exploreSyncDependency() and its condition
is uniform, that branch would incorrectly be assumed to be uniform.

Reviewers: arsenm, tstellar

Reviewed By: arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D56331

llvm-svn: 350532
---
 .../AMDGPU/AMDGPUAnnotateUniformValues.cpp    | 10 +-
 lib/Target/AMDGPU/SIAnnotateControlFlow.cpp   |  2 +-
 .../divergent-branch-uniform-condition.ll     | 97 +++++++++++++++++++
 3 files changed, 100 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 4e0cc736bad..f88e3b0dac8 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -117,14 +117,8 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
 }
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
-  if (I.isUnconditional())
-    return;
-
-  Value *Cond = I.getCondition();
-  if (!DA->isUniform(Cond))
-    return;
-
-  setUniformMetadata(I.getParent()->getTerminator());
+  if (DA->isUniform(&I))
+    setUniformMetadata(I.getParent()->getTerminator());
 }
 
 void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 90f430d5ca4..98e9ea66232 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -155,7 +155,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 /// Is the branch condition uniform or did the StructurizeCFG pass
 /// consider it as such?
 bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
-  return DA->isUniform(T->getCondition()) ||
+  return DA->isUniform(T) ||
          T->getMetadata("structurizecfg.uniform") != nullptr;
 }
 
diff --git a/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
new file mode 100644
index 00000000000..8d21050ebee
--- /dev/null
+++ b/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; This module creates a divergent branch. The branch is marked as divergent by
+; the divergence analysis but the condition is not. This test ensures that the
+; divergence of the branch is tested, not its condition, so that the branch is
+; correctly emitted as divergent.
+
+target triple = "amdgcn-mesa-mesa3d"
+
+define amdgpu_ps void @main(i32, float) {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %start
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_mov_b32 m0, s0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
+; CHECK-NEXT:    v_cmp_nlt_f32_e64 s[0:1], 0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; CHECK-NEXT:  BB0_1: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v1
+; CHECK-NEXT:    s_and_b64 vcc, exec, vcc
+; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    s_cbranch_vccz BB0_5
+; CHECK-NEXT:  ; %bb.2: ; %endif1
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT:    ; mask branch BB0_4
+; CHECK-NEXT:  BB0_3: ; %endif2
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT:  BB0_4: ; %Flow1
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    s_branch BB0_6
+; CHECK-NEXT:  BB0_5: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:  BB0_6: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_cbranch_execnz BB0_1
+; CHECK-NEXT:  ; %bb.7: ; %Flow2
+; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; this is the divergent branch with the condition not marked as divergent
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; CHECK-NEXT:    ; mask branch BB0_9
+; CHECK-NEXT:  BB0_8: ; %if1
+; CHECK-NEXT:    v_sqrt_f32_e32 v1, v0
+; CHECK-NEXT:  BB0_9: ; %endloop
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
+; CHECK-NEXT:    s_endpgm
+start:
+  %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
+  br label %loop
+
+loop:
+  %v1 = phi i32 [ 0, %start ], [ %v5, %endif2 ]
+  %v2 = icmp ugt i32 %v1, 31
+  br i1 %v2, label %if1, label %endif1
+
+if1:
+  %v3 = call float @llvm.sqrt.f32(float %v0)
+  br label %endloop
+
+endif1:
+  %v4 = fcmp ogt float %v0, 0.000000e+00
+  br i1 %v4, label %endloop, label %endif2
+
+endif2:
+  %v5 = add i32 %v1, 1
+  br label %loop
+
+endloop:
+  %v6 = phi float [ %v3, %if1 ], [ 0.0, %endif1 ]
+  call void @llvm.amdgcn.exp.v4f32(i32 0, i32 15, float %v6, float %v6, float %v6, float %v6, i1 true, i1 true)
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.v4f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
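
Note on the pattern the patch establishes: passes should query the divergence analysis on the branch instruction itself rather than on its condition, because sync-dependency propagation (rule 2 mentioned in the summary) can mark the terminator divergent even though its condition value is uniform. Below is a minimal sketch of that check. It is not part of the patch; it assumes the LegacyDivergenceAnalysis interface visible in the diffs above (isUniform() accepting any Value, including a terminator instruction), and the helper name branchRequiresDivergentLowering is purely illustrative.

// Illustrative sketch only, not part of this patch. Assumes an LLVM tree of
// roughly this era, where the legacy divergence analysis exposes
// isUniform(const Value *) and a BranchInst can be passed as that Value.
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: true when a conditional branch must be lowered as
// divergent. Querying the BranchInst itself (the post-patch behaviour) also
// catches branches marked divergent by sync-dependency propagation, which a
// query on the condition alone (the pre-patch behaviour) would miss.
static bool branchRequiresDivergentLowering(LegacyDivergenceAnalysis *DA,
                                            BranchInst &BI) {
  if (BI.isUnconditional())
    return false;
  // Pre-patch form, incorrect for the case this patch fixes:
  //   return !DA->isUniform(BI.getCondition());
  return !DA->isUniform(&BI);
}

Both changed call sites above, AMDGPUAnnotateUniformValues::visitBranchInst and SIAnnotateControlFlow::isUniform, now follow this shape.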