AMDGPU: test for uniformity of branch instruction, not its condition

Summary: If a branch instruction is marked as divergent by propagation rule 2 in DivergencePropagator::exploreSyncDependency() while its condition is uniform, testing only the condition would incorrectly treat that branch as uniform. Reviewers: arsenm, tstellar Reviewed By: arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D56331 llvm-svn: 350532
2024-11-22 18:54:02 +01:00 · 2019-01-07 15:52:28 +00:00 · 2019-01-07 15:52:28 +00:00 · a33973f403
commit a33973f403
parent 81bc305168
3 changed files with 100 additions and 9 deletions
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -117,13 +117,7 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
 }

 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
-  if (I.isUnconditional())
-    return;
-
-  Value *Cond = I.getCondition();
-  if (!DA->isUniform(Cond))
-    return;
-
+  if (DA->isUniform(&I))
    setUniformMetadata(I.getParent()->getTerminator());
 }

--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -155,7 +155,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 /// Is the branch condition uniform or did the StructurizeCFG pass
 /// consider it as such?
 bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
-  return DA->isUniform(T->getCondition()) ||
+  return DA->isUniform(T) ||
         T->getMetadata("structurizecfg.uniform") != nullptr;
 }

--- a/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; This module creates a divergent branch. The divergence analysis marks the
+; branch itself as divergent, but not its condition. This test ensures that
+; the divergence of the branch is tested, not that of its condition, so that
+; the branch is correctly emitted as divergent.
+
+target triple = "amdgcn-mesa-mesa3d"
+
+define amdgpu_ps void @main(i32, float) {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %start
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    s_mov_b32 m0, s0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
+; CHECK-NEXT:    v_cmp_nlt_f32_e64 s[0:1], 0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; CHECK-NEXT:  BB0_1: ; %loop
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v1
+; CHECK-NEXT:    s_and_b64 vcc, exec, vcc
+; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    s_cbranch_vccz BB0_5
+; CHECK-NEXT:  ; %bb.2: ; %endif1
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT:    ; mask branch BB0_4
+; CHECK-NEXT:  BB0_3: ; %endif2
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT:  BB0_4: ; %Flow1
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    s_branch BB0_6
+; CHECK-NEXT:  BB0_5: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:  BB0_6: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_cbranch_execnz BB0_1
+; CHECK-NEXT:  ; %bb.7: ; %Flow2
+; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; This is the divergent branch whose condition is not marked as divergent.
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; CHECK-NEXT:    ; mask branch BB0_9
+; CHECK-NEXT:  BB0_8: ; %if1
+; CHECK-NEXT:    v_sqrt_f32_e32 v1, v0
+; CHECK-NEXT:  BB0_9: ; %endloop
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
+; CHECK-NEXT:    s_endpgm
+start:
+  %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
+  br label %loop
+
+loop:
+  %v1 = phi i32 [ 0, %start ], [ %v5, %endif2 ]
+  %v2 = icmp ugt i32 %v1, 31
+  br i1 %v2, label %if1, label %endif1
+
+if1:
+  %v3 = call float @llvm.sqrt.f32(float %v0)
+  br label %endloop
+
+endif1:
+  %v4 = fcmp ogt float %v0, 0.000000e+00
+  br i1 %v4, label %endloop, label %endif2
+
+endif2:
+  %v5 = add i32 %v1, 1
+  br label %loop
+
+endloop:
+  %v6 = phi float [ %v3, %if1 ], [ 0.0, %endif1 ]
+  call void @llvm.amdgcn.exp.v4f32(i32 0, i32 15, float %v6, float %v6, float %v6, float %v6, i1 true, i1 true)
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.v4f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }