[NVPTX] Let there be One True Way to set NVVMReflect params.

Summary: Previously there were three ways to inform the NVVMReflect pass whether you wanted to flush denormals to zero: * An LLVM command-line option * Parameters to the NVVMReflect constructor * Metadata on the module itself. This change removes the first two, leaving only the third. The motivation for this change, aside from simplifying things, is that we want LLVM to be aware of whether it's operating in FTZ mode, so other passes can use this information. Ideally we'd have a target-generic piece of metadata on the module. This change moves us in that direction. Reviewers: tra Subscribers: jholewinski, llvm-commits Differential Revision: https://reviews.llvm.org/D28700 llvm-svn: 292068
2025-01-31 20:51:52 +01:00 · 2017-01-15 16:54:35 +00:00 · 2017-01-15 16:54:35 +00:00 · b814633873
commit b814633873
parent ea3cad6409
4 changed files with 66 additions and 111 deletions
--- a/docs/NVPTXUsage.rst
+++ b/docs/NVPTXUsage.rst
@ -289,7 +289,7 @@ code often follows a pattern:
      return my_function_precise(a);
  }

-The default value for all unspecified reflection parameters is zero. 
+The default value for all unspecified reflection parameters is zero.

 The ``NVVMReflect`` pass should be executed early in the optimization
 pipeline, immediately after the link stage. The ``internalize`` pass is also
@ -326,6 +326,18 @@ often leave behind dead code of the form:
 Therefore, it is recommended that ``NVVMReflect`` is executed early in the
 optimization pipeline before dead-code elimination.

+The NVPTX TargetMachine knows how to schedule ``NVVMReflect`` at the beginning
+of your pass manager; just use the following code when setting up your pass
+manager:
+
+.. code-block:: c++
+    std::unique_ptr<TargetMachine> TM = ...;
+    PassManagerBuilder PMBuilder(...);
+    PMBuilder.addExtension(
+        PassManagerBuilder::EP_EarlyAsPossible,
+        [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+          TM->addEarlyAsPossiblePasses(PM);
+        });

 Reflection Parameters
 ---------------------
@ -339,35 +351,16 @@ Flag                 Description
 ``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
 ==================== ======================================================

+The value of this flag is determined by the "nvvm-reflect-ftz" module flag.
+The following sets the ftz flag to 1.

-Invoking NVVMReflect
--------------------
-
-To ensure that all dead code caused by the reflection pass is eliminated, it
-is recommended that the reflection pass is executed early in the LLVM IR
-optimization pipeline. The pass takes an optional mapping of reflection
-parameter name to an integer value. This mapping can be specified as either a
-command-line option to ``opt`` or as an LLVM ``StringMap<int>`` object when
-programmatically creating a pass pipeline.
-
-With ``opt``:
-
-.. code-block:: text
-
-  # opt -nvvm-reflect -nvvm-reflect-list=<var>=<value>,<var>=<value> module.bc -o module.reflect.bc
-
-
-With programmatic pass pipeline:
-
-.. code-block:: c++
-
-  extern FunctionPass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping);
-
-  StringMap<int> ReflectParams;
-  ReflectParams["__CUDA_FTZ"] = 1;
-  Passes.add(createNVVMReflectPass(ReflectParams));
-
+.. code-block:: llvm
+    !llvm.module.flag = !{!0}
+    !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}

+(``i32 4`` indicates that the value set here overrides the value in another
+module we link with.  See the `LangRef <LangRef.html#module-flags-metadata>`
+for details.)

 Executing PTX
 =============
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@ -48,7 +48,6 @@ ModulePass *createGenericToNVVMPass();
 FunctionPass *createNVPTXInferAddressSpacesPass();
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass();
-FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@ -10,11 +10,10 @@
 // This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
 // with an integer.
 //
-// We choose the value we use by looking, in this order, at:
-//
-//  * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
-//  * the StringMap passed to the pass's constructor, and
-//  * metadata in the module itself.
+// We choose the value we use by looking at metadata in the module itself.  Note
+// that we intentionally only have one way to choose these values, because other
+// parts of LLVM (particularly, InstCombineCall) rely on being able to predict
+// the values chosen by this pass.
 //
 // If we see an unknown string, we replace its call with 0.
 //
@ -49,30 +48,17 @@ namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }

 namespace {
 class NVVMReflect : public FunctionPass {
-private:
-  StringMap<int> VarMap;
-
 public:
  static char ID;
-  NVVMReflect() : NVVMReflect(StringMap<int>()) {}
-
-  NVVMReflect(const StringMap<int> &Mapping)
-      : FunctionPass(ID), VarMap(Mapping) {
+  NVVMReflect() : FunctionPass(ID) {
    initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
-    setVarMap();
  }

  bool runOnFunction(Function &) override;
-
-private:
-  void setVarMap();
 };
 }

 FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
-FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
-  return new NVVMReflect(Mapping);
-}

 static cl::opt<bool>
 NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
@ -83,35 +69,6 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
                "Replace occurrences of __nvvm_reflect() calls with 0/1", false,
                false)

-static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
-            cl::desc("A list of string=num assignments"),
-            cl::ValueRequired);
-
-/// The command line can look as follows :
-/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
-/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
-/// ReflectList vector. First, each of ReflectList[i] is 'split'
-/// using "," as the delimiter. Then each of this part is split
-/// using "=" as the delimiter.
-void NVVMReflect::setVarMap() {
-  for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
-    DEBUG(dbgs() << "Option : "  << ReflectList[i] << "\n");
-    SmallVector<StringRef, 4> NameValList;
-    StringRef(ReflectList[i]).split(NameValList, ',');
-    for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
-      SmallVector<StringRef, 2> NameValPair;
-      NameValList[j].split(NameValPair, '=');
-      assert(NameValPair.size() == 2 && "name=val expected");
-      std::stringstream ValStream(NameValPair[1]);
-      int Val;
-      ValStream >> Val;
-      assert((!(ValStream.fail())) && "integer value expected");
-      VarMap[NameValPair[0]] = Val;
-    }
-  }
-}
-
 bool NVVMReflect::runOnFunction(Function &F) {
  if (!NVVMReflectEnabled)
    return false;
@ -199,11 +156,10 @@ bool NVVMReflect::runOnFunction(Function &F) {
    DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");

    int ReflectVal = 0; // The default value is 0
-    auto Iter = VarMap.find(ReflectArg);
-    if (Iter != VarMap.end())
-      ReflectVal = Iter->second;
-    else if (ReflectArg == "__CUDA_FTZ") {
-      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+    if (ReflectArg == "__CUDA_FTZ") {
+      // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.  Our
+      // choice here must be kept in sync with AutoUpgrade, which uses the same
+      // technique to detect whether ftz is enabled.
      if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
              F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
        ReflectVal = Flag->getSExtValue();
--- a/test/CodeGen/NVPTX/nvvm-reflect.ll
+++ b/test/CodeGen/NVPTX/nvvm-reflect.ll
@ -1,30 +1,38 @@
-; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=0 -O2 | FileCheck %s --check-prefix=USE_MUL_0
-; RUN: opt < %s -S -nvvm-reflect -nvvm-reflect-list USE_MUL=1 -O2 | FileCheck %s --check-prefix=USE_MUL_1
+; We run nvvm-reflect (and then optimize) this module twice, once with metadata
+; that enables FTZ, and again with metadata that disables it.

-@str = private unnamed_addr addrspace(4) constant [8 x i8] c"USE_MUL\00"
+; RUN: cat %s > %t.noftz
+; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz
+; RUN: opt %t.noftz -S -nvvm-reflect -O2 \
+; RUN:   | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK
+
+; RUN: cat %s > %t.ftz
+; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz
+; RUN: opt %t.ftz -S -nvvm-reflect -O2 \
+; RUN:   | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK
+
+@str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00"

 declare i32 @__nvvm_reflect(i8*)
 declare i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)*)

+; CHECK-LABEL: @foo
 define float @foo(float %a, float %b) {
-; USE_MUL_0: define float @foo
-; USE_MUL_0-NOT: call i32 @__nvvm_reflect
-; USE_MUL_1: define float @foo
-; USE_MUL_1-NOT: call i32 @__nvvm_reflect
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
+; CHECK-NOT: call i32 @__nvvm_reflect
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0))
  %reflect = tail call i32 @__nvvm_reflect(i8* %ptr)
  %cmp = icmp ugt i32 %reflect, 0
  br i1 %cmp, label %use_mul, label %use_add

 use_mul:
-; USE_MUL_1: fmul float %a, %b
-; USE_MUL_0-NOT: fadd float %a, %b
+; USE_FTZ_1: fmul float %a, %b
+; USE_FTZ_0-NOT: fadd float %a, %b
  %ret1 = fmul float %a, %b
  br label %exit

 use_add:
-; USE_MUL_0: fadd float %a, %b
-; USE_MUL_1-NOT: fmul float %a, %b
+; USE_FTZ_0: fadd float %a, %b
+; USE_FTZ_1-NOT: fmul float %a, %b
  %ret2 = fadd float %a, %b
  br label %exit

@ -35,14 +43,12 @@ exit:

 declare i32 @llvm.nvvm.reflect.p0i8(i8*)

-; USE_MUL_0: define i32 @intrinsic
-; USE_MUL_1: define i32 @intrinsic
+; CHECK-LABEL: define i32 @intrinsic
 define i32 @intrinsic() {
-; USE_MUL_0-NOT: call i32 @llvm.nvvm.reflect
-; USE_MUL_0: ret i32 0
-; USE_MUL_1-NOT: call i32 @llvm.nvvm.reflect
-; USE_MUL_1: ret i32 1
-  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @str, i32 0, i32 0))
+; CHECK-NOT: call i32 @llvm.nvvm.reflect
+; USE_FTZ_0: ret i32 0
+; USE_FTZ_1: ret i32 1
+  %ptr = tail call i8* @llvm.nvvm.ptr.constant.to.gen.p0i8.p4i8(i8 addrspace(4)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(4)* @str, i32 0, i32 0))
  %reflect = tail call i32 @llvm.nvvm.reflect.p0i8(i8* %ptr)
  ret i32 %reflect
 }
@ -50,26 +56,24 @@ define i32 @intrinsic() {
 ; CUDA-7.0 passes __nvvm_reflect argument slightly differently.
 ; Verify that it works, too

-@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"

+; CHECK-LABEL: @bar
 define float @bar(float %a, float %b) {
-; USE_MUL_0: define float @bar
-; USE_MUL_0-NOT: call i32 @__nvvm_reflect
-; USE_MUL_1: define float @bar
-; USE_MUL_1-NOT: call i32 @__nvvm_reflect
-  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+; CHECK-NOT: call i32 @__nvvm_reflect
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
  %cmp = icmp ne i32 %reflect, 0
  br i1 %cmp, label %use_mul, label %use_add

 use_mul:
-; USE_MUL_1: fmul float %a, %b
-; USE_MUL_0-NOT: fadd float %a, %b
+; USE_FTZ_1: fmul float %a, %b
+; USE_FTZ_0-NOT: fadd float %a, %b
  %ret1 = fmul float %a, %b
  br label %exit

 use_add:
-; USE_MUL_0: fadd float %a, %b
-; USE_MUL_1-NOT: fmul float %a, %b
+; USE_FTZ_0: fadd float %a, %b
+; USE_FTZ_1-NOT: fmul float %a, %b
  %ret2 = fadd float %a, %b
  br label %exit

@ -77,3 +81,6 @@ exit:
  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
  ret float %ret
 }
+
+!llvm.module.flags = !{!0}
+; A module flag is added to the end of this file by the RUN lines at the top.