mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
This patch adds a new NVPTX back-end to LLVM which supports code generation for NVIDIA PTX 3.0. This back-end will (eventually) replace the current PTX back-end, while maintaining compatibility with it.
The new target machines are: nvptx (old ptx32) => 32-bit PTX nvptx64 (old ptx64) => 64-bit PTX The sources are based on the internal NVIDIA NVPTX back-end, and contain more functionality than the current PTX back-end currently provides. NV_CONTRIB llvm-svn: 156196
This commit is contained in:
parent
2b868d474e
commit
4ca961430f
@ -78,6 +78,7 @@ set(LLVM_ALL_TARGETS
|
||||
Mips
|
||||
MBlaze
|
||||
MSP430
|
||||
NVPTX
|
||||
PowerPC
|
||||
PTX
|
||||
Sparc
|
||||
|
@ -370,6 +370,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
|
||||
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
|
||||
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
|
||||
ptx-*) llvm_cv_target_arch="PTX" ;;
|
||||
nvptx-*) llvm_cv_target_arch="NVPTX" ;;
|
||||
*) llvm_cv_target_arch="Unknown" ;;
|
||||
esac])
|
||||
|
||||
@ -517,6 +518,7 @@ else
|
||||
Hexagon) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
MBlaze) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
PTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
NVPTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
*) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
esac
|
||||
fi
|
||||
@ -628,13 +630,13 @@ TARGETS_TO_BUILD=""
|
||||
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
|
||||
[Build specific host targets: all or target1,target2,... Valid targets are:
|
||||
host, x86, x86_64, sparc, powerpc, arm, mips, spu, hexagon,
|
||||
xcore, msp430, ptx, and cpp (default=all)]),,
|
||||
xcore, msp430, ptx, nvptx, and cpp (default=all)]),,
|
||||
enableval=all)
|
||||
if test "$enableval" = host-only ; then
|
||||
enableval=host
|
||||
fi
|
||||
case "$enableval" in
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX Hexagon" ;;
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX NVPTX Hexagon" ;;
|
||||
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
|
||||
case "$a_target" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -651,6 +653,7 @@ case "$enableval" in
|
||||
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
|
||||
ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
host) case "$llvm_cv_target_arch" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -664,6 +667,7 @@ case "$enableval" in
|
||||
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
|
||||
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
*) AC_MSG_ERROR([Can not set target to build]) ;;
|
||||
esac ;;
|
||||
*) AC_MSG_ERROR([Unrecognized target $a_target]) ;;
|
||||
|
11
configure
vendored
11
configure
vendored
@ -1420,7 +1420,7 @@ Optional Features:
|
||||
--enable-targets Build specific host targets: all or
|
||||
target1,target2,... Valid targets are: host, x86,
|
||||
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
|
||||
xcore, msp430, ptx, and cpp (default=all)
|
||||
xcore, msp430, ptx, nvptx, and cpp (default=all)
|
||||
--enable-bindings Build specific language bindings:
|
||||
all,auto,none,{binding-name} (default=auto)
|
||||
--enable-libffi Check for the presence of libffi (default is NO)
|
||||
@ -3903,6 +3903,7 @@ else
|
||||
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
|
||||
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
|
||||
ptx-*) llvm_cv_target_arch="PTX" ;;
|
||||
nvptx-*) llvm_cv_target_arch="NVPTX" ;;
|
||||
*) llvm_cv_target_arch="Unknown" ;;
|
||||
esac
|
||||
fi
|
||||
@ -5125,6 +5126,8 @@ else
|
||||
MBlaze) TARGET_HAS_JIT=0
|
||||
;;
|
||||
PTX) TARGET_HAS_JIT=0
|
||||
;;
|
||||
NVPTX) TARGET_HAS_JIT=0
|
||||
;;
|
||||
*) TARGET_HAS_JIT=0
|
||||
;;
|
||||
@ -5310,7 +5313,7 @@ if test "$enableval" = host-only ; then
|
||||
enableval=host
|
||||
fi
|
||||
case "$enableval" in
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX Hexagon" ;;
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 CppBackend MBlaze PTX NVPTX Hexagon" ;;
|
||||
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
|
||||
case "$a_target" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -5327,6 +5330,7 @@ case "$enableval" in
|
||||
hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
|
||||
ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
host) case "$llvm_cv_target_arch" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -5340,6 +5344,7 @@ case "$enableval" in
|
||||
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
|
||||
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
*) { { echo "$as_me:$LINENO: error: Can not set target to build" >&5
|
||||
echo "$as_me: error: Can not set target to build" >&2;}
|
||||
{ (exit 1); exit 1; }; } ;;
|
||||
@ -10401,7 +10406,7 @@ else
|
||||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 10404 "configure"
|
||||
#line 10409 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
|
@ -64,6 +64,8 @@ public:
|
||||
mblaze, // MBlaze: mblaze
|
||||
ptx32, // PTX: ptx (32-bit)
|
||||
ptx64, // PTX: ptx (64-bit)
|
||||
nvptx, // NVPTX: 32-bit
|
||||
nvptx64, // NVPTX: 64-bit
|
||||
le32, // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten)
|
||||
amdil // amdil: amd IL
|
||||
};
|
||||
|
@ -441,3 +441,4 @@ include "llvm/IntrinsicsCellSPU.td"
|
||||
include "llvm/IntrinsicsXCore.td"
|
||||
include "llvm/IntrinsicsPTX.td"
|
||||
include "llvm/IntrinsicsHexagon.td"
|
||||
include "llvm/IntrinsicsNVVM.td"
|
||||
|
872
include/llvm/IntrinsicsNVVM.td
Normal file
872
include/llvm/IntrinsicsNVVM.td
Normal file
@ -0,0 +1,872 @@
|
||||
//===- IntrinsicsNVVM.td - Defines NVVM intrinsics ---------*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines all of the NVVM-specific intrinsics for use with NVPTX.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
|
||||
|
||||
//
|
||||
// MISC
|
||||
//
|
||||
|
||||
def int_nvvm_clz_i : GCCBuiltin<"__nvvm_clz_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_clz_ll : GCCBuiltin<"__nvvm_clz_ll">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_popc_i : GCCBuiltin<"__nvvm_popc_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_popc_ll : GCCBuiltin<"__nvvm_popc_ll">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Min Max
|
||||
//
|
||||
|
||||
def int_nvvm_min_i : GCCBuiltin<"__nvvm_min_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_min_ui : GCCBuiltin<"__nvvm_min_ui">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_min_ll : GCCBuiltin<"__nvvm_min_ll">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_min_ull : GCCBuiltin<"__nvvm_min_ull">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_max_i : GCCBuiltin<"__nvvm_max_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_max_ui : GCCBuiltin<"__nvvm_max_ui">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_max_ll : GCCBuiltin<"__nvvm_max_ll">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_max_ull : GCCBuiltin<"__nvvm_max_ull">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_fmin_f : GCCBuiltin<"__nvvm_fmin_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fmin_ftz_f : GCCBuiltin<"__nvvm_fmin_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_fmax_f : GCCBuiltin<"__nvvm_fmax_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]
|
||||
, [IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fmax_ftz_f : GCCBuiltin<"__nvvm_fmax_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_fmin_d : GCCBuiltin<"__nvvm_fmin_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fmax_d : GCCBuiltin<"__nvvm_fmax_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Multiplication
|
||||
//
|
||||
|
||||
def int_nvvm_mulhi_i : GCCBuiltin<"__nvvm_mulhi_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mulhi_ui : GCCBuiltin<"__nvvm_mulhi_ui">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_mulhi_ll : GCCBuiltin<"__nvvm_mulhi_ll">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mulhi_ull : GCCBuiltin<"__nvvm_mulhi_ull">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_mul_rn_ftz_f : GCCBuiltin<"__nvvm_mul_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rn_f : GCCBuiltin<"__nvvm_mul_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rz_ftz_f : GCCBuiltin<"__nvvm_mul_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rz_f : GCCBuiltin<"__nvvm_mul_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rm_ftz_f : GCCBuiltin<"__nvvm_mul_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rm_f : GCCBuiltin<"__nvvm_mul_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rp_ftz_f : GCCBuiltin<"__nvvm_mul_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rp_f : GCCBuiltin<"__nvvm_mul_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_mul_rn_d : GCCBuiltin<"__nvvm_mul_rn_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rz_d : GCCBuiltin<"__nvvm_mul_rz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rm_d : GCCBuiltin<"__nvvm_mul_rm_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul_rp_d : GCCBuiltin<"__nvvm_mul_rp_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_mul24_i : GCCBuiltin<"__nvvm_mul24_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_mul24_ui : GCCBuiltin<"__nvvm_mul24_ui">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Div
|
||||
//
|
||||
|
||||
def int_nvvm_div_approx_ftz_f : GCCBuiltin<"__nvvm_div_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_approx_f : GCCBuiltin<"__nvvm_div_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_div_rn_ftz_f : GCCBuiltin<"__nvvm_div_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rn_f : GCCBuiltin<"__nvvm_div_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_div_rz_ftz_f : GCCBuiltin<"__nvvm_div_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rz_f : GCCBuiltin<"__nvvm_div_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_div_rm_ftz_f : GCCBuiltin<"__nvvm_div_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rm_f : GCCBuiltin<"__nvvm_div_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_div_rp_ftz_f : GCCBuiltin<"__nvvm_div_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rp_f : GCCBuiltin<"__nvvm_div_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_div_rn_d : GCCBuiltin<"__nvvm_div_rn_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rz_d : GCCBuiltin<"__nvvm_div_rz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rm_d : GCCBuiltin<"__nvvm_div_rm_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_div_rp_d : GCCBuiltin<"__nvvm_div_rp_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Brev
|
||||
//
|
||||
|
||||
def int_nvvm_brev32 : GCCBuiltin<"__nvvm_brev32">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_brev64 : GCCBuiltin<"__nvvm_brev64">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Sad
|
||||
//
|
||||
|
||||
def int_nvvm_sad_i : GCCBuiltin<"__nvvm_sad_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_sad_ui : GCCBuiltin<"__nvvm_sad_ui">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Floor Ceil
|
||||
//
|
||||
|
||||
def int_nvvm_floor_ftz_f : GCCBuiltin<"__nvvm_floor_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_floor_f : GCCBuiltin<"__nvvm_floor_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_floor_d : GCCBuiltin<"__nvvm_floor_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_ceil_ftz_f : GCCBuiltin<"__nvvm_ceil_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ceil_f : GCCBuiltin<"__nvvm_ceil_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ceil_d : GCCBuiltin<"__nvvm_ceil_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Abs
|
||||
//
|
||||
|
||||
def int_nvvm_abs_i : GCCBuiltin<"__nvvm_abs_i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_abs_ll : GCCBuiltin<"__nvvm_abs_ll">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_fabs_ftz_f : GCCBuiltin<"__nvvm_fabs_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_fabs_f : GCCBuiltin<"__nvvm_fabs_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Round
|
||||
//
|
||||
|
||||
def int_nvvm_round_ftz_f : GCCBuiltin<"__nvvm_round_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_round_f : GCCBuiltin<"__nvvm_round_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_round_d : GCCBuiltin<"__nvvm_round_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Trunc
|
||||
//
|
||||
|
||||
def int_nvvm_trunc_ftz_f : GCCBuiltin<"__nvvm_trunc_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_trunc_f : GCCBuiltin<"__nvvm_trunc_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_trunc_d : GCCBuiltin<"__nvvm_trunc_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Saturate
|
||||
//
|
||||
|
||||
def int_nvvm_saturate_ftz_f : GCCBuiltin<"__nvvm_saturate_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_saturate_f : GCCBuiltin<"__nvvm_saturate_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_saturate_d : GCCBuiltin<"__nvvm_saturate_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Exp2 Log2
|
||||
//
|
||||
|
||||
def int_nvvm_ex2_approx_ftz_f : GCCBuiltin<"__nvvm_ex2_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ex2_approx_f : GCCBuiltin<"__nvvm_ex2_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ex2_approx_d : GCCBuiltin<"__nvvm_ex2_approx_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_lg2_approx_ftz_f : GCCBuiltin<"__nvvm_lg2_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_lg2_approx_f : GCCBuiltin<"__nvvm_lg2_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_lg2_approx_d : GCCBuiltin<"__nvvm_lg2_approx_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Sin Cos
|
||||
//
|
||||
|
||||
def int_nvvm_sin_approx_ftz_f : GCCBuiltin<"__nvvm_sin_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sin_approx_f : GCCBuiltin<"__nvvm_sin_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_cos_approx_ftz_f : GCCBuiltin<"__nvvm_cos_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Fma
|
||||
//
|
||||
|
||||
def int_nvvm_fma_rn_ftz_f : GCCBuiltin<"__nvvm_fma_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rn_f : GCCBuiltin<"__nvvm_fma_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rz_ftz_f : GCCBuiltin<"__nvvm_fma_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rz_f : GCCBuiltin<"__nvvm_fma_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rm_ftz_f : GCCBuiltin<"__nvvm_fma_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rm_f : GCCBuiltin<"__nvvm_fma_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rp_ftz_f : GCCBuiltin<"__nvvm_fma_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rp_f : GCCBuiltin<"__nvvm_fma_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_fma_rn_d : GCCBuiltin<"__nvvm_fma_rn_d">,
|
||||
Intrinsic<[llvm_double_ty],
|
||||
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rz_d : GCCBuiltin<"__nvvm_fma_rz_d">,
|
||||
Intrinsic<[llvm_double_ty],
|
||||
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rm_d : GCCBuiltin<"__nvvm_fma_rm_d">,
|
||||
Intrinsic<[llvm_double_ty],
|
||||
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_fma_rp_d : GCCBuiltin<"__nvvm_fma_rp_d">,
|
||||
Intrinsic<[llvm_double_ty],
|
||||
[llvm_double_ty, llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Rcp
|
||||
//
|
||||
|
||||
def int_nvvm_rcp_rn_ftz_f : GCCBuiltin<"__nvvm_rcp_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rn_f : GCCBuiltin<"__nvvm_rcp_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rz_ftz_f : GCCBuiltin<"__nvvm_rcp_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rz_f : GCCBuiltin<"__nvvm_rcp_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rm_ftz_f : GCCBuiltin<"__nvvm_rcp_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rm_f : GCCBuiltin<"__nvvm_rcp_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rp_ftz_f : GCCBuiltin<"__nvvm_rcp_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rp_f : GCCBuiltin<"__nvvm_rcp_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_rcp_rn_d : GCCBuiltin<"__nvvm_rcp_rn_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rz_d : GCCBuiltin<"__nvvm_rcp_rz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rm_d : GCCBuiltin<"__nvvm_rcp_rm_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rcp_rp_d : GCCBuiltin<"__nvvm_rcp_rp_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_rcp_approx_ftz_d : GCCBuiltin<"__nvvm_rcp_approx_ftz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Sqrt
|
||||
//
|
||||
|
||||
def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rz_ftz_f : GCCBuiltin<"__nvvm_sqrt_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rz_f : GCCBuiltin<"__nvvm_sqrt_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rm_ftz_f : GCCBuiltin<"__nvvm_sqrt_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rm_f : GCCBuiltin<"__nvvm_sqrt_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rp_ftz_f : GCCBuiltin<"__nvvm_sqrt_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rp_f : GCCBuiltin<"__nvvm_sqrt_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_approx_ftz_f : GCCBuiltin<"__nvvm_sqrt_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_approx_f : GCCBuiltin<"__nvvm_sqrt_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_sqrt_rn_d : GCCBuiltin<"__nvvm_sqrt_rn_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rz_d : GCCBuiltin<"__nvvm_sqrt_rz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rm_d : GCCBuiltin<"__nvvm_sqrt_rm_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_sqrt_rp_d : GCCBuiltin<"__nvvm_sqrt_rp_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Rsqrt
|
||||
//
|
||||
|
||||
def int_nvvm_rsqrt_approx_ftz_f : GCCBuiltin<"__nvvm_rsqrt_approx_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rsqrt_approx_f : GCCBuiltin<"__nvvm_rsqrt_approx_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_rsqrt_approx_d : GCCBuiltin<"__nvvm_rsqrt_approx_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Add
|
||||
//
|
||||
|
||||
def int_nvvm_add_rn_ftz_f : GCCBuiltin<"__nvvm_add_rn_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rn_f : GCCBuiltin<"__nvvm_add_rn_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rz_ftz_f : GCCBuiltin<"__nvvm_add_rz_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rz_f : GCCBuiltin<"__nvvm_add_rz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rm_ftz_f : GCCBuiltin<"__nvvm_add_rm_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rm_f : GCCBuiltin<"__nvvm_add_rm_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rp_ftz_f : GCCBuiltin<"__nvvm_add_rp_ftz_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rp_f : GCCBuiltin<"__nvvm_add_rp_f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_add_rn_d : GCCBuiltin<"__nvvm_add_rn_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rz_d : GCCBuiltin<"__nvvm_add_rz_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rm_d : GCCBuiltin<"__nvvm_add_rm_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
def int_nvvm_add_rp_d : GCCBuiltin<"__nvvm_add_rp_d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
//
|
||||
// Convert
|
||||
//
|
||||
|
||||
def int_nvvm_d2f_rn_ftz : GCCBuiltin<"__nvvm_d2f_rn_ftz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rn : GCCBuiltin<"__nvvm_d2f_rn">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rz_ftz : GCCBuiltin<"__nvvm_d2f_rz_ftz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rz : GCCBuiltin<"__nvvm_d2f_rz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rm_ftz : GCCBuiltin<"__nvvm_d2f_rm_ftz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rm : GCCBuiltin<"__nvvm_d2f_rm">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rp_ftz : GCCBuiltin<"__nvvm_d2f_rp_ftz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2f_rp : GCCBuiltin<"__nvvm_d2f_rp">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_d2i_rn : GCCBuiltin<"__nvvm_d2i_rn">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2i_rz : GCCBuiltin<"__nvvm_d2i_rz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2i_rm : GCCBuiltin<"__nvvm_d2i_rm">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2i_rp : GCCBuiltin<"__nvvm_d2i_rp">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_d2ui_rn : GCCBuiltin<"__nvvm_d2ui_rn">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ui_rz : GCCBuiltin<"__nvvm_d2ui_rz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ui_rm : GCCBuiltin<"__nvvm_d2ui_rm">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ui_rp : GCCBuiltin<"__nvvm_d2ui_rp">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_i2d_rn : GCCBuiltin<"__nvvm_i2d_rn">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2d_rz : GCCBuiltin<"__nvvm_i2d_rz">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2d_rm : GCCBuiltin<"__nvvm_i2d_rm">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2d_rp : GCCBuiltin<"__nvvm_i2d_rp">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_ui2d_rn : GCCBuiltin<"__nvvm_ui2d_rn">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2d_rz : GCCBuiltin<"__nvvm_ui2d_rz">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2d_rm : GCCBuiltin<"__nvvm_ui2d_rm">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2d_rp : GCCBuiltin<"__nvvm_ui2d_rp">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_f2i_rn_ftz : GCCBuiltin<"__nvvm_f2i_rn_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rn : GCCBuiltin<"__nvvm_f2i_rn">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rz_ftz : GCCBuiltin<"__nvvm_f2i_rz_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rz : GCCBuiltin<"__nvvm_f2i_rz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rm_ftz : GCCBuiltin<"__nvvm_f2i_rm_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rm : GCCBuiltin<"__nvvm_f2i_rm">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rp_ftz : GCCBuiltin<"__nvvm_f2i_rp_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2i_rp : GCCBuiltin<"__nvvm_f2i_rp">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_f2ui_rn_ftz : GCCBuiltin<"__nvvm_f2ui_rn_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rn : GCCBuiltin<"__nvvm_f2ui_rn">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rz_ftz : GCCBuiltin<"__nvvm_f2ui_rz_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rz : GCCBuiltin<"__nvvm_f2ui_rz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rm_ftz : GCCBuiltin<"__nvvm_f2ui_rm_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rm : GCCBuiltin<"__nvvm_f2ui_rm">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rp_ftz : GCCBuiltin<"__nvvm_f2ui_rp_ftz">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ui_rp : GCCBuiltin<"__nvvm_f2ui_rp">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_i2f_rn : GCCBuiltin<"__nvvm_i2f_rn">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2f_rz : GCCBuiltin<"__nvvm_i2f_rz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2f_rm : GCCBuiltin<"__nvvm_i2f_rm">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_i2f_rp : GCCBuiltin<"__nvvm_i2f_rp">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_ui2f_rn : GCCBuiltin<"__nvvm_ui2f_rn">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2f_rz : GCCBuiltin<"__nvvm_ui2f_rz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2f_rm : GCCBuiltin<"__nvvm_ui2f_rm">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ui2f_rp : GCCBuiltin<"__nvvm_ui2f_rp">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_lohi_i2d : GCCBuiltin<"__nvvm_lohi_i2d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, Commutative]>;
|
||||
|
||||
def int_nvvm_d2i_lo : GCCBuiltin<"__nvvm_d2i_lo">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2i_hi : GCCBuiltin<"__nvvm_d2i_hi">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_f2ll_rn_ftz : GCCBuiltin<"__nvvm_f2ll_rn_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rn : GCCBuiltin<"__nvvm_f2ll_rn">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rz_ftz : GCCBuiltin<"__nvvm_f2ll_rz_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rz : GCCBuiltin<"__nvvm_f2ll_rz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rm_ftz : GCCBuiltin<"__nvvm_f2ll_rm_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rm : GCCBuiltin<"__nvvm_f2ll_rm">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rp_ftz : GCCBuiltin<"__nvvm_f2ll_rp_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ll_rp : GCCBuiltin<"__nvvm_f2ll_rp">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_f2ull_rn_ftz : GCCBuiltin<"__nvvm_f2ull_rn_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rn : GCCBuiltin<"__nvvm_f2ull_rn">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rz_ftz : GCCBuiltin<"__nvvm_f2ull_rz_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rz : GCCBuiltin<"__nvvm_f2ull_rz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rm_ftz : GCCBuiltin<"__nvvm_f2ull_rm_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rm : GCCBuiltin<"__nvvm_f2ull_rm">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rp_ftz : GCCBuiltin<"__nvvm_f2ull_rp_ftz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2ull_rp : GCCBuiltin<"__nvvm_f2ull_rp">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_d2ll_rn : GCCBuiltin<"__nvvm_d2ll_rn">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ll_rz : GCCBuiltin<"__nvvm_d2ll_rz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ll_rm : GCCBuiltin<"__nvvm_d2ll_rm">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ll_rp : GCCBuiltin<"__nvvm_d2ll_rp">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_d2ull_rn : GCCBuiltin<"__nvvm_d2ull_rn">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ull_rz : GCCBuiltin<"__nvvm_d2ull_rz">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ull_rm : GCCBuiltin<"__nvvm_d2ull_rm">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
def int_nvvm_d2ull_rp : GCCBuiltin<"__nvvm_d2ull_rp">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_ll2f_rn : GCCBuiltin<"__nvvm_ll2f_rn">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2f_rz : GCCBuiltin<"__nvvm_ll2f_rz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2f_rm : GCCBuiltin<"__nvvm_ll2f_rm">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2f_rp : GCCBuiltin<"__nvvm_ll2f_rp">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2f_rn : GCCBuiltin<"__nvvm_ull2f_rn">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2f_rz : GCCBuiltin<"__nvvm_ull2f_rz">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2f_rm : GCCBuiltin<"__nvvm_ull2f_rm">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2f_rp : GCCBuiltin<"__nvvm_ull2f_rp">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_ll2d_rn : GCCBuiltin<"__nvvm_ll2d_rn">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2d_rz : GCCBuiltin<"__nvvm_ll2d_rz">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2d_rm : GCCBuiltin<"__nvvm_ll2d_rm">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ll2d_rp : GCCBuiltin<"__nvvm_ll2d_rp">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2d_rn : GCCBuiltin<"__nvvm_ull2d_rn">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2d_rz : GCCBuiltin<"__nvvm_ull2d_rz">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2d_rm : GCCBuiltin<"__nvvm_ull2d_rm">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_ull2d_rp : GCCBuiltin<"__nvvm_ull2d_rp">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_f2h_rn_ftz : GCCBuiltin<"__nvvm_f2h_rn_ftz">,
|
||||
Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_f2h_rn : GCCBuiltin<"__nvvm_f2h_rn">,
|
||||
Intrinsic<[llvm_i16_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_h2f : GCCBuiltin<"__nvvm_h2f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i16_ty], [IntrNoMem]>;
|
||||
|
||||
//
|
||||
// Bitcast
|
||||
//
|
||||
|
||||
def int_nvvm_bitcast_f2i : GCCBuiltin<"__nvvm_bitcast_f2i">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_nvvm_bitcast_i2f : GCCBuiltin<"__nvvm_bitcast_i2f">,
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_nvvm_bitcast_ll2d : GCCBuiltin<"__nvvm_bitcast_ll2d">,
|
||||
Intrinsic<[llvm_double_ty], [llvm_i64_ty], [IntrNoMem]>;
|
||||
def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">,
|
||||
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
|
||||
|
||||
|
||||
// Atomic not available as an llvm intrinsic.
|
||||
def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
|
||||
[LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<0>]>;
|
||||
def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
|
||||
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<0>]>;
|
||||
def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
|
||||
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
|
||||
[IntrReadWriteArgMem, NoCapture<0>]>;
|
||||
|
||||
// Bar.Sync
|
||||
def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">,
|
||||
Intrinsic<[], [], []>;
|
||||
def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">,
|
||||
Intrinsic<[], [], []>;
|
||||
def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
|
||||
def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
|
||||
def int_nvvm_barrier0_or : GCCBuiltin<"__nvvm_bar0_or">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
|
||||
|
||||
// Membar
|
||||
def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">,
|
||||
Intrinsic<[], [], []>;
|
||||
def int_nvvm_membar_gl : GCCBuiltin<"__nvvm_membar_gl">,
|
||||
Intrinsic<[], [], []>;
|
||||
def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
|
||||
Intrinsic<[], [], []>;
|
||||
|
||||
|
||||
// Accessing special registers
|
||||
def int_nvvm_read_ptx_sreg_tid_x :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_tid_x">;
|
||||
def int_nvvm_read_ptx_sreg_tid_y :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_tid_y">;
|
||||
def int_nvvm_read_ptx_sreg_tid_z :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_tid_z">;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_ntid_x :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_x">;
|
||||
def int_nvvm_read_ptx_sreg_ntid_y :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_y">;
|
||||
def int_nvvm_read_ptx_sreg_ntid_z :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ntid_z">;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_ctaid_x :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_x">;
|
||||
def int_nvvm_read_ptx_sreg_ctaid_y :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_y">;
|
||||
def int_nvvm_read_ptx_sreg_ctaid_z :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_ctaid_z">;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_nctaid_x :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_x">;
|
||||
def int_nvvm_read_ptx_sreg_nctaid_y :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_y">;
|
||||
def int_nvvm_read_ptx_sreg_nctaid_z :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_nctaid_z">;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_warpsize :
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
|
||||
GCCBuiltin<"__nvvm_read_ptx_sreg_warpsize">;
|
||||
|
||||
|
||||
// Generated within nvvm. Use for ldu on sm_20 or later
|
||||
// @TODO: Revisit this, Changed LLVMAnyPointerType to LLVMPointerType
|
||||
def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
|
||||
[LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
|
||||
"llvm.nvvm.ldu.global.i">;
|
||||
def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
|
||||
"llvm.nvvm.ldu.global.f">;
|
||||
def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
|
||||
[LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
|
||||
"llvm.nvvm.ldu.global.p">;
|
||||
|
||||
|
||||
// Use for generic pointers
|
||||
// - These intrinsics are used to convert address spaces.
|
||||
// - The input pointer and output pointer must have the same type, except for
|
||||
// the address-space. (This restriction is not enforced here as there is
|
||||
// currently no way to describe it).
|
||||
// - This complements the llvm bitcast, which can be used to cast one type
|
||||
// of pointer to another type of pointer, while the address space remains
|
||||
// the same.
|
||||
def int_nvvm_ptr_local_to_gen: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.local.to.gen">;
|
||||
def int_nvvm_ptr_shared_to_gen: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.shared.to.gen">;
|
||||
def int_nvvm_ptr_global_to_gen: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.global.to.gen">;
|
||||
def int_nvvm_ptr_constant_to_gen: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.constant.to.gen">;
|
||||
|
||||
def int_nvvm_ptr_gen_to_global: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.gen.to.global">;
|
||||
def int_nvvm_ptr_gen_to_shared: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.gen.to.shared">;
|
||||
def int_nvvm_ptr_gen_to_local: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.gen.to.local">;
|
||||
def int_nvvm_ptr_gen_to_constant: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty], [IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.gen.to.constant">;
|
||||
|
||||
// Used in nvvm internally to help address space opt and ptx code generation
|
||||
// This is for params that are passed to kernel functions by pointer by-val.
|
||||
def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
|
||||
[llvm_anyptr_ty],
|
||||
[IntrNoMem, NoCapture<0>],
|
||||
"llvm.nvvm.ptr.gen.to.param">;
|
||||
|
||||
// Move intrinsics, used in nvvm internally
|
||||
|
||||
def int_nvvm_move_i8 : Intrinsic<[llvm_i8_ty], [llvm_i8_ty], [IntrNoMem],
|
||||
"llvm.nvvm.move.i8">;
|
||||
def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem],
|
||||
"llvm.nvvm.move.i16">;
|
||||
def int_nvvm_move_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem],
|
||||
"llvm.nvvm.move.i32">;
|
||||
def int_nvvm_move_i64 : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem],
|
||||
"llvm.nvvm.move.i64">;
|
||||
def int_nvvm_move_float : Intrinsic<[llvm_float_ty], [llvm_float_ty],
|
||||
[IntrNoMem], "llvm.nvvm.move.float">;
|
||||
def int_nvvm_move_double : Intrinsic<[llvm_double_ty], [llvm_double_ty],
|
||||
[IntrNoMem], "llvm.nvvm.move.double">;
|
||||
def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
|
||||
[IntrNoMem, NoCapture<0>], "llvm.nvvm.move.ptr">;
|
||||
|
||||
|
||||
/// Error / Warn
|
||||
def int_nvvm_compiler_error :
|
||||
Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.error">;
|
||||
def int_nvvm_compiler_warn :
|
||||
Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">;
|
@ -40,6 +40,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
|
||||
case mblaze: return "mblaze";
|
||||
case ptx32: return "ptx32";
|
||||
case ptx64: return "ptx64";
|
||||
case nvptx: return "nvptx";
|
||||
case nvptx64: return "nvptx64";
|
||||
case le32: return "le32";
|
||||
case amdil: return "amdil";
|
||||
}
|
||||
@ -76,6 +78,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
|
||||
|
||||
case ptx32: return "ptx";
|
||||
case ptx64: return "ptx";
|
||||
case nvptx: return "nvptx";
|
||||
case nvptx64: return "nvptx";
|
||||
case le32: return "le32";
|
||||
case amdil: return "amdil";
|
||||
}
|
||||
@ -162,6 +166,8 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
|
||||
.Case("xcore", xcore)
|
||||
.Case("ptx32", ptx32)
|
||||
.Case("ptx64", ptx64)
|
||||
.Case("nvptx", nvptx)
|
||||
.Case("nvptx64", nvptx64)
|
||||
.Case("le32", le32)
|
||||
.Case("amdil", amdil)
|
||||
.Default(UnknownArch);
|
||||
@ -194,6 +200,8 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) {
|
||||
.Case("r600", Triple::r600)
|
||||
.Case("ptx32", Triple::ptx32)
|
||||
.Case("ptx64", Triple::ptx64)
|
||||
.Case("nvptx", Triple::nvptx)
|
||||
.Case("nvptx64", Triple::nvptx64)
|
||||
.Case("amdil", Triple::amdil)
|
||||
.Default(Triple::UnknownArch);
|
||||
}
|
||||
@ -217,6 +225,8 @@ const char *Triple::getArchNameForAssembler() {
|
||||
.Case("r600", "r600")
|
||||
.Case("ptx32", "ptx32")
|
||||
.Case("ptx64", "ptx64")
|
||||
.Case("nvptx", "nvptx")
|
||||
.Case("nvptx64", "nvptx64")
|
||||
.Case("le32", "le32")
|
||||
.Case("amdil", "amdil")
|
||||
.Default(NULL);
|
||||
@ -251,6 +261,8 @@ static Triple::ArchType parseArch(StringRef ArchName) {
|
||||
.Case("xcore", Triple::xcore)
|
||||
.Case("ptx32", Triple::ptx32)
|
||||
.Case("ptx64", Triple::ptx64)
|
||||
.Case("nvptx", Triple::nvptx)
|
||||
.Case("nvptx64", Triple::nvptx64)
|
||||
.Case("le32", Triple::le32)
|
||||
.Case("amdil", Triple::amdil)
|
||||
.Default(Triple::UnknownArch);
|
||||
@ -652,6 +664,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
|
||||
case llvm::Triple::mblaze:
|
||||
case llvm::Triple::mips:
|
||||
case llvm::Triple::mipsel:
|
||||
case llvm::Triple::nvptx:
|
||||
case llvm::Triple::ppc:
|
||||
case llvm::Triple::ptx32:
|
||||
case llvm::Triple::r600:
|
||||
@ -664,6 +677,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
|
||||
|
||||
case llvm::Triple::mips64:
|
||||
case llvm::Triple::mips64el:
|
||||
case llvm::Triple::nvptx64:
|
||||
case llvm::Triple::ppc64:
|
||||
case llvm::Triple::ptx64:
|
||||
case llvm::Triple::sparcv9:
|
||||
@ -701,6 +715,7 @@ Triple Triple::get32BitArchVariant() const {
|
||||
case Triple::mblaze:
|
||||
case Triple::mips:
|
||||
case Triple::mipsel:
|
||||
case Triple::nvptx:
|
||||
case Triple::ppc:
|
||||
case Triple::ptx32:
|
||||
case Triple::r600:
|
||||
@ -714,6 +729,7 @@ Triple Triple::get32BitArchVariant() const {
|
||||
|
||||
case Triple::mips64: T.setArch(Triple::mips); break;
|
||||
case Triple::mips64el: T.setArch(Triple::mipsel); break;
|
||||
case Triple::nvptx64: T.setArch(Triple::nvptx); break;
|
||||
case Triple::ppc64: T.setArch(Triple::ppc); break;
|
||||
case Triple::ptx64: T.setArch(Triple::ptx32); break;
|
||||
case Triple::sparcv9: T.setArch(Triple::sparc); break;
|
||||
@ -742,6 +758,7 @@ Triple Triple::get64BitArchVariant() const {
|
||||
|
||||
case Triple::mips64:
|
||||
case Triple::mips64el:
|
||||
case Triple::nvptx64:
|
||||
case Triple::ppc64:
|
||||
case Triple::ptx64:
|
||||
case Triple::sparcv9:
|
||||
@ -751,6 +768,7 @@ Triple Triple::get64BitArchVariant() const {
|
||||
|
||||
case Triple::mips: T.setArch(Triple::mips64); break;
|
||||
case Triple::mipsel: T.setArch(Triple::mips64el); break;
|
||||
case Triple::nvptx: T.setArch(Triple::nvptx64); break;
|
||||
case Triple::ppc: T.setArch(Triple::ppc64); break;
|
||||
case Triple::ptx32: T.setArch(Triple::ptx64); break;
|
||||
case Triple::sparc: T.setArch(Triple::sparcv9); break;
|
||||
|
@ -16,7 +16,7 @@
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
[common]
|
||||
subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore
|
||||
subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 NVPTX Mips PTX PowerPC Sparc X86 XCore
|
||||
|
||||
; This is a special group whose required libraries are extended (by llvm-build)
|
||||
; with the best execution engine (the native JIT, if available, or the
|
||||
|
33
lib/Target/NVPTX/CMakeLists.txt
Normal file
33
lib/Target/NVPTX/CMakeLists.txt
Normal file
@ -0,0 +1,33 @@
|
||||
set(LLVM_TARGET_DEFINITIONS NVPTX.td)
|
||||
|
||||
|
||||
tablegen(LLVM NVPTXGenRegisterInfo.inc -gen-register-info)
|
||||
tablegen(LLVM NVPTXGenInstrInfo.inc -gen-instr-info)
|
||||
tablegen(LLVM NVPTXGenAsmWriter.inc -gen-asm-writer)
|
||||
tablegen(LLVM NVPTXGenDAGISel.inc -gen-dag-isel)
|
||||
tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
|
||||
add_public_tablegen_target(NVPTXCommonTableGen)
|
||||
|
||||
set(NVPTXCodeGen_sources
|
||||
NVPTXFrameLowering.cpp
|
||||
NVPTXInstrInfo.cpp
|
||||
NVPTXISelDAGToDAG.cpp
|
||||
NVPTXISelLowering.cpp
|
||||
NVPTXRegisterInfo.cpp
|
||||
NVPTXSubtarget.cpp
|
||||
NVPTXTargetMachine.cpp
|
||||
NVPTXSplitBBatBar.cpp
|
||||
NVPTXLowerAggrCopies.cpp
|
||||
NVPTXutil.cpp
|
||||
NVPTXAllocaHoisting.cpp
|
||||
NVPTXAsmPrinter.cpp
|
||||
NVPTXUtilities.cpp
|
||||
VectorElementize.cpp
|
||||
)
|
||||
|
||||
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
|
||||
|
||||
|
||||
add_subdirectory(TargetInfo)
|
||||
add_subdirectory(InstPrinter)
|
||||
add_subdirectory(MCTargetDesc)
|
7
lib/Target/NVPTX/InstPrinter/CMakeLists.txt
Normal file
7
lib/Target/NVPTX/InstPrinter/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
|
||||
|
||||
add_llvm_library(LLVMNVPTXAsmPrinter
|
||||
NVPTXInstPrinter.cpp
|
||||
)
|
||||
|
||||
add_dependencies(LLVMNVPTXAsmPrinter NVPTXCommonTableGen)
|
23
lib/Target/NVPTX/InstPrinter/LLVMBuild.txt
Normal file
23
lib/Target/NVPTX/InstPrinter/LLVMBuild.txt
Normal file
@ -0,0 +1,23 @@
|
||||
;===- ./lib/Target/NVPTX/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
|
||||
;
|
||||
; The LLVM Compiler Infrastructure
|
||||
;
|
||||
; This file is distributed under the University of Illinois Open Source
|
||||
; License. See LICENSE.TXT for details.
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
;
|
||||
; This is an LLVMBuild description file for the components in this subdirectory.
|
||||
;
|
||||
; For more information on the LLVMBuild system, please see:
|
||||
;
|
||||
; http://llvm.org/docs/LLVMBuild.html
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
[component_0]
|
||||
type = Library
|
||||
name = NVPTXAsmPrinter
|
||||
parent = NVPTX
|
||||
required_libraries = MC Support
|
||||
add_to_library_groups = NVPTX
|
15
lib/Target/NVPTX/InstPrinter/Makefile
Normal file
15
lib/Target/NVPTX/InstPrinter/Makefile
Normal file
@ -0,0 +1,15 @@
|
||||
##===- lib/Target/NVPTX/AsmPrinter/Makefile ----------------*- Makefile -*-===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is distributed under the University of Illinois Open Source
|
||||
# License. See LICENSE.TXT for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
LEVEL = ../../../..
|
||||
LIBRARYNAME = LLVMNVPTXAsmPrinter
|
||||
|
||||
# Hack: we need to include 'main' ptx target directory to grab private headers
|
||||
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
||||
|
||||
include $(LEVEL)/Makefile.common
|
1
lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
Normal file
1
lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
Normal file
@ -0,0 +1 @@
|
||||
// Placeholder
|
32
lib/Target/NVPTX/LLVMBuild.txt
Normal file
32
lib/Target/NVPTX/LLVMBuild.txt
Normal file
@ -0,0 +1,32 @@
|
||||
;===- ./lib/Target/NVPTX/LLVMBuild.txt -------------------------*- Conf -*--===;
|
||||
;
|
||||
; The LLVM Compiler Infrastructure
|
||||
;
|
||||
; This file is distributed under the University of Illinois Open Source
|
||||
; License. See LICENSE.TXT for details.
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
;
|
||||
; This is an LLVMBuild description file for the components in this subdirectory.
|
||||
;
|
||||
; For more information on the LLVMBuild system, please see:
|
||||
;
|
||||
; http://llvm.org/docs/LLVMBuild.html
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
[common]
|
||||
subdirectories = InstPrinter MCTargetDesc TargetInfo
|
||||
|
||||
[component_0]
|
||||
type = TargetGroup
|
||||
name = NVPTX
|
||||
parent = Target
|
||||
has_asmprinter = 1
|
||||
|
||||
[component_1]
|
||||
type = Library
|
||||
name = NVPTXCodeGen
|
||||
parent = NVPTX
|
||||
required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXDesc NVPTXInfo SelectionDAG Support Target TransformUtils
|
||||
add_to_library_groups = NVPTX
|
9
lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
Normal file
9
lib/Target/NVPTX/MCTargetDesc/CMakeLists.txt
Normal file
@ -0,0 +1,9 @@
|
||||
add_llvm_library(LLVMNVPTXDesc
|
||||
NVPTXMCAsmInfo.cpp
|
||||
NVPTXMCTargetDesc.cpp
|
||||
)
|
||||
|
||||
add_dependencies(LLVMNVPTXDesc NVPTXCommonTableGen)
|
||||
|
||||
# Hack: we need to include 'main' target directory to grab private headers
|
||||
#include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
|
23
lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt
Normal file
23
lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt
Normal file
@ -0,0 +1,23 @@
|
||||
;===- ./lib/Target/NVPTX/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
|
||||
;
|
||||
; The LLVM Compiler Infrastructure
|
||||
;
|
||||
; This file is distributed under the University of Illinois Open Source
|
||||
; License. See LICENSE.TXT for details.
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
;
|
||||
; This is an LLVMBuild description file for the components in this subdirectory.
|
||||
;
|
||||
; For more information on the LLVMBuild system, please see:
|
||||
;
|
||||
; http://llvm.org/docs/LLVMBuild.html
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
[component_0]
|
||||
type = Library
|
||||
name = NVPTXDesc
|
||||
parent = NVPTX
|
||||
required_libraries = MC NVPTXAsmPrinter NVPTXInfo Support
|
||||
add_to_library_groups = NVPTX
|
16
lib/Target/NVPTX/MCTargetDesc/Makefile
Normal file
16
lib/Target/NVPTX/MCTargetDesc/Makefile
Normal file
@ -0,0 +1,16 @@
|
||||
##===- lib/Target/NVPTX/TargetDesc/Makefile ----------------*- Makefile -*-===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is distributed under the University of Illinois Open Source
|
||||
# License. See LICENSE.TXT for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
LEVEL = ../../../..
|
||||
LIBRARYNAME = LLVMNVPTXDesc
|
||||
|
||||
# Hack: we need to include 'main' target directory to grab private headers
|
||||
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
||||
|
||||
include $(LEVEL)/Makefile.common
|
88
lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
Normal file
88
lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
Normal file
@ -0,0 +1,88 @@
|
||||
//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains small standalone helper functions and enum definitions for
|
||||
// the NVPTX target useful for the compiler back-end and the MC libraries.
|
||||
// As such, it deliberately does not include references to LLVM core
|
||||
// code gen types, passes, etc..
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXBASEINFO_H
|
||||
#define NVPTXBASEINFO_H
|
||||
|
||||
namespace llvm {
|
||||
|
||||
enum AddressSpace {
|
||||
ADDRESS_SPACE_GENERIC = 0,
|
||||
ADDRESS_SPACE_GLOBAL = 1,
|
||||
ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space
|
||||
ADDRESS_SPACE_SHARED = 3,
|
||||
ADDRESS_SPACE_CONST = 4,
|
||||
ADDRESS_SPACE_LOCAL = 5,
|
||||
|
||||
// NVVM Internal
|
||||
ADDRESS_SPACE_PARAM = 101
|
||||
};
|
||||
|
||||
enum PropertyAnnotation {
|
||||
PROPERTY_MAXNTID_X = 0,
|
||||
PROPERTY_MAXNTID_Y,
|
||||
PROPERTY_MAXNTID_Z,
|
||||
PROPERTY_REQNTID_X,
|
||||
PROPERTY_REQNTID_Y,
|
||||
PROPERTY_REQNTID_Z,
|
||||
PROPERTY_MINNCTAPERSM,
|
||||
PROPERTY_ISTEXTURE,
|
||||
PROPERTY_ISSURFACE,
|
||||
PROPERTY_ISSAMPLER,
|
||||
PROPERTY_ISREADONLY_IMAGE_PARAM,
|
||||
PROPERTY_ISWRITEONLY_IMAGE_PARAM,
|
||||
PROPERTY_ISKERNEL_FUNCTION,
|
||||
PROPERTY_ALIGN,
|
||||
|
||||
// last property
|
||||
PROPERTY_LAST
|
||||
};
|
||||
|
||||
const unsigned AnnotationNameLen = 8; // length of each annotation name
|
||||
const char
|
||||
PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
|
||||
"maxntidx", // PROPERTY_MAXNTID_X
|
||||
"maxntidy", // PROPERTY_MAXNTID_Y
|
||||
"maxntidz", // PROPERTY_MAXNTID_Z
|
||||
"reqntidx", // PROPERTY_REQNTID_X
|
||||
"reqntidy", // PROPERTY_REQNTID_Y
|
||||
"reqntidz", // PROPERTY_REQNTID_Z
|
||||
"minctasm", // PROPERTY_MINNCTAPERSM
|
||||
"texture", // PROPERTY_ISTEXTURE
|
||||
"surface", // PROPERTY_ISSURFACE
|
||||
"sampler", // PROPERTY_ISSAMPLER
|
||||
"rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM
|
||||
"wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM
|
||||
"kernel", // PROPERTY_ISKERNEL_FUNCTION
|
||||
"align", // PROPERTY_ALIGN
|
||||
|
||||
// last property
|
||||
"proplast", // PROPERTY_LAST
|
||||
};
|
||||
|
||||
// name of named metadata used for global annotations
|
||||
#if defined(__GNUC__)
|
||||
// As this is declared to be static but some of the .cpp files that
|
||||
// include NVVM.h do not use this array, gcc gives a warning when
|
||||
// compiling those .cpp files, hence __attribute__((unused)).
|
||||
__attribute__((unused))
|
||||
#endif
|
||||
static const char* NamedMDForAnnotations = "nvvm.annotations";
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
63
lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
Normal file
63
lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declarations of the NVPTXMCAsmInfo properties.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXMCAsmInfo.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Backing storage for the -debug-compile option below.  Written by cl::opt
// via cl::location; read when constructing NVPTXMCAsmInfo (it feeds
// SupportsDebugInformation).
bool CompileForDebugging;

// -debug-compile - Command line option to inform opt and llc passes to
// compile for debugging
static cl::opt<bool, true>
Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden,
      cl::location(CompileForDebugging),
      cl::init(false));
|
||||
|
||||
// Out-of-line virtual method anchor: pins NVPTXMCAsmInfo's vtable to this
// translation unit so it is emitted exactly once.
void NVPTXMCAsmInfo::anchor() { }
|
||||
|
||||
// Configure MCAsmInfo for PTX output: PTX uses C++-style comments, has no
// ELF-style directives (.set/.file/.type/.size), and spells its data
// directives as .b8/.b16/.b32/.b64.  On the nvptx64 triple the pointer size
// is widened to 8 bytes.
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
  Triple TheTriple(TT);
  // 64-bit PTX variant uses 8-byte pointers.
  if (TheTriple.getArch() == Triple::nvptx64)
    PointerSize = 8;

  CommentString = "//";         // PTX comment syntax

  AllowPeriodsInName = false;   // '.' is not legal inside PTX identifiers

  // PTX has no .set or single-parameter .file directives.
  HasSetDirective = false;
  HasSingleParameterDotFile = false;

  // Markers emitted (as comments) around inline-asm blocks.
  InlineAsmStart = " inline asm";
  InlineAsmEnd = " inline asm";

  // Debug info is emitted only when -debug-compile was given (see the
  // CompileForDebugging flag above).
  SupportsDebugInformation = CompileForDebugging;
  HasDotTypeDotSizeDirective = false;

  // PTX data directives.  There is no dedicated zero/ascii form; all byte
  // data goes out via .b8.
  Data8bitsDirective = " .b8 ";
  Data16bitsDirective = " .b16 ";
  Data32bitsDirective = " .b32 ";
  Data64bitsDirective = " .b64 ";
  // NOTE: the original code first assigned "$L__" here and then immediately
  // overwrote it with "" below; the dead store has been removed and the
  // effective value ("") kept.
  PrivateGlobalPrefix = "";
  ZeroDirective = " .b8";
  AsciiDirective = " .b8";
  AscizDirective = " .b8";

  // @TODO: Can we just disable this?
  GlobalDirective = "\t// .globl\t";
}
|
30
lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
Normal file
30
lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
Normal file
@ -0,0 +1,30 @@
|
||||
//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declaration of the NVPTXMCAsmInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_MCASM_INFO_H
#define NVPTX_MCASM_INFO_H

#include "llvm/MC/MCAsmInfo.h"

namespace llvm {
class Target;
class StringRef;

// Assembly-writer configuration for the NVPTX targets (comment string,
// data directives, directive availability).  The actual settings are made
// in the constructor, defined in NVPTXMCAsmInfo.cpp.
class NVPTXMCAsmInfo : public MCAsmInfo {
  virtual void anchor(); // out-of-line anchor pinning the vtable
public:
  explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT);
};
} // namespace llvm

#endif // NVPTX_MCASM_INFO_H
|
91
lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
Normal file
91
lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
Normal file
@ -0,0 +1,91 @@
|
||||
//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file provides NVPTX specific target descriptions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXMCTargetDesc.h"
|
||||
#include "NVPTXMCAsmInfo.h"
|
||||
#include "llvm/MC/MCCodeGenInfo.h"
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/MC/MCRegisterInfo.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
|
||||
#define GET_INSTRINFO_MC_DESC
|
||||
#include "NVPTXGenInstrInfo.inc"
|
||||
|
||||
#define GET_SUBTARGETINFO_MC_DESC
|
||||
#include "NVPTXGenSubtargetInfo.inc"
|
||||
|
||||
#define GET_REGINFO_MC_DESC
|
||||
#include "NVPTXGenRegisterInfo.inc"
|
||||
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Build the MC-layer instruction-info table for NVPTX.  The table data is
// TableGen-generated; InitNVPTXMCInstrInfo (from NVPTXGenInstrInfo.inc)
// populates the freshly allocated object.
static MCInstrInfo *createNVPTXMCInstrInfo() {
  MCInstrInfo *Info = new MCInstrInfo();
  InitNVPTXMCInstrInfo(Info);
  return Info;
}
|
||||
|
||||
// Build the MC-layer register-info table for NVPTX.
static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
  MCRegisterInfo *RegInfo = new MCRegisterInfo();
  // PTX does not have a return address register, hence RA = 0.
  InitNVPTXMCRegisterInfo(RegInfo, 0);
  return RegInfo;
}
|
||||
|
||||
// Build the MC-layer subtarget description from the triple, CPU name and
// feature string.
static MCSubtargetInfo *createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
                                                   StringRef FS) {
  MCSubtargetInfo *STI = new MCSubtargetInfo();
  InitNVPTXMCSubtargetInfo(STI, TT, CPU, FS);
  return STI;
}
|
||||
|
||||
// Build the MC-layer codegen configuration (relocation model, code model,
// optimization level) for NVPTX.
static MCCodeGenInfo *createNVPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
                                               CodeModel::Model CM,
                                               CodeGenOpt::Level OL) {
  MCCodeGenInfo *CGI = new MCCodeGenInfo();
  CGI->InitMCCodeGenInfo(RM, CM, OL);
  return CGI;
}
|
||||
|
||||
|
||||
// Force static initialization.
// Entry point invoked by the target registry; registers every MC-layer
// component (asm info, codegen info, instruction/register/subtarget info)
// for both the 32-bit and 64-bit NVPTX targets.
extern "C" void LLVMInitializeNVPTXTargetMC() {
  // Register the MC asm info.
  RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32);
  RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64);

  // Register the MC codegen info.
  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32,
                                        createNVPTXMCCodeGenInfo);
  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64,
                                        createNVPTXMCCodeGenInfo);

  // Register the MC instruction info.
  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo);
  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo);

  // Register the MC register info.
  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32,
                                    createNVPTXMCRegisterInfo);
  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64,
                                    createNVPTXMCRegisterInfo);

  // Register the MC subtarget info.
  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32,
                                          createNVPTXMCSubtargetInfo);
  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
                                          createNVPTXMCSubtargetInfo);

}
|
36
lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
Normal file
36
lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
Normal file
@ -0,0 +1,36 @@
|
||||
//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file provides NVPTX specific target descriptions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXMCTARGETDESC_H
#define NVPTXMCTARGETDESC_H

namespace llvm {
class Target;

// The two registered NVPTX targets (32-bit and 64-bit PTX), defined in the
// TargetInfo library.
extern Target TheNVPTXTarget32;
extern Target TheNVPTXTarget64;

} // End llvm namespace

// Defines symbolic names for PTX registers.
#define GET_REGINFO_ENUM
#include "NVPTXGenRegisterInfo.inc"

// Defines symbolic names for the PTX instructions.
#define GET_INSTRINFO_ENUM
#include "NVPTXGenInstrInfo.inc"

// Defines symbolic names for the subtarget features.
#define GET_SUBTARGETINFO_ENUM
#include "NVPTXGenSubtargetInfo.inc"

#endif
|
23
lib/Target/NVPTX/Makefile
Normal file
23
lib/Target/NVPTX/Makefile
Normal file
@ -0,0 +1,23 @@
|
||||
##===- lib/Target/NVPTX/Makefile ---------------------------*- Makefile -*-===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is distributed under the University of Illinois Open Source
|
||||
# License. See LICENSE.TXT for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
# Path back to the LLVM top-level directory.
LEVEL = ../../..
LIBRARYNAME = LLVMNVPTXCodeGen
TARGET = NVPTX

# Make sure that tblgen is run, first thing.
BUILT_SOURCES = NVPTXGenAsmWriter.inc \
                NVPTXGenDAGISel.inc \
                NVPTXGenInstrInfo.inc \
                NVPTXGenRegisterInfo.inc \
                NVPTXGenSubtargetInfo.inc

# Sub-libraries built alongside the main codegen library.
DIRS = InstPrinter TargetInfo MCTargetDesc

include $(LEVEL)/Makefile.common
|
49
lib/Target/NVPTX/ManagedStringPool.h
Normal file
49
lib/Target/NVPTX/ManagedStringPool.h
Normal file
@ -0,0 +1,49 @@
|
||||
//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The strings allocated from a managed string pool are owned by the string
|
||||
// pool and will be deleted together with the managed string pool.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#ifndef LLVM_SUPPORT_MANAGED_STRING_H
|
||||
#define LLVM_SUPPORT_MANAGED_STRING_H
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include <string>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// ManagedStringPool - The strings allocated from a managed string pool are
|
||||
/// owned by the string pool and will be deleted together with the managed
|
||||
/// string pool.
|
||||
class ManagedStringPool {
|
||||
SmallVector<std::string *, 8> Pool;
|
||||
|
||||
public:
|
||||
ManagedStringPool() {}
|
||||
~ManagedStringPool() {
|
||||
SmallVector<std::string *, 8>::iterator Current = Pool.begin();
|
||||
while (Current != Pool.end()) {
|
||||
delete *Current;
|
||||
Current++;
|
||||
}
|
||||
}
|
||||
|
||||
std::string *getManagedString(const char *S) {
|
||||
std::string *Str = new std::string(S);
|
||||
Pool.push_back(Str);
|
||||
return Str;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
137
lib/Target/NVPTX/NVPTX.h
Normal file
137
lib/Target/NVPTX/NVPTX.h
Normal file
@ -0,0 +1,137 @@
|
||||
//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the entry points for global functions defined in
|
||||
// the LLVM NVPTX back-end.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_NVPTX_H
|
||||
#define LLVM_TARGET_NVPTX_H
|
||||
|
||||
#include <cassert>
|
||||
#include <iosfwd>
|
||||
#include "llvm/Value.h"
|
||||
#include "llvm/Module.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "MCTargetDesc/NVPTXBaseInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
class NVPTXTargetMachine;
|
||||
class FunctionPass;
|
||||
class formatted_raw_ostream;
|
||||
|
||||
namespace NVPTXCC {
// Condition codes used by NVPTX compare/branch instructions.
enum CondCodes {
  EQ,
  NE,
  LT,
  LE,
  GT,
  GE
};
}

// Map an NVPTX condition code to its PTX spelling (e.g. NVPTXCC::LT -> "lt").
// An out-of-range value is a contract violation: it asserts in debug builds
// and returns "" in release builds.  (The original 'default:' label fell
// through into the NE case, silently yielding "ne" when assertions were
// disabled.)
inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
  switch (CC) {
  case NVPTXCC::EQ: return "eq";
  case NVPTXCC::NE: return "ne";
  case NVPTXCC::LT: return "lt";
  case NVPTXCC::LE: return "le";
  case NVPTXCC::GT: return "gt";
  case NVPTXCC::GE: return "ge";
  }
  assert(0 && "Unknown condition code");
  return "";
}
|
||||
|
||||
// Factory functions for the NVPTX codegen passes; each is defined in its
// own .cpp file within this backend.
FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
                                 llvm::CodeGenOpt::Level OptLevel);
FunctionPass *createVectorElementizePass(NVPTXTargetMachine &);
FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);

// Predicate over values; presumably true for image/sampler kernel arguments.
// Implementation lives elsewhere in the backend — confirm semantics there.
bool isImageOrSamplerVal(const Value *, const Module *);

// The two registered NVPTX targets: 32-bit and 64-bit PTX.
extern Target TheNVPTXTarget32;
extern Target TheNVPTXTarget64;
|
||||
|
||||
namespace NVPTX
{
// Which driver interface the generated PTX targets.
enum DrvInterface {
  NVCL,
  CUDA,
  TEST
};

// A field inside TSFlags needs a shift and a mask. The usage is
// always as follows :
// ((TSFlags & fieldMask) >> fieldShift)
// The enum keeps the mask, the shift, and all valid values of the
// field in one place.
enum VecInstType {
  VecInstTypeShift = 0,
  VecInstTypeMask = 0xF,

  VecNOP = 0,
  VecLoad = 1,
  VecStore = 2,
  VecBuild = 3,
  VecShuffle = 4,
  VecExtract = 5,
  VecInsert = 6,
  VecDest = 7,
  VecOther = 15
};

// Single-bit TSFlags field (bit 4) marking simple move instructions.
enum SimpleMove {
  SimpleMoveMask = 0x10,
  SimpleMoveShift = 4
};
// Single-bit TSFlags fields (bits 5 and 6) marking loads and stores.
enum LoadStore {
  isLoadMask = 0x20,
  isLoadShift = 5,
  isStoreMask = 0x40,
  isStoreShift = 6
};

// Operand encodings used by the PTX load/store instructions.
namespace PTXLdStInstCode {
// PTX state space the access targets.
enum AddressSpace{
  GENERIC = 0,
  GLOBAL = 1,
  CONSTANT = 2,
  SHARED = 3,
  PARAM = 4,
  LOCAL = 5
};
// How the accessed value is interpreted.
enum FromType {
  Unsigned = 0,
  Signed,
  Float
};
// Number of elements accessed per operation.
enum VecType {
  Scalar = 1,
  V2 = 2,
  V4 = 4
};
}
}
|
||||
} // end namespace llvm;
|
||||
|
||||
// Defines symbolic names for NVPTX registers. This defines a mapping from
|
||||
// register name to register number.
|
||||
#define GET_REGINFO_ENUM
|
||||
#include "NVPTXGenRegisterInfo.inc"
|
||||
|
||||
// Defines symbolic names for the NVPTX instructions.
|
||||
#define GET_INSTRINFO_ENUM
|
||||
#include "NVPTXGenInstrInfo.inc"
|
||||
|
||||
#endif
|
44
lib/Target/NVPTX/NVPTX.td
Normal file
44
lib/Target/NVPTX/NVPTX.td
Normal file
@ -0,0 +1,44 @@
|
||||
//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// This is the top level entry point for the NVPTX target.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Target-independent interfaces
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
include "llvm/Target/Target.td"

include "NVPTXRegisterInfo.td"
include "NVPTXInstrInfo.td"

//===----------------------------------------------------------------------===//
// Subtarget Features.
// - We use the SM version number instead of explicit feature table.
// - Need at least one feature to avoid generating zero sized array by
//   TableGen in NVPTXGenSubtarget.inc.
//===----------------------------------------------------------------------===//
def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;

//===----------------------------------------------------------------------===//
// NVPTX supported processors.
//===----------------------------------------------------------------------===//

class Proc<string Name, list<SubtargetFeature> Features>
 : Processor<Name, NoItineraries, Features>;

// sm_10 is the only processor modeled here; it carries the dummy feature
// required to keep TableGen from emitting a zero-sized array (see above).
def : Proc<"sm_10", [FeatureDummy]>;


def NVPTXInstrInfo : InstrInfo {
}

// Top-level target definition tying the instruction set to the NVPTX target.
def NVPTX : Target {
  let InstructionSet = NVPTXInstrInfo;
}
|
48
lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
Normal file
48
lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
Normal file
@ -0,0 +1,48 @@
|
||||
//===-- AllocaHoisting.cpp - Hoist allocas to the entry block ---*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/Instructions.h"
|
||||
#include "llvm/Constants.h"
|
||||
#include "NVPTXAllocaHoisting.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// Move every statically-sized alloca found in a non-entry block up to the
// entry block, inserting it just before the entry block's terminator.
// Returns true if anything was moved.
bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
  bool functionModified = false;
  Function::iterator I = function.begin();
  // Hoisted allocas are inserted immediately before the entry block's
  // terminator.  The post-increment also advances I so the loop below
  // starts at the second basic block.
  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();

  for (Function::iterator E = function.end(); I != E; ++I) {
    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
      // BI is advanced before any move so the iterator stays valid when the
      // alloca is unlinked from this block.
      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
      // Only allocas with a constant array size are hoisted; dynamically
      // sized ones stay where they are.
      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
        allocaInst->moveBefore(firstTerminatorInst);
        functionModified = true;
      }
    }
  }

  return functionModified;
}
|
||||
|
||||
char NVPTXAllocaHoisting::ID = 1;
|
||||
RegisterPass<NVPTXAllocaHoisting> X("alloca-hoisting",
|
||||
"Hoisting alloca instructsion in non-entry "
|
||||
"blocks to the entry block");
|
||||
|
||||
FunctionPass *createAllocaHoisting() {
|
||||
return new NVPTXAllocaHoisting();
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
49
lib/Target/NVPTX/NVPTXAllocaHoisting.h
Normal file
49
lib/Target/NVPTX/NVPTXAllocaHoisting.h
Normal file
@ -0,0 +1,49 @@
|
||||
//===-- AllocaHoisting.h - Hoist allocas to the entry block -----*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_ALLOCA_HOISTING_H_
#define NVPTX_ALLOCA_HOISTING_H_

#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetData.h"

namespace llvm {

class FunctionPass;
class Function;

// Hoisting the alloca instructions in the non-entry blocks to the entry
// block.
class NVPTXAllocaHoisting : public FunctionPass {
public:
  static char ID; // Pass ID
  NVPTXAllocaHoisting() : FunctionPass(ID) {}

  // Requires TargetData; preserves the machine-function analysis.
  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<TargetData>();
    AU.addPreserved<MachineFunctionAnalysis>();
  }

  virtual const char *getPassName() const {
    return "NVPTX specific alloca hoisting";
  }

  // Defined in NVPTXAllocaHoisting.cpp; returns true if any alloca was
  // moved.
  virtual bool runOnFunction(Function &function);
};

// Factory for the pass.
extern FunctionPass *createAllocaHoisting();

} // end namespace llvm

#endif // NVPTX_ALLOCA_HOISTING_H_
|
2068
lib/Target/NVPTX/NVPTXAsmPrinter.cpp
Normal file
2068
lib/Target/NVPTX/NVPTXAsmPrinter.cpp
Normal file
File diff suppressed because it is too large
Load Diff
318
lib/Target/NVPTX/NVPTXAsmPrinter.h
Normal file
318
lib/Target/NVPTX/NVPTXAsmPrinter.h
Normal file
@ -0,0 +1,318 @@
|
||||
//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains a printer that converts from our internal representation
|
||||
// of machine-dependent LLVM code to NVPTX assembly language.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXASMPRINTER_H
|
||||
#define NVPTXASMPRINTER_H
|
||||
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXTargetMachine.h"
|
||||
#include "NVPTXSubtarget.h"
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/CodeGen/AsmPrinter.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/MC/MCAsmInfo.h"
|
||||
#include "llvm/MC/MCExpr.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/MC/MCSymbol.h"
|
||||
#include "llvm/Support/FormattedStream.h"
|
||||
#include "llvm/Target/Mangler.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include <fstream>
|
||||
|
||||
// The PTX syntax and format are very different from those usually seen in a
// .s file, therefore we are not able to use the MCAsmStreamer interface here.
|
||||
//
|
||||
// We are handcrafting the output method here.
|
||||
//
|
||||
// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
|
||||
// (subclass of MCStreamer).
|
||||
|
||||
// This is defined in AsmPrinter.cpp.
|
||||
// Used to process the constant expressions in initializers.
|
||||
namespace nvptx {
|
||||
const llvm::MCExpr *LowerConstant(const llvm::Constant *CV,
|
||||
llvm::AsmPrinter &AP) ;
|
||||
}
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// Reads individual lines, by line number, from a source file.  Used by the
// asm printer to interleave original source text into the emitted PTX.
// readLine is implemented in NVPTXAsmPrinter.cpp.
class LineReader {
private:
  unsigned theCurLine ;            // line counter maintained by readLine
  std::ifstream fstr;              // stream over the file being read
  char buff[512];                  // scratch buffer — presumably used by readLine
  std::string theFileName;
  SmallVector<unsigned, 32> lineOffset;
public:
  // Opens 'filename'.  Note the open is not checked here; behavior on a
  // failed open depends on readLine's implementation in the .cpp.
  LineReader(std::string filename) {
    theCurLine = 0;
    fstr.open(filename.c_str());
    theFileName = filename;
  }
  std::string fileName() { return theFileName; }
  ~LineReader() {
    fstr.close();
  }
  // Return the text of line number 'line' (defined in NVPTXAsmPrinter.cpp).
  std::string readLine(unsigned line);
};
|
||||
|
||||
|
||||
|
||||
// NVPTXAsmPrinter - Converts machine-level IR into textual PTX.  Because PTX
// differs heavily from ordinary .s output, most emission is handcrafted here
// rather than going through MCAsmStreamer.
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {

  // AggBuffer - Buffers the emitted initializer of a global aggregate.
  //
  // Normally an aggregate (array, vector or structure) is emitted as a
  // u8[].  However, if one element/field of the aggregate is a non-NULL
  // address, then the aggregate is emitted as u32[] or u64[].
  //
  // We first lay out the aggregate in 'buffer' in bytes, except for those
  // symbol addresses.  For the i-th symbol address in the aggregate, its
  // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s.
  // symbolPosInBuffer[i-1] records its position in 'buffer', and
  // Symbols[i-1] records the Value*.
  //
  // Once we have this AggBuffer set up, we can choose how to print it out.
  class AggBuffer {
  public:
    unsigned size;                 // size of the buffer in bytes
    unsigned char *buffer;         // the buffer
    unsigned numSymbols;           // number of symbol addresses
    SmallVector<unsigned, 4> symbolPosInBuffer;
    SmallVector<Value *, 4> Symbols;

  private:
    unsigned curpos;               // next write offset into 'buffer'
    raw_ostream &O;
    NVPTXAsmPrinter &AP;

  public:
    AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
      : O(_O), AP(_AP) {
      buffer = new unsigned char[_size];
      size = _size;
      curpos = 0;
      numSymbols = 0;
    }
    ~AggBuffer() {
      delete [] buffer;
    }
    // Append Num bytes from Ptr, then zero-pad up to Bytes total.
    // Returns the new write position.
    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
      assert((curpos + Num) <= size);
      assert((curpos + Bytes) <= size);
      for (int i = 0; i < Num; ++i) {
        buffer[curpos] = Ptr[i];
        curpos++;
      }
      for (int i = Num; i < Bytes; ++i) {
        buffer[curpos] = 0;
        curpos++;
      }
      return curpos;
    }
    // Append Num zero bytes; returns the new write position.
    unsigned addZeros(int Num) {
      assert((curpos + Num) <= size);
      for (int i = 0; i < Num; ++i) {
        buffer[curpos] = 0;
        curpos++;
      }
      return curpos;
    }
    // Record that a symbol address lives at the current write position.
    void addSymbol(Value *GVar) {
      symbolPosInBuffer.push_back(curpos);
      Symbols.push_back(GVar);
      numSymbols++;
    }
    // Emit the buffered initializer: a plain byte list when no symbols are
    // present, otherwise pointer-sized words with the recorded symbols
    // printed at their positions.
    void print() {
      if (numSymbols == 0) {
        // print out in bytes
        for (unsigned i = 0; i < size; i++) {
          if (i)
            O << ", ";
          O << (unsigned int)buffer[i];
        }
      } else {
        // print out in 4-bytes or 8-bytes, depending on pointer size
        unsigned int pos = 0;
        unsigned int nSym = 0;
        unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
        unsigned int nBytes = 4;
        if (AP.nvptxSubtarget.is64Bit())
          nBytes = 8;
        for (pos = 0; pos < size; pos += nBytes) {
          if (pos)
            O << ", ";
          if (pos == nextSymbolPos) {
            Value *v = Symbols[nSym];
            if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
              MCSymbol *Name = AP.Mang->getSymbol(GVar);
              O << *Name;
            } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
              O << *nvptx::LowerConstant(Cexpr, AP);
            } else
              assert(0 && "symbol type unknown");
            nSym++;
            if (nSym >= numSymbols)
              nextSymbolPos = size + 1; // past the end: no more symbols
            else
              nextSymbolPos = symbolPosInBuffer[nSym];
          } else if (nBytes == 4)
            O << *(unsigned int*)(buffer + pos);
          else
            O << *(unsigned long long*)(buffer + pos);
        }
      }
    }
  };

  friend class AggBuffer;

  virtual void emitSrcInText(StringRef filename, unsigned line);

private:
  virtual const char *getPassName() const {
    return "NVPTX Assembly Printer";
  }

  const Function *F;               // function currently being emitted
  std::string CurrentFnName;

  void EmitFunctionEntryLabel();
  void EmitFunctionBodyStart();
  void EmitFunctionBodyEnd();

  void EmitInstruction(const MachineInstr *);

  // Alignment is handled in the PTX declarations themselves, so the generic
  // alignment hook is deliberately a no-op.
  void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}

  void printGlobalVariable(const GlobalVariable *GVar);
  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
                    const char *Modifier = 0);
  void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O,
                     const char *Modifier = 0);
  void printVecModifiedImmediate(const MachineOperand &MO,
                                 const char *Modifier, raw_ostream &O);
  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
                       const char *Modifier = 0);
  void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
  // definition autogenerated (TableGen).
  void printInstruction(const MachineInstr *MI, raw_ostream &O);
  void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
                          bool = false);
  void printParamName(int paramIndex, raw_ostream &O);
  void printParamName(Function::const_arg_iterator I, int paramIndex,
                      raw_ostream &O);
  void emitHeader(Module &M, raw_ostream &O);
  void emitKernelFunctionDirectives(const Function &F,
                                    raw_ostream &O) const;
  void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
  void emitFunctionExternParamList(const MachineFunction &MF);
  void emitFunctionParamList(const Function *, raw_ostream &O);
  void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
  void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
  void emitFunctionTempData(const MachineFunction &MF,
                            unsigned &FrameSize);
  bool isImageType(const Type *Ty);
  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                       unsigned AsmVariant, const char *ExtraCode,
                       raw_ostream &);
  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                             unsigned AsmVariant, const char *ExtraCode,
                             raw_ostream &);
  void printReturnValStr(const Function *, raw_ostream &O);
  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);

protected:
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);

private:
  std::string CurrentBankselLabelInBasicBlock;

  // This is specific per MachineFunction.
  const MachineRegisterInfo *MRI;
  // The contents are specific for each MachineFunction.  But the size of
  // the array is not.
  std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
  // cache the subtarget here.
  const NVPTXSubtarget &nvptxSubtarget;
  // Build the map between type name and ID based on module's type
  // symbol table.
  std::map<const Type *, std::string> TypeNameMap;

  // List of variables demoted to a function scope.
  std::map<const Function *, std::vector<GlobalVariable *> > localDecls;

  // To record filename to ID mapping.
  std::map<std::string, unsigned> filenameMap;
  void recordAndEmitFilenames(Module &);

  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
  void emitPTXAddressSpace(unsigned int AddressSpace,
                           raw_ostream &O) const;
  std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
  void printScalarConstant(Constant *CPV, raw_ostream &O);
  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
  void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
  void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);

  void printOperandProper(const MachineOperand &MO);

  void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
  void emitDeclarations(Module &, raw_ostream &O);
  void emitDeclaration(const Function *, raw_ostream &O);

  static const char *getRegisterName(unsigned RegNo);
  void emitDemotedVars(const Function *, raw_ostream &);

  LineReader *reader;              // lazily created by getReader()
  LineReader *getReader(std::string);

public:
  NVPTXAsmPrinter(TargetMachine &TM,
                  MCStreamer &Streamer)
    : AsmPrinter(TM, Streamer),
      nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
    CurrentBankselLabelInBasicBlock = "";
    VRidGlobal2LocalMap = NULL;
    reader = NULL;
  }

  ~NVPTXAsmPrinter() {
    // BUGFIX: the original guarded this with 'if (!reader)', which deleted
    // the pointer only when it was null and therefore leaked any LineReader
    // that was actually created.  'delete' on a null pointer is a no-op, so
    // delete unconditionally.
    delete reader;
  }

  bool ignoreLoc(const MachineInstr &);

  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);

  DebugLoc prevDebugLoc;
  void emitLineNumberAsDotLoc(const MachineInstr &);
};
|
||||
} // end of namespace
|
||||
|
||||
#endif
|
76
lib/Target/NVPTX/NVPTXFrameLowering.cpp
Normal file
76
lib/Target/NVPTX/NVPTXFrameLowering.cpp
Normal file
@ -0,0 +1,76 @@
|
||||
//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the NVPTX implementation of TargetFrameLowering class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXFrameLowering.h"
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXRegisterInfo.h"
|
||||
#include "NVPTXSubtarget.h"
|
||||
#include "NVPTXTargetMachine.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/MC/MachineLocation.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// NVPTX always reports a frame pointer: frames are addressed through the
// virtual %SP/%Depot registers set up in emitPrologue rather than a real
// hardware stack register.
bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const {
  return true;
}
|
||||
|
||||
// Emit the NVPTX prologue: when the function owns stack objects,
// materialize the virtual frame pointer(s) from %Depot at the top of the
// entry block.  Functions without stack objects need no prologue.
void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
  if (!MF.getFrameInfo()->hasStackObjects())
    return;

  MachineBasicBlock &MBB = MF.front();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  // These instructions conceptually precede the first real instruction
  // of the block, so give them no debug location.
  DebugLoc dl = DebugLoc();

  unsigned MovOpc = is64bit ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;

  if (tm.getSubtargetImpl()->hasGenericLdSt()) {
    // Generic addressing available; emit:
    //   mov        %SPL, %depot;
    //   cvta.local %SP,  %SPL;
    unsigned CvtaOpc = is64bit ? NVPTX::cvta_local_yes_64
                               : NVPTX::cvta_local_yes;
    MachineInstr *MI = BuildMI(MBB, MBBI, dl,
                               tm.getInstrInfo()->get(CvtaOpc),
                               NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
    // Insert the mov ahead of the cvta so %SPL is defined first.
    BuildMI(MBB, MI, dl,
            tm.getInstrInfo()->get(MovOpc), NVPTX::VRFrameLocal)
      .addReg(NVPTX::VRDepot);
  } else {
    // No generic ld/st; emit:  mov %SP, %depot;
    BuildMI(MBB, MBBI, dl,
            tm.getInstrInfo()->get(MovOpc), NVPTX::VRFrame)
      .addReg(NVPTX::VRDepot);
  }
}
|
||||
|
||||
// Intentionally empty: the prologue only defines virtual frame
// registers, so there is nothing to tear down on function return.
void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
                                      MachineBasicBlock &MBB) const {
}
|
40
lib/Target/NVPTX/NVPTXFrameLowering.h
Normal file
40
lib/Target/NVPTX/NVPTXFrameLowering.h
Normal file
@ -0,0 +1,40 @@
|
||||
//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
//
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_FRAMELOWERING_H
|
||||
#define NVPTX_FRAMELOWERING_H
|
||||
|
||||
#include "llvm/Target/TargetFrameLowering.h"
|
||||
|
||||
|
||||
namespace llvm {
|
||||
class NVPTXTargetMachine;
|
||||
|
||||
// Frame lowering for NVPTX.  PTX uses a virtual frame model: hasFP is
// always true and the prologue copies %Depot into the virtual frame
// register(s); no epilogue code is required.
class NVPTXFrameLowering : public TargetFrameLowering {
  NVPTXTargetMachine &tm;  // owning target machine (source of instr info/subtarget)
  bool is64bit;            // true when generating 64-bit PTX

public:
  // Stack grows up in the PTX virtual frame model; 8-byte alignment,
  // zero local-area offset.
  explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit)
  : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
    tm(_tm), is64bit(_is64bit) {}

  virtual bool hasFP(const MachineFunction &MF) const;
  virtual void emitPrologue(MachineFunction &MF) const;
  virtual void emitEpilogue(MachineFunction &MF,
                            MachineBasicBlock &MBB) const;
};
|
||||
|
||||
} // End llvm namespace
|
||||
|
||||
#endif
|
681
lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Normal file
681
lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Normal file
@ -0,0 +1,681 @@
|
||||
//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines an instruction selector for the NVPTX target.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#include "llvm/Instructions.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "NVPTXISelDAGToDAG.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Target/TargetIntrinsicInfo.h"
|
||||
#include "llvm/GlobalValue.h"
|
||||
|
||||
#undef DEBUG_TYPE
|
||||
#define DEBUG_TYPE "nvptx-isel"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
||||
// Command-line knobs controlling NVPTX FP contraction and f32 division.
// Fix: corrected user-visible help-string typos ("avaiable" ->
// "available") and made the prefix consistent with the sibling options
// ("NVPTX Specifies:" -> "NVPTX Specific:").
static cl::opt<bool>
UseFMADInstruction("nvptx-mad-enable",
                   cl::ZeroOrMore,
                   cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
                   cl::init(false));

static cl::opt<int>
FMAContractLevel("nvptx-fma-level",
                 cl::ZeroOrMore,
                 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
                          " 1: do it 2: do it aggressively"),
                 cl::init(2));

static cl::opt<int>
UsePrecDivF32("nvptx-prec-divf32",
              cl::ZeroOrMore,
              cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
                       " IEEE Compliant F32 div.rnd if available."),
              cl::init(2));
|
||||
|
||||
/// createNVPTXISelDag - This pass converts a legalized DAG into a
|
||||
/// NVPTX-specific DAG, ready for instruction scheduling.
|
||||
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
/// OptLevel feeds the FMA/mul-wide contraction decisions made in the
/// NVPTXDAGToDAGISel constructor.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}
|
||||
|
||||
|
||||
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
  : SelectionDAGISel(tm, OptLevel),
    Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
  // FP contraction policy:
  //  * fma.f32 / fma.f64 whenever the target supports the instruction,
  //    we are optimizing, and -nvptx-fma-level allows it.
  //  * mad.f32 only when -nvptx-mad-enable is given and the target does
  //    not support fma.f32.
  const bool Optimize = (OptLevel > 0);
  const bool HasFMA32 = Subtarget.hasFMAF32();
  const bool HasFMA64 = Subtarget.hasFMAF64();

  doFMADF32   = Optimize && UseFMADInstruction && !HasFMA32;
  doFMAF32    = Optimize && HasFMA32 && (FMAContractLevel >= 1);
  doFMAF64    = Optimize && HasFMA64 && (FMAContractLevel >= 1);
  doFMAF32AGG = Optimize && HasFMA32 && (FMAContractLevel == 2);
  doFMAF64AGG = Optimize && HasFMA64 && (FMAContractLevel == 2);

  allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;

  // mul.wide formation is an optimization-only transform.
  doMulWide = Optimize;

  // f32 division: start from the requested precision level, but
  // div.rnd (level 2) is unavailable before sm_20/PTX 2.0, so fall
  // back to div.full (level 1) there.
  do_DIVF32_PREC = UsePrecDivF32;
  if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
    do_DIVF32_PREC = 1;
}
|
||||
|
||||
/// Select - Select instructions not customized! Used for
|
||||
/// expanded, promoted and normal instructions.
|
||||
/// Select - Select instructions not customized!  Used for expanded,
/// promoted and normal instructions.  Loads and stores get a shot at
/// custom selection first; everything else (and any load/store the
/// custom path declines) goes through tablegen-generated SelectCode.
SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) {
  if (N->isMachineOpcode())
    return NULL;   // Already selected.

  switch (N->getOpcode()) {
  case ISD::LOAD:
    if (SDNode *ResNode = SelectLoad(N))
      return ResNode;
    break;
  case ISD::STORE:
    if (SDNode *ResNode = SelectStore(N))
      return ResNode;
    break;
  }
  return SelectCode(N);
}
|
||||
|
||||
|
||||
static unsigned int
|
||||
getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget)
|
||||
{
|
||||
const Value *Src = N->getSrcValue();
|
||||
if (!Src)
|
||||
return NVPTX::PTXLdStInstCode::LOCAL;
|
||||
|
||||
if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
|
||||
switch (PT->getAddressSpace()) {
|
||||
case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
|
||||
case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
|
||||
case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
|
||||
case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
|
||||
return NVPTX::PTXLdStInstCode::CONSTANT;
|
||||
case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
|
||||
case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
|
||||
case llvm::ADDRESS_SPACE_CONST:
|
||||
// If the arch supports generic address space, translate it to GLOBAL
|
||||
// for correctness.
|
||||
// If the arch does not support generic address space, then the arch
|
||||
// does not really support ADDRESS_SPACE_CONST, translate it to
|
||||
// to CONSTANT for better performance.
|
||||
if (Subtarget.hasGenericLdSt())
|
||||
return NVPTX::PTXLdStInstCode::GLOBAL;
|
||||
else
|
||||
return NVPTX::PTXLdStInstCode::CONSTANT;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
return NVPTX::PTXLdStInstCode::LOCAL;
|
||||
}
|
||||
|
||||
|
||||
// SelectLoad - Custom-select an ISD::LOAD into one of the NVPTX LD_*
// machine opcodes.  The machine-node operands encode, in order:
// volatility, address-space code, vector arity, value-type class
// (signed/unsigned/float), scalar width in bits, then the address
// operand(s) and the chain.  Returns NULL to fall back to tablegen
// selection (indexed loads, non-simple VTs, unhandled types).
SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
  DebugLoc dl = N->getDebugLoc();
  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD= NULL;

  // do not support pre/post inc/dec
  if (LD->isIndexed())
    return NULL;

  if (!LoadedVT.isSimple())
    return NULL;

  // Address Space Setting
  unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool isVolatile = LD->isVolatile();
  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  // Only 2- and 4-element vectors map to PTX ld.v2/ld.v4; anything else
  // falls back to default selection.
  MVT SimpleVT = LoadedVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    unsigned num = SimpleVT.getVectorNumElements();
    if (num == 2)
      vecType = NVPTX::PTXLdStInstCode::V2;
    else if (num == 4)
      vecType = NVPTX::PTXLdStInstCode::V4;
    else
      return NULL;
  }

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned fromTypeWidth = ScalarVT.getSizeInBits();
  unsigned int fromType;
  if ((LD->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    fromType = NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  unsigned Opcode;
  MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;

  // Four addressing forms, tried most- to least-specific:
  //   avar (direct symbol), asi (symbol+imm), ari (reg+imm), areg (reg).
  if (SelectDirectAddr(N1, Addr)) {
    switch (TargetVT) {
    case MVT::i8:    Opcode = NVPTX::LD_i8_avar; break;
    case MVT::i16:   Opcode = NVPTX::LD_i16_avar; break;
    case MVT::i32:   Opcode = NVPTX::LD_i32_avar; break;
    case MVT::i64:   Opcode = NVPTX::LD_i64_avar; break;
    case MVT::f32:   Opcode = NVPTX::LD_f32_avar; break;
    case MVT::f64:   Opcode = NVPTX::LD_f64_avar; break;
    case MVT::v2i8:  Opcode = NVPTX::LD_v2i8_avar; break;
    case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break;
    case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break;
    case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break;
    case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break;
    case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break;
    case MVT::v4i8:  Opcode = NVPTX::LD_v4i8_avar; break;
    case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break;
    case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break;
    case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break;
    default: return NULL;
    }
    SDValue Ops[] = { getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(fromType),
                      getI32Imm(fromTypeWidth),
                      Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
                                     MVT::Other, Ops, 7);
  } else if (Subtarget.is64Bit()?
             SelectADDRsi64(N1.getNode(), N1, Base, Offset):
             SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    switch (TargetVT) {
    case MVT::i8:    Opcode = NVPTX::LD_i8_asi; break;
    case MVT::i16:   Opcode = NVPTX::LD_i16_asi; break;
    case MVT::i32:   Opcode = NVPTX::LD_i32_asi; break;
    case MVT::i64:   Opcode = NVPTX::LD_i64_asi; break;
    case MVT::f32:   Opcode = NVPTX::LD_f32_asi; break;
    case MVT::f64:   Opcode = NVPTX::LD_f64_asi; break;
    case MVT::v2i8:  Opcode = NVPTX::LD_v2i8_asi; break;
    case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break;
    case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break;
    case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break;
    case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break;
    case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break;
    case MVT::v4i8:  Opcode = NVPTX::LD_v4i8_asi; break;
    case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break;
    case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break;
    case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break;
    default: return NULL;
    }
    SDValue Ops[] = { getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(fromType),
                      getI32Imm(fromTypeWidth),
                      Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
                                     MVT::Other, Ops, 8);
  } else if (Subtarget.is64Bit()?
             SelectADDRri64(N1.getNode(), N1, Base, Offset):
             SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    switch (TargetVT) {
    case MVT::i8:    Opcode = NVPTX::LD_i8_ari; break;
    case MVT::i16:   Opcode = NVPTX::LD_i16_ari; break;
    case MVT::i32:   Opcode = NVPTX::LD_i32_ari; break;
    case MVT::i64:   Opcode = NVPTX::LD_i64_ari; break;
    case MVT::f32:   Opcode = NVPTX::LD_f32_ari; break;
    case MVT::f64:   Opcode = NVPTX::LD_f64_ari; break;
    case MVT::v2i8:  Opcode = NVPTX::LD_v2i8_ari; break;
    case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break;
    case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break;
    case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break;
    case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break;
    case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break;
    case MVT::v4i8:  Opcode = NVPTX::LD_v4i8_ari; break;
    case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break;
    case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break;
    case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break;
    default: return NULL;
    }
    SDValue Ops[] = { getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(fromType),
                      getI32Imm(fromTypeWidth),
                      Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
                                     MVT::Other, Ops, 8);
  }
  else {
    // Fallback: plain register addressing.
    switch (TargetVT) {
    case MVT::i8:    Opcode = NVPTX::LD_i8_areg; break;
    case MVT::i16:   Opcode = NVPTX::LD_i16_areg; break;
    case MVT::i32:   Opcode = NVPTX::LD_i32_areg; break;
    case MVT::i64:   Opcode = NVPTX::LD_i64_areg; break;
    case MVT::f32:   Opcode = NVPTX::LD_f32_areg; break;
    case MVT::f64:   Opcode = NVPTX::LD_f64_areg; break;
    case MVT::v2i8:  Opcode = NVPTX::LD_v2i8_areg; break;
    case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break;
    case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break;
    case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break;
    case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break;
    case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break;
    case MVT::v4i8:  Opcode = NVPTX::LD_v4i8_areg; break;
    case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break;
    case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break;
    case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break;
    default: return NULL;
    }
    SDValue Ops[] = { getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(fromType),
                      getI32Imm(fromTypeWidth),
                      N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT,
                                     MVT::Other, Ops, 7);
  }

  // Transfer the memory operand to the machine node so later passes
  // retain aliasing/volatility information.
  if (NVPTXLD != NULL) {
    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
    cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
  }

  return NVPTXLD;
}
|
||||
|
||||
// SelectStore - Custom-select an ISD::STORE into one of the NVPTX ST_*
// machine opcodes.  Mirrors SelectLoad: operands are the stored value,
// volatility, address-space code, vector arity, value-type class,
// scalar width, then the address operand(s) and the chain.  Returns
// NULL to fall back to tablegen selection.
SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
  DebugLoc dl = N->getDebugLoc();
  StoreSDNode *ST = cast<StoreSDNode>(N);
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = NULL;

  // do not support pre/post inc/dec
  if (ST->isIndexed())
    return NULL;

  if (!StoreVT.isSimple())
    return NULL;

  // Address Space Setting
  unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool isVolatile = ST->isVolatile();
  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  // Only 2- and 4-element vectors map to PTX st.v2/st.v4.
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    unsigned num = SimpleVT.getVectorNumElements();
    if (num == 2)
      vecType = NVPTX::PTXLdStInstCode::V2;
    else if (num == 4)
      vecType = NVPTX::PTXLdStInstCode::V4;
    else
      return NULL;
  }

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    toType = NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  // N1 is the value being stored, N2 is the address.
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  SDValue Addr;
  SDValue Offset, Base;
  unsigned Opcode;
  MVT::SimpleValueType SourceVT =
    N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;

  // Four addressing forms, tried most- to least-specific:
  //   avar (direct symbol), asi (symbol+imm), ari (reg+imm), areg (reg).
  if (SelectDirectAddr(N2, Addr)) {
    switch (SourceVT) {
    case MVT::i8:    Opcode = NVPTX::ST_i8_avar; break;
    case MVT::i16:   Opcode = NVPTX::ST_i16_avar; break;
    case MVT::i32:   Opcode = NVPTX::ST_i32_avar; break;
    case MVT::i64:   Opcode = NVPTX::ST_i64_avar; break;
    case MVT::f32:   Opcode = NVPTX::ST_f32_avar; break;
    case MVT::f64:   Opcode = NVPTX::ST_f64_avar; break;
    case MVT::v2i8:  Opcode = NVPTX::ST_v2i8_avar; break;
    case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break;
    case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break;
    case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break;
    case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break;
    case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break;
    case MVT::v4i8:  Opcode = NVPTX::ST_v4i8_avar; break;
    case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break;
    case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break;
    case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break;
    default: return NULL;
    }
    SDValue Ops[] = { N1,
                      getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(toType),
                      getI32Imm(toTypeWidth),
                      Addr, Chain };
    NVPTXST = CurDAG->getMachineNode(Opcode, dl,
                                     MVT::Other, Ops, 8);
  } else if (Subtarget.is64Bit()?
             SelectADDRsi64(N2.getNode(), N2, Base, Offset):
             SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (SourceVT) {
    case MVT::i8:    Opcode = NVPTX::ST_i8_asi; break;
    case MVT::i16:   Opcode = NVPTX::ST_i16_asi; break;
    case MVT::i32:   Opcode = NVPTX::ST_i32_asi; break;
    case MVT::i64:   Opcode = NVPTX::ST_i64_asi; break;
    case MVT::f32:   Opcode = NVPTX::ST_f32_asi; break;
    case MVT::f64:   Opcode = NVPTX::ST_f64_asi; break;
    case MVT::v2i8:  Opcode = NVPTX::ST_v2i8_asi; break;
    case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break;
    case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break;
    case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break;
    case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break;
    case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break;
    case MVT::v4i8:  Opcode = NVPTX::ST_v4i8_asi; break;
    case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break;
    case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break;
    case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break;
    default: return NULL;
    }
    SDValue Ops[] = { N1,
                      getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(toType),
                      getI32Imm(toTypeWidth),
                      Base, Offset, Chain };
    NVPTXST = CurDAG->getMachineNode(Opcode, dl,
                                     MVT::Other, Ops, 9);
  } else if (Subtarget.is64Bit()?
             SelectADDRri64(N2.getNode(), N2, Base, Offset):
             SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    switch (SourceVT) {
    case MVT::i8:    Opcode = NVPTX::ST_i8_ari; break;
    case MVT::i16:   Opcode = NVPTX::ST_i16_ari; break;
    case MVT::i32:   Opcode = NVPTX::ST_i32_ari; break;
    case MVT::i64:   Opcode = NVPTX::ST_i64_ari; break;
    case MVT::f32:   Opcode = NVPTX::ST_f32_ari; break;
    case MVT::f64:   Opcode = NVPTX::ST_f64_ari; break;
    case MVT::v2i8:  Opcode = NVPTX::ST_v2i8_ari; break;
    case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break;
    case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break;
    case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break;
    case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break;
    case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break;
    case MVT::v4i8:  Opcode = NVPTX::ST_v4i8_ari; break;
    case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break;
    case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break;
    case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break;
    default: return NULL;
    }
    SDValue Ops[] = { N1,
                      getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(toType),
                      getI32Imm(toTypeWidth),
                      Base, Offset, Chain };
    NVPTXST = CurDAG->getMachineNode(Opcode, dl,
                                     MVT::Other, Ops, 9);
  } else {
    // Fallback: plain register addressing.
    switch (SourceVT) {
    case MVT::i8:    Opcode = NVPTX::ST_i8_areg; break;
    case MVT::i16:   Opcode = NVPTX::ST_i16_areg; break;
    case MVT::i32:   Opcode = NVPTX::ST_i32_areg; break;
    case MVT::i64:   Opcode = NVPTX::ST_i64_areg; break;
    case MVT::f32:   Opcode = NVPTX::ST_f32_areg; break;
    case MVT::f64:   Opcode = NVPTX::ST_f64_areg; break;
    case MVT::v2i8:  Opcode = NVPTX::ST_v2i8_areg; break;
    case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break;
    case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break;
    case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break;
    case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break;
    case MVT::v2f64: Opcode = NVPTX::ST_v2f64_areg; break;
    case MVT::v4i8:  Opcode = NVPTX::ST_v4i8_areg; break;
    case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break;
    case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break;
    case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break;
    default: return NULL;
    }
    SDValue Ops[] = { N1,
                      getI32Imm(isVolatile),
                      getI32Imm(codeAddrSpace),
                      getI32Imm(vecType),
                      getI32Imm(toType),
                      getI32Imm(toTypeWidth),
                      N2, Chain };
    NVPTXST = CurDAG->getMachineNode(Opcode, dl,
                                     MVT::Other, Ops, 8);
  }

  // Transfer the memory operand to the machine node so later passes
  // retain aliasing/volatility information.
  if (NVPTXST != NULL) {
    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
    cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
  }

  return NVPTXST;
}
|
||||
|
||||
// SelectDirectAddr - Match a direct address for DAG.
|
||||
// A direct address could be a globaladdress or externalsymbol.
|
||||
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
|
||||
// Return true if TGA or ES.
|
||||
if (N.getOpcode() == ISD::TargetGlobalAddress
|
||||
|| N.getOpcode() == ISD::TargetExternalSymbol) {
|
||||
Address = N;
|
||||
return true;
|
||||
}
|
||||
if (N.getOpcode() == NVPTXISD::Wrapper) {
|
||||
Address = N.getOperand(0);
|
||||
return true;
|
||||
}
|
||||
if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
|
||||
unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
|
||||
if (IID == Intrinsic::nvvm_ptr_gen_to_param)
|
||||
if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
|
||||
return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// symbol+offset
|
||||
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr,
|
||||
SDValue &Base, SDValue &Offset,
|
||||
MVT mvt) {
|
||||
if (Addr.getOpcode() == ISD::ADD) {
|
||||
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
|
||||
SDValue base=Addr.getOperand(0);
|
||||
if (SelectDirectAddr(base, Base)) {
|
||||
Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// symbol+offset
|
||||
// symbol+offset
// 32-bit variant: offset is emitted as an i32 target constant.
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
|
||||
|
||||
// symbol+offset
|
||||
// symbol+offset
// 64-bit variant: offset is emitted as an i64 target constant.
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
|
||||
|
||||
// register+offset
|
||||
bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr,
|
||||
SDValue &Base, SDValue &Offset,
|
||||
MVT mvt) {
|
||||
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
|
||||
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
|
||||
Offset = CurDAG->getTargetConstant(0, mvt);
|
||||
return true;
|
||||
}
|
||||
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
|
||||
Addr.getOpcode() == ISD::TargetGlobalAddress)
|
||||
return false; // direct calls.
|
||||
|
||||
if (Addr.getOpcode() == ISD::ADD) {
|
||||
if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
|
||||
return false;
|
||||
}
|
||||
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
|
||||
if (FrameIndexSDNode *FIN =
|
||||
dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
|
||||
// Constant offset from frame ref.
|
||||
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
|
||||
else
|
||||
Base = Addr.getOperand(0);
|
||||
Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// register+offset
|
||||
// register+offset
// 32-bit variant: offsets/frame indices are emitted with MVT::i32.
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}
|
||||
|
||||
// register+offset
|
||||
// register+offset
// 64-bit variant: offsets/frame indices are emitted with MVT::i64.
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
|
||||
|
||||
// Return true if the memory node N accesses a pointer whose address
// space equals spN; false when no source value / pointer type is
// available.
bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = NULL;
  // Even though MemIntrinsicSDNode is a subclass of MemSDNode,
  // the classof() for MemSDNode does not include MemIntrinsicSDNode
  // (See SelectionDAGNodes.h). So we need to check for both.
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    Src = mN->getSrcValue();
  }
  else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
    Src = mN->getSrcValue();
  }
  if (!Src)
    return false;
  if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}
|
||||
|
||||
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
|
||||
/// inline asm expressions.
|
||||
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
|
||||
char ConstraintCode,
|
||||
std::vector<SDValue> &OutOps) {
|
||||
SDValue Op0, Op1;
|
||||
switch (ConstraintCode) {
|
||||
default: return true;
|
||||
case 'm': // memory
|
||||
if (SelectDirectAddr(Op, Op0)) {
|
||||
OutOps.push_back(Op0);
|
||||
OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
|
||||
return false;
|
||||
}
|
||||
if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
|
||||
OutOps.push_back(Op0);
|
||||
OutOps.push_back(Op1);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Return true if N is a undef or a constant.
|
||||
// If N was undef, return a (i8imm 0) in Retval
|
||||
// If N was imm, convert it to i8imm and return in Retval
|
||||
// Note: The convert to i8imm is required, otherwise the
|
||||
// pattern matcher inserts a bunch of IMOVi8rr to convert
|
||||
// the imm to i8imm, and this causes instruction selection
|
||||
// to fail.
|
||||
// Return true if N is an undef or a constant.
// Undef is materialized as (i8imm 0); a constant is re-issued as an
// i8imm target constant.  The conversion to i8imm is required —
// otherwise the pattern matcher inserts a bunch of IMOVi8rr to convert
// the imm, and that causes instruction selection to fail.
bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N,
                                   SDValue &Retval) {
  switch (N.getOpcode()) {
  case ISD::UNDEF:
    Retval = CurDAG->getTargetConstant(0, MVT::i8);
    return true;
  case ISD::Constant: {
    ConstantSDNode *CN = cast<ConstantSDNode>(N.getNode());
    Retval = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i8);
    return true;
  }
  default:
    return false;
  }
}
|
105
lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Normal file
105
lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Normal file
@ -0,0 +1,105 @@
|
||||
//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

// NOTE(review): this header defines DEBUG_TYPE, emits a using-directive and
// declares the selector in an anonymous namespace, so it is intended to be
// included exactly once (by NVPTXISelDAGToDAG.cpp).  It previously had no
// include guard at all; the guard below protects against accidental double
// inclusion and matches the convention of the sibling NVPTX headers.
#ifndef NVPTXISELDAGTODAG_H
#define NVPTXISELDAGTODAG_H

#define DEBUG_TYPE "nvptx-isel"

#include "NVPTX.h"
#include "NVPTXISelLowering.h"
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Intrinsics.h"
using namespace llvm;

namespace {

/// SelectionDAG instruction selector for NVPTX.  The boolean knobs below are
/// initialized from command-line / subtarget options in the constructor and
/// steer which PTX instruction forms the generated matcher may emit.
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {

  // If true, generate corresponding FPCONTRACT. This is
  // language dependent (i.e. CUDA and OpenCL works differently).
  bool doFMADF32;
  bool doFMAF64;
  bool doFMAF32;
  bool doFMAF64AGG;
  bool doFMAF32AGG;
  bool allowFMA;

  // 0: use div.approx
  // 1: use div.full
  // 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
  //    Otherwise, use div.full
  int do_DIVF32_PREC;

  // If true, add .ftz to f32 instructions.
  // This is only meaningful for sm_20 and later, as the default
  // is not ftz.
  // For sm earlier than sm_20, f32 denorms are always ftz by the
  // hardware.
  // We always add the .ftz modifier regardless of the sm value
  // when Use32FTZ is true.
  bool UseF32FTZ;

  // If true, generate mul.wide from sext and mul
  bool doMulWide;

public:
  explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                             CodeGenOpt::Level OptLevel);

  // Pass Name
  virtual const char *getPassName() const {
    return "NVPTX DAG->DAG Pattern Instruction Selection";
  }

  const NVPTXSubtarget &Subtarget;

  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                            char ConstraintCode,
                                            std::vector<SDValue> &OutOps);
private:
  // Include the pieces autogenerated from the target description.
#include "NVPTXGenDAGISel.inc"

  SDNode *Select(SDNode *N);
  SDNode* SelectLoad(SDNode *N);
  SDNode* SelectStore(SDNode *N);

  inline SDValue getI32Imm(unsigned Imm) {
    return CurDAG->getTargetConstant(Imm, MVT::i32);
  }

  // Match direct address complex pattern.
  bool SelectDirectAddr(SDValue N, SDValue &Address);

  // Register+immediate addressing (reg, imm offset).
  bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
                        SDValue &Offset, MVT mvt);
  bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
                    SDValue &Offset);
  bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
                      SDValue &Offset);

  // Symbol+immediate addressing (symbol, imm offset).
  bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
                        SDValue &Offset, MVT mvt);
  bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
                    SDValue &Offset);
  bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
                      SDValue &Offset);

  // True if memory node N lives in address space spN.
  bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;

  bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval);

};
}

#endif // NVPTXISELDAGTODAG_H
|
1294
lib/Target/NVPTX/NVPTXISelLowering.cpp
Normal file
1294
lib/Target/NVPTX/NVPTXISelLowering.cpp
Normal file
File diff suppressed because it is too large
Load Diff
153
lib/Target/NVPTX/NVPTXISelLowering.h
Normal file
153
lib/Target/NVPTX/NVPTXISelLowering.h
Normal file
@ -0,0 +1,153 @@
|
||||
//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#ifndef NVPTXISELLOWERING_H
#define NVPTXISELLOWERING_H

#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"

namespace llvm {
namespace NVPTXISD {
// Target-specific SelectionDAG node kinds.  Most of these model pieces of
// the PTX calling sequence (param declaration, argument marshalling, the
// printed call itself and retrieval of return values).
enum NodeType {
  // Start the numbering from where ISD NodeType finishes.
  FIRST_NUMBER = ISD::BUILTIN_OP_END,
  Wrapper,
  CALL,
  RET_FLAG,
  LOAD_PARAM,
  NVBuiltin,
  DeclareParam,
  DeclareScalarParam,
  DeclareRetParam,
  DeclareRet,
  DeclareScalarRet,
  LoadParam,
  StoreParam,
  StoreParamS32, // to sext and store a <32bit value, not used currently
  StoreParamU32, // to zext and store a <32bit value, not used currently
  MoveToParam,
  PrintCall,
  PrintCallUni,
  CallArgBegin,
  CallArg,
  LastCallArg,
  CallArgEnd,
  CallVoid,
  CallVal,
  CallSymbol,
  Prototype,
  MoveParam,
  MoveRetval,
  MoveToRetval,
  StoreRetval,
  PseudoUseParam,
  RETURN,
  CallSeqBegin,
  CallSeqEnd,
  Dummy
};
}

//===--------------------------------------------------------------------===//
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
class NVPTXTargetLowering : public TargetLowering {
public:
  explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
                             SelectionDAG &DAG) const;

  virtual const char *getTargetNodeName(unsigned Opcode) const;

  bool isTypeSupportedInIntrinsic(MVT VT) const;

  bool getTgtMemIntrinsic(IntrinsicInfo& Info, const CallInst &I,
                          unsigned Intrinsic) const;

  /// isLegalAddressingMode - Return true if the addressing mode represented
  /// by AM is legal for this target, for a load/store of the specified type
  /// Used to guide target specific optimizations, like loop strength
  /// reduction (LoopStrengthReduce.cpp) and memory optimization for
  /// address mode (CodeGenPrepare.cpp)
  virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;

  /// getFunctionAlignment - Return the Log2 alignment of this function.
  virtual unsigned getFunctionAlignment(const Function *F) const;

  // Comparisons produce a predicate (i1) register in PTX.
  virtual EVT getSetCCResultType(EVT VT) const {
    return MVT::i1;
  }

  ConstraintType getConstraintType(const std::string &Constraint) const;
  std::pair<unsigned, const TargetRegisterClass*>
  getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;

  virtual SDValue
  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl,
                       SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &InVals) const;

  // This will be re-added once the necessary changes to LowerCallTo are
  // upstreamed.
  // virtual SDValue
  // LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
  //           bool isVarArg, bool doesNotRet, bool &isTailCall,
  //           const SmallVectorImpl<ISD::OutputArg> &Outs,
  //           const SmallVectorImpl<SDValue> &OutVals,
  //           const SmallVectorImpl<ISD::InputArg> &Ins,
  //           DebugLoc dl, SelectionDAG &DAG,
  //           SmallVectorImpl<SDValue> &InVals,
  //           Type *retTy, const ArgListTy &Args) const;

  // Build the textual PTX prototype string emitted for an indirect call.
  std::string getPrototype(Type *, const ArgListTy &,
                           const SmallVectorImpl<ISD::OutputArg> &,
                           unsigned retAlignment) const;

  virtual SDValue
  LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
              const SmallVectorImpl<ISD::OutputArg> &Outs,
              const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
              SelectionDAG &DAG) const;

  virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                            std::vector<SDValue> &Ops,
                                            SelectionDAG &DAG) const;

  NVPTXTargetMachine *nvTM;

  // PTX always uses 32-bit shift amounts
  virtual MVT getShiftAmountTy(EVT LHSTy) const {
    return MVT::i32;
  }

private:
  const NVPTXSubtarget &nvptxSubtarget;  // cache the subtarget here

  SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, EVT =
                     MVT::i32) const;
  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const;
  SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);

  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
};
} // namespace llvm

#endif // NVPTXISELLOWERING_H
|
43
lib/Target/NVPTX/NVPTXInstrFormats.td
Normal file
43
lib/Target/NVPTX/NVPTXInstrFormats.td
Normal file
@ -0,0 +1,43 @@
|
||||
//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Describe NVPTX instructions format
//
//===----------------------------------------------------------------------===//

// Vector instruction type enum
class VecInstTypeEnum<bits<4> val> {
  bits<4> Value=val;
}
def VecNOP : VecInstTypeEnum<0>;

// Generic NVPTX Format

class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
  : Instruction {
  field bits<14> Inst;

  let Namespace = "NVPTX";
  dag OutOperandList = outs;
  dag InOperandList = ins;
  let AsmString = asmstr;
  let Pattern = pattern;

  // TSFlagFields
  // These bits are decoded at runtime by NVPTXInstrInfo (isMoveInstr,
  // isLoadInstr, isStoreInstr) via the corresponding mask/shift constants,
  // so the layout below must stay in sync with those constants.
  bits<4> VecInstType = VecNOP.Value;
  bit IsSimpleMove = 0;
  bit IsLoad = 0;
  bit IsStore = 0;

  let TSFlags{3-0} = VecInstType;   // vector instruction kind
  let TSFlags{4-4} = IsSimpleMove;  // plain reg-to-reg move
  let TSFlags{5-5} = IsLoad;        // memory load
  let TSFlags{6-6} = IsStore;       // memory store
}
|
326
lib/Target/NVPTX/NVPTXInstrInfo.cpp
Normal file
326
lib/Target/NVPTX/NVPTXInstrInfo.cpp
Normal file
@ -0,0 +1,326 @@
|
||||
//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the NVPTX implementation of the TargetInstrInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXInstrInfo.h"
|
||||
#include "NVPTXTargetMachine.h"
|
||||
#define GET_INSTRINFO_CTOR
|
||||
#include "NVPTXGenInstrInfo.inc"
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include <cstdio>
|
||||
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// FIXME: Add the subtarget support on this constructor.
// Constructs the target instruction info; RegInfo is built eagerly from the
// target machine's current subtarget.
NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
: NVPTXGenInstrInfo(),
  TM(tm),
  RegInfo(*this, *TM.getSubtargetImpl()) {}
|
||||
|
||||
|
||||
// Emit a register-to-register copy by dispatching on the register class of
// the destination/source pair and selecting the class-specific move opcode.
// Both registers must belong to the same class; any unknown pairing asserts.
void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I, DebugLoc DL,
                                  unsigned DestReg, unsigned SrcReg,
                                  bool KillSrc) const {
  // Scalar classes.
  if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
      NVPTX::Int32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
           NVPTX::Int8RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
           NVPTX::Int1RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
           NVPTX::Float32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
           NVPTX::Int16RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
           NVPTX::Int64RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
           NVPTX::Float64RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  // Vector classes (elementized vectors use dedicated move pseudos).
  else if (NVPTX::V4F32RegsRegClass.contains(DestReg) &&
           NVPTX::V4F32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V4I32RegsRegClass.contains(DestReg) &&
           NVPTX::V4I32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2F32RegsRegClass.contains(DestReg) &&
           NVPTX::V2F32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2I32RegsRegClass.contains(DestReg) &&
           NVPTX::V2I32RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V4I8RegsRegClass.contains(DestReg) &&
           NVPTX::V4I8RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2I8RegsRegClass.contains(DestReg) &&
           NVPTX::V2I8RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V4I16RegsRegClass.contains(DestReg) &&
           NVPTX::V4I16RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2I16RegsRegClass.contains(DestReg) &&
           NVPTX::V2I16RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2I64RegsRegClass.contains(DestReg) &&
           NVPTX::V2I64RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else if (NVPTX::V2F64RegsRegClass.contains(DestReg) &&
           NVPTX::V2F64RegsRegClass.contains(SrcReg))
    BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg)
      .addReg(SrcReg, getKillRegState(KillSrc));
  else {
    // No known class pairing -- a cross-class copy is a selection bug.
    assert(0 && "Don't know how to copy a register");
  }
}
|
||||
|
||||
bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI,
|
||||
unsigned &SrcReg,
|
||||
unsigned &DestReg) const {
|
||||
// Look for the appropriate part of TSFlags
|
||||
bool isMove = false;
|
||||
|
||||
unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >>
|
||||
NVPTX::SimpleMoveShift;
|
||||
isMove = (TSFlags == 1);
|
||||
|
||||
if (isMove) {
|
||||
MachineOperand dest = MI.getOperand(0);
|
||||
MachineOperand src = MI.getOperand(1);
|
||||
assert(dest.isReg() && "dest of a movrr is not a reg");
|
||||
assert(src.isReg() && "src of a movrr is not a reg");
|
||||
|
||||
SrcReg = src.getReg();
|
||||
DestReg = dest.getReg();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Return true if MI reads a PTX special register (thread/block geometry or
// warp size).  These are the intrinsic pseudo-instructions for
// %tid, %ntid, %ctaid, %nctaid and WARP_SZ.
bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const
{
  switch (MI.getOpcode()) {
  default: return false;
  case NVPTX::INT_PTX_SREG_NTID_X:
  case NVPTX::INT_PTX_SREG_NTID_Y:
  case NVPTX::INT_PTX_SREG_NTID_Z:
  case NVPTX::INT_PTX_SREG_TID_X:
  case NVPTX::INT_PTX_SREG_TID_Y:
  case NVPTX::INT_PTX_SREG_TID_Z:
  case NVPTX::INT_PTX_SREG_CTAID_X:
  case NVPTX::INT_PTX_SREG_CTAID_Y:
  case NVPTX::INT_PTX_SREG_CTAID_Z:
  case NVPTX::INT_PTX_SREG_NCTAID_X:
  case NVPTX::INT_PTX_SREG_NCTAID_Y:
  case NVPTX::INT_PTX_SREG_NCTAID_Z:
  case NVPTX::INT_PTX_SREG_WARPSIZE:
    return true;
  }
}
|
||||
|
||||
|
||||
bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
|
||||
unsigned &AddrSpace) const {
|
||||
bool isLoad = false;
|
||||
unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isLoadMask) >>
|
||||
NVPTX::isLoadShift;
|
||||
isLoad = (TSFlags == 1);
|
||||
if (isLoad)
|
||||
AddrSpace = getLdStCodeAddrSpace(MI);
|
||||
return isLoad;
|
||||
}
|
||||
|
||||
bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
|
||||
unsigned &AddrSpace) const {
|
||||
bool isStore = false;
|
||||
unsigned TSFlags = (MI.getDesc().TSFlags & NVPTX::isStoreMask) >>
|
||||
NVPTX::isStoreShift;
|
||||
isStore = (TSFlags == 1);
|
||||
if (isStore)
|
||||
AddrSpace = getLdStCodeAddrSpace(MI);
|
||||
return isStore;
|
||||
}
|
||||
|
||||
|
||||
bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
|
||||
unsigned addrspace = 0;
|
||||
if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
|
||||
return false;
|
||||
if (isLoadInstr(*MI, addrspace))
|
||||
if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
|
||||
return false;
|
||||
if (isStoreInstr(*MI, addrspace))
|
||||
if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target).  Upon success, this returns false and returns
/// with the following information in various cases:
///
/// 1. If this block ends with no branches (it just falls through to its succ)
///    just return false, leaving TBB/FBB null.
/// 2. If this block ends with only an unconditional branch, it sets TBB to be
///    the destination block.
/// 3. If this block ends with an conditional branch and it falls through to
///    an successor block, it sets TBB to be the branch destination block and a
///    list of operands that evaluate the condition. These
///    operands can be passed to other TargetInstrInfo methods to create new
///    branches.
/// 4. If this block ends with an conditional branch and an unconditional
///    block, it returns the 'true' destination in TBB, the 'false' destination
///    in FBB, and a list of operands that evaluate the condition. These
///    operands can be passed to other TargetInstrInfo methods to create new
///    branches.
///
/// Note that RemoveBranch and InsertBranch must be implemented to support
/// cases where this method returns success.
///
bool NVPTXInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                   MachineBasicBlock *&TBB,
                                   MachineBasicBlock *&FBB,
                                   SmallVectorImpl<MachineOperand> &Cond,
                                   bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
    return false;

  // Get the last instruction in the block.
  // (I now points at the last unpredicated terminator.)
  MachineInstr *LastInst = I;

  // If there is only one terminator instruction, process it.
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
    if (LastInst->getOpcode() == NVPTX::GOTO) {
      // Unconditional branch: operand 0 is the destination MBB.
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    } else if (LastInst->getOpcode() == NVPTX::CBranch) {
      // Block ends with fall-through condbranch.
      // CBranch operands: 0 = predicate register, 1 = destination MBB.
      TBB = LastInst->getOperand(1).getMBB();
      Cond.push_back(LastInst->getOperand(0));
      return false;
    }
    // Otherwise, don't know what this is.
    return true;
  }

  // Get the instruction before it if it's a terminator.
  MachineInstr *SecondLastInst = I;

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() &&
      isUnpredicatedTerminator(--I))
    return true;

  // If the block ends with NVPTX::GOTO and NVPTX::CBranch, handle it.
  if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
      LastInst->getOpcode() == NVPTX::GOTO) {
    TBB = SecondLastInst->getOperand(1).getMBB();
    Cond.push_back(SecondLastInst->getOperand(0));
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two NVPTX::GOTOs, handle it.  The second one is not
  // executed, so remove it.
  if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
      LastInst->getOpcode() == NVPTX::GOTO) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // Otherwise, can't handle this.
  return true;
}
|
||||
|
||||
// Erase up to two branch terminators (an optional trailing GOTO and an
// optional preceding CBranch) from the end of MBB and return the number of
// instructions removed.
unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  // Look at the final instruction; bail out if it isn't one of our branches.
  MachineBasicBlock::iterator Term = MBB.end();
  if (Term == MBB.begin())
    return 0;
  --Term;
  unsigned Opc = Term->getOpcode();
  if (Opc != NVPTX::GOTO && Opc != NVPTX::CBranch)
    return 0;

  // Remove the branch.
  Term->eraseFromParent();

  // A conditional branch may still precede the one just erased.
  Term = MBB.end();
  if (Term == MBB.begin())
    return 1;
  --Term;
  if (Term->getOpcode() != NVPTX::CBranch)
    return 1;

  // Remove the branch.
  Term->eraseFromParent();
  return 2;
}
|
||||
|
||||
unsigned
|
||||
NVPTXInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
|
||||
MachineBasicBlock *FBB,
|
||||
const SmallVectorImpl<MachineOperand> &Cond,
|
||||
DebugLoc DL) const {
|
||||
// Shouldn't be a fall through.
|
||||
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
|
||||
assert((Cond.size() == 1 || Cond.size() == 0) &&
|
||||
"NVPTX branch conditions have two components!");
|
||||
|
||||
// One-way branch.
|
||||
if (FBB == 0) {
|
||||
if (Cond.empty()) // Unconditional branch
|
||||
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
|
||||
else // Conditional branch
|
||||
BuildMI(&MBB, DL, get(NVPTX::CBranch))
|
||||
.addReg(Cond[0].getReg()).addMBB(TBB);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Two-way Conditional Branch.
|
||||
BuildMI(&MBB, DL, get(NVPTX::CBranch))
|
||||
.addReg(Cond[0].getReg()).addMBB(TBB);
|
||||
BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
|
||||
return 2;
|
||||
}
|
83
lib/Target/NVPTX/NVPTXInstrInfo.h
Normal file
83
lib/Target/NVPTX/NVPTXInstrInfo.h
Normal file
@ -0,0 +1,83 @@
|
||||
//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the NVPTX implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#ifndef NVPTXINSTRUCTIONINFO_H
#define NVPTXINSTRUCTIONINFO_H

#include "NVPTX.h"
#include "NVPTXRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"

#define GET_INSTRINFO_HEADER
#include "NVPTXGenInstrInfo.inc"

namespace llvm {

class NVPTXInstrInfo : public NVPTXGenInstrInfo
{
  NVPTXTargetMachine &TM;
  const NVPTXRegisterInfo RegInfo;
public:
  explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);

  virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }

  /* The following virtual functions are used in register allocation.
   * They are not implemented because the existing interface and the logic
   * at the caller side do not work for the elementized vector load and store.
   *
   * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
   *                                      int &FrameIndex) const;
   * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
   *                                     int &FrameIndex) const;
   * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
   *                                  MachineBasicBlock::iterator MBBI,
   *                                 unsigned SrcReg, bool isKill, int FrameIndex,
   *                                  const TargetRegisterClass *RC) const;
   * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
   *                                   MachineBasicBlock::iterator MBBI,
   *                                   unsigned DestReg, int FrameIndex,
   *                                   const TargetRegisterClass *RC) const;
   */

  virtual void copyPhysReg(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, DebugLoc DL,
                           unsigned DestReg, unsigned SrcReg,
                           bool KillSrc) const ;
  // Classify instructions by the TSFlags bits set in NVPTXInstrFormats.td.
  virtual bool isMoveInstr(const MachineInstr &MI,
                           unsigned &SrcReg,
                           unsigned &DestReg) const;
  bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
  bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
  bool isReadSpecialReg(MachineInstr &MI) const;

  virtual bool CanTailMerge(const MachineInstr *MI) const ;
  // Branch analysis.
  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                             MachineBasicBlock *&FBB,
                             SmallVectorImpl<MachineOperand> &Cond,
                             bool AllowModify) const;
  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
  virtual unsigned InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                                const SmallVectorImpl<MachineOperand> &Cond,
                                DebugLoc DL) const;
  // Extract the address-space code stored as immediate operand 2 of
  // ld/st instructions.
  unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
    return MI.getOperand(2).getImm();
  }

};

} // namespace llvm

#endif
|
2837
lib/Target/NVPTX/NVPTXInstrInfo.td
Normal file
2837
lib/Target/NVPTX/NVPTXInstrInfo.td
Normal file
File diff suppressed because it is too large
Load Diff
1675
lib/Target/NVPTX/NVPTXIntrinsics.td
Normal file
1675
lib/Target/NVPTX/NVPTXIntrinsics.td
Normal file
File diff suppressed because it is too large
Load Diff
208
lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
Normal file
208
lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
Normal file
@ -0,0 +1,208 @@
|
||||
//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when
|
||||
// the size is large or is not a compile-time constant.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/Constants.h"
|
||||
#include "llvm/Module.h"
|
||||
#include "llvm/Instructions.h"
|
||||
#include "llvm/Intrinsics.h"
|
||||
#include "llvm/IntrinsicInst.h"
|
||||
#include "llvm/Support/InstIterator.h"
|
||||
#include "llvm/Support/IRBuilder.h"
|
||||
#include "NVPTXLowerAggrCopies.h"
|
||||
#include "llvm/Target/TargetData.h"
|
||||
#include "llvm/LLVMContext.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Factory declared here so the target machine can create the pass without
// pulling in this file's internals.
namespace llvm {
FunctionPass *createLowerAggrCopies();
}

// Pass identification, replacement for typeid.
char NVPTXLowerAggrCopies::ID = 0;
|
||||
|
||||
// Lower MemTransferInst or load-store pair to loop
|
||||
static void convertTransferToLoop(Instruction *splitAt, Value *srcAddr,
|
||||
Value *dstAddr, Value *len,
|
||||
//unsigned numLoads,
|
||||
bool srcVolatile, bool dstVolatile,
|
||||
LLVMContext &Context, Function &F) {
|
||||
Type *indType = len->getType();
|
||||
|
||||
BasicBlock *origBB = splitAt->getParent();
|
||||
BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
|
||||
BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
|
||||
|
||||
origBB->getTerminator()->setSuccessor(0, loopBB);
|
||||
IRBuilder<> builder(origBB, origBB->getTerminator());
|
||||
|
||||
// srcAddr and dstAddr are expected to be pointer types,
|
||||
// so no check is made here.
|
||||
unsigned srcAS =
|
||||
dyn_cast<PointerType>(srcAddr->getType())->getAddressSpace();
|
||||
unsigned dstAS =
|
||||
dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
|
||||
|
||||
// Cast pointers to (char *)
|
||||
srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
|
||||
dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
|
||||
|
||||
IRBuilder<> loop(loopBB);
|
||||
// The loop index (ind) is a phi node.
|
||||
PHINode *ind = loop.CreatePHI(indType, 0);
|
||||
// Incoming value for ind is 0
|
||||
ind->addIncoming(ConstantInt::get(indType, 0), origBB);
|
||||
|
||||
// load from srcAddr+ind
|
||||
Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
|
||||
// store at dstAddr+ind
|
||||
loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
|
||||
|
||||
// The value for ind coming from backedge is (ind + 1)
|
||||
Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
|
||||
ind->addIncoming(newind, loopBB);
|
||||
|
||||
loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
|
||||
}
|
||||
|
||||
// Lower MemSetInst to loop
|
||||
static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
|
||||
Value *len, Value *val, LLVMContext &Context,
|
||||
Function &F) {
|
||||
BasicBlock *origBB = splitAt->getParent();
|
||||
BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
|
||||
BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
|
||||
|
||||
origBB->getTerminator()->setSuccessor(0, loopBB);
|
||||
IRBuilder<> builder(origBB, origBB->getTerminator());
|
||||
|
||||
unsigned dstAS =
|
||||
dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
|
||||
|
||||
// Cast pointer to the type of value getting stored
|
||||
dstAddr = builder.CreateBitCast(dstAddr,
|
||||
PointerType::get(val->getType(), dstAS));
|
||||
|
||||
IRBuilder<> loop(loopBB);
|
||||
PHINode *ind = loop.CreatePHI(len->getType(), 0);
|
||||
ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
|
||||
|
||||
loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
|
||||
|
||||
Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
|
||||
ind->addIncoming(newind, loopBB);
|
||||
|
||||
loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
|
||||
}
|
||||
|
||||
// Scan the function for "aggregate copy" patterns and rewrite each one as an
// explicit element-copy loop:
//   1. load/store pairs whose stored type is at least MaxAggrCopySize bytes,
//   2. memcpy/memmove intrinsics with a large (or variable) length,
//   3. memset intrinsics with a large (or variable) length.
// Candidates are collected first and rewritten afterwards so that erasing
// instructions does not invalidate the basic-block iteration.
// Returns true iff the function was modified.
bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
  SmallVector<LoadInst *, 4> aggrLoads;
  SmallVector<MemTransferInst *, 4> aggrMemcpys;
  SmallVector<MemSetInst *, 4> aggrMemsets;

  TargetData *TD = &getAnalysis<TargetData>();
  LLVMContext &Context = F.getParent()->getContext();

  //
  // Collect all the aggrLoads, aggrMemcpys and aggrMemsets.
  //
  //const BasicBlock *firstBB = &F.front(); // first BB in F
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    //BasicBlock *bb = BI;
    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
         ++II) {
      if (LoadInst * load = dyn_cast<LoadInst>(II)) {

        // Only a load whose single use is a store forms a copy pattern.
        if (load->hasOneUse() == false) continue;

        // Small copies are left alone.
        if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize) continue;

        User *use = *(load->use_begin());
        if (StoreInst * store = dyn_cast<StoreInst>(use)) {
          // The load must be the stored *value*, not the store address.
          if (store->getOperand(0) != load) //getValueOperand
            continue;
          aggrLoads.push_back(load);
        }
      } else if (MemTransferInst * intr = dyn_cast<MemTransferInst>(II)) {
        Value *len = intr->getLength();
        // If the number of elements being copied is greater
        // than MaxAggrCopySize, lower it to a loop
        if (ConstantInt * len_int = dyn_cast < ConstantInt > (len)) {
          if (len_int->getZExtValue() >= MaxAggrCopySize) {
            aggrMemcpys.push_back(intr);
          }
        } else {
          // turn variable length memcpy/memmov into loop
          aggrMemcpys.push_back(intr);
        }
      } else if (MemSetInst * memsetintr = dyn_cast<MemSetInst>(II)) {
        Value *len = memsetintr->getLength();
        if (ConstantInt * len_int = dyn_cast<ConstantInt>(len)) {
          if (len_int->getZExtValue() >= MaxAggrCopySize) {
            aggrMemsets.push_back(memsetintr);
          }
        } else {
          // turn variable length memset into loop
          aggrMemsets.push_back(memsetintr);
        }
      }
    }
  }
  // Nothing to rewrite.
  if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0)
      && (aggrMemsets.size() == 0)) return false;

  //
  // Do the transformation of an aggr load/copy/set to a loop
  //
  for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
    LoadInst *load = aggrLoads[i];
    StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
    Value *srcAddr = load->getOperand(0);
    Value *dstAddr = store->getOperand(1);
    unsigned numLoads = TD->getTypeStoreSize(load->getType());
    Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);

    convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
                          store->isVolatile(), Context, F);

    // Erase the store first; until then it is still the load's single use.
    store->eraseFromParent();
    load->eraseFromParent();
  }

  for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
    MemTransferInst *cpy = aggrMemcpys[i];
    Value *len = cpy->getLength();
    // llvm 2.7 version of memcpy does not have volatile
    // operand yet. So always making it non-volatile
    // optimistically, so that we don't see unnecessary
    // st.volatile in ptx
    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
                          false, Context, F);
    cpy->eraseFromParent();
  }

  for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
    MemSetInst *memsetinst = aggrMemsets[i];
    Value *len = memsetinst->getLength();
    Value *val = memsetinst->getValue();
    convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
                        F);
    memsetinst->eraseFromParent();
  }

  return true;
}
|
||||
|
||||
// Factory entry point: hand the pass manager a fresh instance of the
// aggregate-copy lowering pass (the caller takes ownership).
FunctionPass *llvm::createLowerAggrCopies() {
  NVPTXLowerAggrCopies *Pass = new NVPTXLowerAggrCopies();
  return Pass;
}
|
47
lib/Target/NVPTX/NVPTXLowerAggrCopies.h
Normal file
47
lib/Target/NVPTX/NVPTXLowerAggrCopies.h
Normal file
@ -0,0 +1,47 @@
|
||||
//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declaration of the NVIDIA specific lowering of
|
||||
// aggregate copies
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_LOWER_AGGR_COPIES_H
|
||||
#define NVPTX_LOWER_AGGR_COPIES_H
|
||||
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
||||
#include "llvm/Target/TargetData.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// NVPTXLowerAggrCopies - FunctionPass that replaces large aggregate
// load/store pairs and memcpy/memmove/memset intrinsics with explicit
// element-wise loops (implementation in NVPTXLowerAggrCopies.cpp).
struct NVPTXLowerAggrCopies : public FunctionPass {
  static char ID;  // Pass identification, replacement for typeid.

  NVPTXLowerAggrCopies() : FunctionPass(ID) {}

  // Needs TargetData for type-size queries; machine-level analysis is
  // left untouched.
  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<TargetData>();
    AU.addPreserved<MachineFunctionAnalysis>();
  }

  virtual bool runOnFunction(Function &F);

  // Copies/sets of at least this many bytes get lowered to loops.
  static const unsigned MaxAggrCopySize = 128;

  virtual const char *getPassName() const {
    return "Lower aggregate copies/intrinsics into loops";
  }
};
|
||||
|
||||
extern FunctionPass *createLowerAggrCopies();
|
||||
}
|
||||
|
||||
#endif
|
20
lib/Target/NVPTX/NVPTXNumRegisters.h
Normal file
20
lib/Target/NVPTX/NVPTXNumRegisters.h
Normal file
@ -0,0 +1,20 @@
|
||||
|
||||
//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_NUM_REGISTERS_H
|
||||
#define NVPTX_NUM_REGISTERS_H
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// Number of NVPTX registers — presumably matches the register pool declared
// in NVPTXRegisterInfo.td; TODO confirm against the .td file.
const unsigned NVPTXNumRegisters = 396;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
332
lib/Target/NVPTX/NVPTXRegisterInfo.cpp
Normal file
332
lib/Target/NVPTX/NVPTXRegisterInfo.cpp
Normal file
@ -0,0 +1,332 @@
|
||||
//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the NVPTX implementation of the TargetRegisterInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#define DEBUG_TYPE "nvptx-reg-info"
|
||||
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXRegisterInfo.h"
|
||||
#include "NVPTXSubtarget.h"
|
||||
#include "llvm/ADT/BitVector.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/MC/MachineLocation.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
// Map a register class to the PTX type suffix (".f32", ".s64", ".pred", ...)
// used when declaring virtual registers of that class. Unknown classes map
// to "INTERNAL".
std::string getNVPTXRegClassName (TargetRegisterClass const *RC) {
  if (RC == &NVPTX::Float32RegsRegClass)  return ".f32";
  if (RC == &NVPTX::Float64RegsRegClass)  return ".f64";
  if (RC == &NVPTX::Int64RegsRegClass)    return ".s64";
  if (RC == &NVPTX::Int32RegsRegClass)    return ".s32";
  if (RC == &NVPTX::Int16RegsRegClass)    return ".s16";
  // Int8Regs become 16-bit registers in PTX
  if (RC == &NVPTX::Int8RegsRegClass)     return ".s16";
  if (RC == &NVPTX::Int1RegsRegClass)     return ".pred";
  if (RC == &NVPTX::SpecialRegsRegClass)  return "!Special!";
  if (RC == &NVPTX::V2F32RegsRegClass)    return ".v2.f32";
  if (RC == &NVPTX::V4F32RegsRegClass)    return ".v4.f32";
  if (RC == &NVPTX::V2I32RegsRegClass)    return ".v2.s32";
  if (RC == &NVPTX::V4I32RegsRegClass)    return ".v4.s32";
  if (RC == &NVPTX::V2F64RegsRegClass)    return ".v2.f64";
  if (RC == &NVPTX::V2I64RegsRegClass)    return ".v2.s64";
  if (RC == &NVPTX::V2I16RegsRegClass)    return ".v2.s16";
  if (RC == &NVPTX::V4I16RegsRegClass)    return ".v4.s16";
  // i8 vector classes are also emitted with 16-bit elements.
  if (RC == &NVPTX::V2I8RegsRegClass)     return ".v2.s16";
  if (RC == &NVPTX::V4I8RegsRegClass)     return ".v4.s16";
  return "INTERNAL";
}
|
||||
|
||||
// Map a register class to the virtual-register name prefix ("%f", "%rd",
// "%p", ...) used when printing registers of that class. Unknown classes
// map to "INTERNAL".
std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) {
  if (RC == &NVPTX::Float32RegsRegClass)  return "%f";
  if (RC == &NVPTX::Float64RegsRegClass)  return "%fd";
  if (RC == &NVPTX::Int64RegsRegClass)    return "%rd";
  if (RC == &NVPTX::Int32RegsRegClass)    return "%r";
  if (RC == &NVPTX::Int16RegsRegClass)    return "%rs";
  if (RC == &NVPTX::Int8RegsRegClass)     return "%rc";
  if (RC == &NVPTX::Int1RegsRegClass)     return "%p";
  if (RC == &NVPTX::SpecialRegsRegClass)  return "!Special!";
  if (RC == &NVPTX::V2F32RegsRegClass)    return "%v2f";
  if (RC == &NVPTX::V4F32RegsRegClass)    return "%v4f";
  if (RC == &NVPTX::V2I32RegsRegClass)    return "%v2r";
  if (RC == &NVPTX::V4I32RegsRegClass)    return "%v4r";
  if (RC == &NVPTX::V2F64RegsRegClass)    return "%v2fd";
  if (RC == &NVPTX::V2I64RegsRegClass)    return "%v2rd";
  if (RC == &NVPTX::V2I16RegsRegClass)    return "%v2s";
  if (RC == &NVPTX::V4I16RegsRegClass)    return "%v4rs";
  if (RC == &NVPTX::V2I8RegsRegClass)     return "%v2rc";
  if (RC == &NVPTX::V4I8RegsRegClass)     return "%v4rc";
  return "INTERNAL";
}
|
||||
|
||||
// True iff RC is one of the V2*/V4* vector register classes.
bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) {
  switch (RC->getID()) {
  case NVPTX::V2F32RegsRegClassID:
  case NVPTX::V2F64RegsRegClassID:
  case NVPTX::V2I8RegsRegClassID:
  case NVPTX::V2I16RegsRegClassID:
  case NVPTX::V2I32RegsRegClassID:
  case NVPTX::V2I64RegsRegClassID:
  case NVPTX::V4F32RegsRegClassID:
  case NVPTX::V4I8RegsRegClassID:
  case NVPTX::V4I16RegsRegClassID:
  case NVPTX::V4I32RegsRegClassID:
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
// PTX type string of the scalar element class of vector class RC.
// Asserts on non-vector classes ("Unsupported" in release builds).
std::string getNVPTXElemClassName(TargetRegisterClass const *RC) {
  switch (RC->getID()) {
  case NVPTX::V2F32RegsRegClassID:
  case NVPTX::V4F32RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass);
  case NVPTX::V2F64RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass);
  case NVPTX::V2I8RegsRegClassID:
  case NVPTX::V4I8RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass);
  case NVPTX::V2I16RegsRegClassID:
  case NVPTX::V4I16RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass);
  case NVPTX::V2I32RegsRegClassID:
  case NVPTX::V4I32RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass);
  case NVPTX::V2I64RegsRegClassID:
    return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass);
  default:
    assert(0 && "Not a vector register class");
    return "Unsupported";
  }
}
|
||||
|
||||
// Scalar element register class of vector class RC.
// Asserts on non-vector classes (null in release builds).
const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) {
  switch (RC->getID()) {
  case NVPTX::V2F32RegsRegClassID:
  case NVPTX::V4F32RegsRegClassID:
    return (&NVPTX::Float32RegsRegClass);
  case NVPTX::V2F64RegsRegClassID:
    return (&NVPTX::Float64RegsRegClass);
  case NVPTX::V2I8RegsRegClassID:
  case NVPTX::V4I8RegsRegClassID:
    return (&NVPTX::Int8RegsRegClass);
  case NVPTX::V2I16RegsRegClassID:
  case NVPTX::V4I16RegsRegClassID:
    return (&NVPTX::Int16RegsRegClass);
  case NVPTX::V2I32RegsRegClassID:
  case NVPTX::V4I32RegsRegClassID:
    return (&NVPTX::Int32RegsRegClass);
  case NVPTX::V2I64RegsRegClassID:
    return (&NVPTX::Int64RegsRegClass);
  default:
    assert(0 && "Not a vector register class");
    return 0;
  }
}
|
||||
|
||||
// Number of scalar elements (2 or 4) held by vector register class RC.
// Asserts on non-vector classes (-1 in release builds).
int getNVPTXVectorSize(TargetRegisterClass const *RC) {
  switch (RC->getID()) {
  case NVPTX::V2F32RegsRegClassID:
  case NVPTX::V2F64RegsRegClassID:
  case NVPTX::V2I8RegsRegClassID:
  case NVPTX::V2I16RegsRegClassID:
  case NVPTX::V2I32RegsRegClassID:
  case NVPTX::V2I64RegsRegClassID:
    return 2;
  case NVPTX::V4F32RegsRegClassID:
  case NVPTX::V4I8RegsRegClassID:
  case NVPTX::V4I16RegsRegClassID:
  case NVPTX::V4I32RegsRegClassID:
    return 4;
  default:
    assert(0 && "Not a vector register class");
    return -1;
  }
}
|
||||
}
|
||||
|
||||
// Construct register info for the given subtarget; pointer width is cached
// from the subtarget for later queries.
NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii,
                                     const NVPTXSubtarget &st)
  : NVPTXGenRegisterInfo(0),  // 0: no return-address reg (see getRARegister)
    TII(tii),
    ST(st) {
  Is64Bit = st.is64Bit();
}
|
||||
|
||||
|
||||
#define GET_REGINFO_TARGET_DESC
|
||||
#include "NVPTXGenRegisterInfo.inc"
|
||||
|
||||
/// NVPTX Callee Saved Registers
/// No callee-saved registers are reported; the list contains only the
/// terminating zero.
const uint16_t* NVPTXRegisterInfo::
getCalleeSavedRegs(const MachineFunction *MF) const {
  static const uint16_t CalleeSavedRegs[] = { 0 };
  return CalleeSavedRegs;
}
|
||||
|
||||
// NVPTX Callee Saved Reg Classes
// Mirrors getCalleeSavedRegs: an empty, null-terminated list.
const TargetRegisterClass* const*
NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
  return CalleeSavedRegClasses;
}
|
||||
|
||||
// No registers are reserved on NVPTX: return an all-clear bit vector sized
// to the register count.
BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  return Reserved;
}
|
||||
|
||||
// Rewrite a frame-index operand into a [VRFrame + offset] register/immediate
// pair. Assumes the frame-index operand is immediately followed by an
// immediate offset operand.
void NVPTXRegisterInfo::
eliminateFrameIndex(MachineBasicBlock::iterator II,
                    int SPAdj,
                    RegScavenger *RS) const {
  assert(SPAdj == 0 && "Unexpected");

  // Locate the first frame-index operand of this instruction.
  unsigned i = 0;
  MachineInstr &MI = *II;
  while (!MI.getOperand(i).isFI()) {
    ++i;
    assert(i < MI.getNumOperands() &&
           "Instr doesn't have FrameIndex operand!");
  }

  int FrameIndex = MI.getOperand(i).getIndex();

  // Fold the object's frame offset into the immediate that follows the
  // frame-index operand.
  MachineFunction &MF = *MI.getParent()->getParent();
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
               MI.getOperand(i+1).getImm();

  // Using I0 as the frame pointer
  MI.getOperand(i).ChangeToRegister(NVPTX::VRFrame, false);
  MI.getOperand(i+1).ChangeToImmediate(Offset);
}
|
||||
|
||||
|
||||
// DWARF register numbering: everything maps to 0 (PTX emits no DWARF
// register info here).
int NVPTXRegisterInfo::
getDwarfRegNum(unsigned RegNum, bool isEH) const {
  return 0;
}
|
||||
|
||||
// VRFrame serves as the frame pointer (see eliminateFrameIndex).
unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  return NVPTX::VRFrame;
}
|
||||
|
||||
// No return-address register is exposed; report 0.
unsigned NVPTXRegisterInfo::getRARegister() const {
  return 0;
}
|
||||
|
||||
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions.
// No call-frame setup code is emitted for them; the pseudos are dropped.
void NVPTXRegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  // Simply discard ADJCALLSTACKDOWN,
  // ADJCALLSTACKUP instructions.
  MBB.erase(I);
}
|
94
lib/Target/NVPTX/NVPTXRegisterInfo.h
Normal file
94
lib/Target/NVPTX/NVPTXRegisterInfo.h
Normal file
@ -0,0 +1,94 @@
|
||||
//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the NVPTX implementation of the TargetRegisterInfo class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXREGISTERINFO_H
|
||||
#define NVPTXREGISTERINFO_H
|
||||
|
||||
#include "ManagedStringPool.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
|
||||
|
||||
#define GET_REGINFO_HEADER
|
||||
#include "NVPTXGenRegisterInfo.inc"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include <sstream>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// Forward Declarations.
|
||||
class TargetInstrInfo;
|
||||
class NVPTXSubtarget;
|
||||
|
||||
// NVPTXRegisterInfo - NVPTX implementation of TargetRegisterInfo.
// See NVPTXRegisterInfo.cpp for method semantics.
class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
private:
  const TargetInstrInfo &TII;  // Instruction info for this target.
  const NVPTXSubtarget &ST;    // Owning subtarget.
  bool Is64Bit;                // Cached pointer width from the subtarget.
  // Hold Strings that can be free'd all together with NVPTXRegisterInfo
  ManagedStringPool ManagedStrPool;

public:
  NVPTXRegisterInfo(const TargetInstrInfo &tii,
                    const NVPTXSubtarget &st);


  //------------------------------------------------------
  // Pure virtual functions from TargetRegisterInfo
  //------------------------------------------------------

  // NVPTX callee saved registers
  virtual const uint16_t*
  getCalleeSavedRegs(const MachineFunction *MF = 0) const;

  // NVPTX callee saved register classes
  virtual const TargetRegisterClass* const *
  getCalleeSavedRegClasses(const MachineFunction *MF) const;

  virtual BitVector getReservedRegs(const MachineFunction &MF) const;

  // Replaces a frame-index operand with VRFrame + offset.
  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                   int SPAdj,
                                   RegScavenger *RS=NULL) const;

  // Discards ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudos.
  void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
  virtual unsigned getFrameRegister(const MachineFunction &MF) const;
  virtual unsigned getRARegister() const;

  // Pool of lazily-created register-name strings; freed together with
  // this object.
  ManagedStringPool *getStrPool() const {
    return const_cast<ManagedStringPool *>(&ManagedStrPool);
  }

  // Synthesizes a name ("regN") for RegNo; the returned pointer stays
  // valid for the lifetime of the string pool.
  const char *getName(unsigned RegNo) const {
    std::stringstream O;
    O << "reg" << RegNo;
    return getStrPool()->getManagedString(O.str().c_str())->c_str();
  }

};
|
||||
|
||||
|
||||
std::string getNVPTXRegClassName (const TargetRegisterClass *RC);
|
||||
std::string getNVPTXRegClassStr (const TargetRegisterClass *RC);
|
||||
bool isNVPTXVectorRegClass (const TargetRegisterClass *RC);
|
||||
std::string getNVPTXElemClassName (const TargetRegisterClass *RC);
|
||||
int getNVPTXVectorSize (const TargetRegisterClass *RC);
|
||||
const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC);
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
|
||||
#endif
|
7235
lib/Target/NVPTX/NVPTXRegisterInfo.td
Normal file
7235
lib/Target/NVPTX/NVPTXRegisterInfo.td
Normal file
File diff suppressed because it is too large
Load Diff
45
lib/Target/NVPTX/NVPTXSection.h
Normal file
45
lib/Target/NVPTX/NVPTXSection.h
Normal file
@ -0,0 +1,45 @@
|
||||
//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the NVPTXSection class.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_NVPTXSECTION_H
|
||||
#define LLVM_NVPTXSECTION_H
|
||||
|
||||
#include "llvm/MC/MCSection.h"
|
||||
#include "llvm/GlobalVariable.h"
|
||||
#include <vector>
|
||||
|
||||
namespace llvm {
|
||||
/// NVPTXSection - Represents a section in PTX
|
||||
/// PTX does not have sections. We create this class in order to use
|
||||
/// the ASMPrint interface.
|
||||
///
|
||||
class NVPTXSection : public MCSection {
|
||||
|
||||
public:
|
||||
NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
|
||||
~NVPTXSection() {};
|
||||
|
||||
/// Override this as NVPTX has its own way of printing switching
|
||||
/// to a section.
|
||||
virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
|
||||
raw_ostream &OS) const {}
|
||||
|
||||
/// Base address of PTX sections is zero.
|
||||
virtual bool isBaseAddressKnownZero() const { return true; }
|
||||
virtual bool UseCodeAlign() const { return false; }
|
||||
virtual bool isVirtualSection() const { return false; }
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
77
lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
Normal file
77
lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
Normal file
@ -0,0 +1,77 @@
|
||||
//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier --*- C++ -*--===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Split basic blocks so that a basic block that contains a barrier instruction
|
||||
// only contains the barrier instruction.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/Instructions.h"
|
||||
#include "llvm/Intrinsics.h"
|
||||
#include "llvm/IntrinsicInst.h"
|
||||
#include "llvm/Support/InstIterator.h"
|
||||
#include "NVPTXUtilities.h"
|
||||
#include "NVPTXSplitBBatBar.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace llvm {
|
||||
FunctionPass *createSplitBBatBarPass();
|
||||
}
|
||||
|
||||
char NVPTXSplitBBatBar::ID = 0;
|
||||
|
||||
// Split basic blocks so that every barrier intrinsic sits alone: a split
// point is recorded before a barrier (unless it starts the block) and after
// it (unless the next instruction is the terminator). Splitting happens
// after collection so the iterators stay valid. Returns true iff the
// function was modified.
bool NVPTXSplitBBatBar::runOnFunction(Function &F) {

  SmallVector<Instruction *, 4> SplitPoints;
  bool changed = false;

  // Collect all the split points in SplitPoints
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    BasicBlock::iterator IB = BI->begin();
    BasicBlock::iterator II = IB;
    BasicBlock::iterator IE = BI->end();

    // Skip the first instruction. No splitting is needed at this
    // point even if this is a bar.
    while (II != IE) {
      if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
        Intrinsic::ID id = inst->getIntrinsicID();
        // If this is a barrier, split at this instruction
        // and the next instruction.
        if (llvm::isBarrierIntrinsic(id)) {
          if (II != IB)
            SplitPoints.push_back(II);
          II++;
          if ((II != IE) && (!II->isTerminator())) {
            SplitPoints.push_back(II);
            II++;
          }
          continue;
        }
      }
      II++;
    }
  }

  // Perform the splits now that iteration above is complete.
  for (unsigned i = 0; i != SplitPoints.size(); i++) {
    changed = true;
    Instruction *inst = SplitPoints[i];
    inst->getParent()->splitBasicBlock(inst, "bar_split");
  }

  return changed;
}
|
||||
|
||||
// This interface will most likely not be necessary, because this pass will
// not be invoked by the driver, but will be used as a prerequisite to
// another pass.
// Factory function: returns a fresh pass instance owned by the caller /
// pass manager.
FunctionPass *llvm::createSplitBBatBarPass() {
  return new NVPTXSplitBBatBar();
}
|
41
lib/Target/NVPTX/NVPTXSplitBBatBar.h
Normal file
41
lib/Target/NVPTX/NVPTXSplitBBatBar.h
Normal file
@ -0,0 +1,41 @@
|
||||
//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declaration of the NVIDIA specific declarations
|
||||
// for splitting basic blocks at barrier instructions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_SPLIT_BB_AT_BAR_H
|
||||
#define NVPTX_SPLIT_BB_AT_BAR_H
|
||||
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// NVPTXSplitBBatBar - FunctionPass that splits basic blocks around barrier
// intrinsics so each barrier ends up in a block of its own
// (implementation in NVPTXSplitBBatBar.cpp).
struct NVPTXSplitBBatBar : public FunctionPass {
  static char ID;  // Pass identification.

  NVPTXSplitBBatBar() : FunctionPass(ID) {}
  // IR-level transform; machine-level analysis is left untouched.
  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addPreserved<MachineFunctionAnalysis>();
  }
  virtual bool runOnFunction(Function &F);

  virtual const char *getPassName() const {
    return "Split basic blocks at barrier";
  }
};
|
||||
|
||||
extern FunctionPass *createSplitBBatBarPass();
|
||||
}
|
||||
|
||||
#endif //NVPTX_SPLIT_BB_AT_BAR_H
|
57
lib/Target/NVPTX/NVPTXSubtarget.cpp
Normal file
57
lib/Target/NVPTX/NVPTXSubtarget.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the NVPTX specific subclass of TargetSubtarget.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXSubtarget.h"
|
||||
#define GET_SUBTARGETINFO_ENUM
|
||||
#define GET_SUBTARGETINFO_TARGET_DESC
|
||||
#define GET_SUBTARGETINFO_CTOR
|
||||
#include "NVPTXGenSubtargetInfo.inc"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Select Driver Interface
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
namespace {
|
||||
cl::opt<NVPTX::DrvInterface>
|
||||
DriverInterface(cl::desc("Choose driver interface:"),
|
||||
cl::values(
|
||||
clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"),
|
||||
clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"),
|
||||
clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"),
|
||||
clEnumValEnd),
|
||||
cl::init(NVPTX::NVCL));
|
||||
}
|
||||
|
||||
// Build the subtarget: record the driver interface from the command-line
// option, pick the CPU name ("sm_10" by default), and derive SmVersion
// from it.
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
                               const std::string &FS, bool is64Bit)
:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget,
                                    // because we don't register all
                                    // nvptx targets.
Is64Bit(is64Bit) {

  drvInterface = DriverInterface;

  // Provide the default CPU if none
  std::string defCPU = "sm_10";

  // Get the TargetName from the FS if available
  if (FS.empty() && CPU.empty())
    TargetName = defCPU;
  else if (!CPU.empty())
    TargetName = CPU;
  else
    llvm_unreachable("we are not using FeatureStr");

  // Set up the SmVersion
  // NOTE(review): assumes TargetName has the form "sm_NN"; atoi on a
  // malformed name silently yields 0 — confirm upstream validation.
  SmVersion = atoi(TargetName.c_str()+3);
}
|
92
lib/Target/NVPTX/NVPTXSubtarget.h
Normal file
92
lib/Target/NVPTX/NVPTXSubtarget.h
Normal file
@ -0,0 +1,92 @@
|
||||
//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the NVPTX specific subclass of TargetSubtarget.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXSUBTARGET_H
|
||||
#define NVPTXSUBTARGET_H
|
||||
|
||||
#include "llvm/Target/TargetSubtargetInfo.h"
|
||||
#include "NVPTX.h"
|
||||
|
||||
#define GET_SUBTARGETINFO_HEADER
|
||||
#include "NVPTXGenSubtargetInfo.inc"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
// NVPTXSubtarget - NVPTX-specific subtarget state: SM version, CPU name,
// driver interface, and pointer width.
class NVPTXSubtarget : public NVPTXGenSubtargetInfo {

  unsigned int SmVersion;            // SM version derived from the CPU name.
  std::string TargetName;            // CPU name string (e.g. "sm_10").
  NVPTX::DrvInterface drvInterface;  // Selected driver interface.
  bool dummy; // For the 'dummy' feature, see NVPTX.td
  bool Is64Bit;                      // Generating 64-bit PTX?

public:
  /// This constructor initializes the data members to match that
  /// of the specified module.
  ///
  NVPTXSubtarget(const std::string &TT, const std::string &CPU,
                 const std::string &FS, bool is64Bit);

  // Capability predicates: each feature is gated on the minimum
  // SM version that provides it.
  bool hasBrkPt() const { return SmVersion >= 11; }
  bool hasAtomRedG32() const { return SmVersion >= 11; }
  bool hasAtomRedS32() const { return SmVersion >= 12; }
  bool hasAtomRedG64() const { return SmVersion >= 12; }
  bool hasAtomRedS64() const { return SmVersion >= 20; }
  bool hasAtomRedGen32() const { return SmVersion >= 20; }
  bool hasAtomRedGen64() const { return SmVersion >= 20; }
  bool hasAtomAddF32() const { return SmVersion >= 20; }
  bool hasVote() const { return SmVersion >= 12; }
  bool hasDouble() const { return SmVersion >= 13; }
  bool reqPTX20() const { return SmVersion >= 20; }
  bool hasF32FTZ() const { return SmVersion >= 20; }
  bool hasFMAF32() const { return SmVersion >= 20; }
  bool hasFMAF64() const { return SmVersion >= 13; }
  bool hasLDU() const { return SmVersion >= 20; }
  bool hasGenericLdSt() const { return SmVersion >= 20; }
  // Rotate support: never in hardware; 32-bit software rotate is always
  // reported available.
  inline bool hasHWROT32() const { return false; }
  inline bool hasSWROT32() const {
    return true;
  }
  inline bool hasROT32() const { return hasHWROT32() || hasSWROT32() ; }
  inline bool hasROT64() const { return SmVersion >= 20; }


  bool is64Bit() const { return Is64Bit; }

  unsigned int getSmVersion() const { return SmVersion; }
  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
  std::string getTargetName() const { return TargetName; }

  // Generated by tblgen from the .td files.
  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);

  // Target data-layout string; only the pointer size differs between the
  // 32-bit and 64-bit variants.
  std::string getDataLayout() const {
    const char *p;
    if (is64Bit())
      p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
          "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
          "n16:32:64";
    else
      p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
          "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
          "n16:32:64";

    return std::string(p);
  }

};
|
||||
|
||||
} // End llvm namespace
|
||||
|
||||
#endif // NVPTXSUBTARGET_H
|
133
lib/Target/NVPTX/NVPTXTargetMachine.cpp
Normal file
133
lib/Target/NVPTX/NVPTXTargetMachine.cpp
Normal file
@ -0,0 +1,133 @@
|
||||
//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Top-level implementation for the NVPTX target.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXTargetMachine.h"
|
||||
#include "NVPTX.h"
|
||||
#include "NVPTXSplitBBatBar.h"
|
||||
#include "NVPTXLowerAggrCopies.h"
|
||||
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
|
||||
#include "NVPTXAllocaHoisting.h"
|
||||
#include "llvm/PassManager.h"
|
||||
#include "llvm/Analysis/Passes.h"
|
||||
#include "llvm/Analysis/Verifier.h"
|
||||
#include "llvm/Assembly/PrintModulePass.h"
|
||||
#include "llvm/ADT/OwningPtr.h"
|
||||
#include "llvm/CodeGen/AsmPrinter.h"
|
||||
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
|
||||
#include "llvm/CodeGen/MachineModuleInfo.h"
|
||||
#include "llvm/CodeGen/Passes.h"
|
||||
#include "llvm/MC/MCAsmInfo.h"
|
||||
#include "llvm/MC/MCInstrInfo.h"
|
||||
#include "llvm/MC/MCStreamer.h"
|
||||
#include "llvm/MC/MCSubtargetInfo.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Target/TargetData.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/Target/TargetLowering.h"
|
||||
#include "llvm/Target/TargetLoweringObjectFile.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/Target/TargetSubtargetInfo.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/FormattedStream.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
||||
extern "C" void LLVMInitializeNVPTXTarget() {
|
||||
// Register the target.
|
||||
RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
|
||||
RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
|
||||
|
||||
RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
|
||||
RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
|
||||
|
||||
}
|
||||
|
||||
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T,
|
||||
StringRef TT,
|
||||
StringRef CPU,
|
||||
StringRef FS,
|
||||
const TargetOptions& Options,
|
||||
Reloc::Model RM,
|
||||
CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL,
|
||||
bool is64bit)
|
||||
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
|
||||
Subtarget(TT, CPU, FS, is64bit),
|
||||
DataLayout(Subtarget.getDataLayout()),
|
||||
InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit)
|
||||
/*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
|
||||
}
|
||||
|
||||
|
||||
|
||||
void NVPTXTargetMachine32::anchor() {}
|
||||
|
||||
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, StringRef TT,
|
||||
StringRef CPU, StringRef FS,
|
||||
const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL)
|
||||
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
|
||||
}
|
||||
|
||||
void NVPTXTargetMachine64::anchor() {}
|
||||
|
||||
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, StringRef TT,
|
||||
StringRef CPU, StringRef FS,
|
||||
const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL)
|
||||
: NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
|
||||
}
|
||||
|
||||
|
||||
namespace llvm {
|
||||
class NVPTXPassConfig : public TargetPassConfig {
|
||||
public:
|
||||
NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
|
||||
: TargetPassConfig(TM, PM) {}
|
||||
|
||||
NVPTXTargetMachine &getNVPTXTargetMachine() const {
|
||||
return getTM<NVPTXTargetMachine>();
|
||||
}
|
||||
|
||||
virtual bool addInstSelector();
|
||||
virtual bool addPreRegAlloc();
|
||||
};
|
||||
}
|
||||
|
||||
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
|
||||
NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
|
||||
return PassConfig;
|
||||
}
|
||||
|
||||
bool NVPTXPassConfig::addInstSelector() {
|
||||
PM->add(createLowerAggrCopies());
|
||||
PM->add(createSplitBBatBarPass());
|
||||
PM->add(createAllocaHoisting());
|
||||
PM->add(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
|
||||
PM->add(createVectorElementizePass(getNVPTXTargetMachine()));
|
||||
return false;
|
||||
}
|
||||
|
||||
bool NVPTXPassConfig::addPreRegAlloc() {
|
||||
return false;
|
||||
}
|
131
lib/Target/NVPTX/NVPTXTargetMachine.h
Normal file
131
lib/Target/NVPTX/NVPTXTargetMachine.h
Normal file
@ -0,0 +1,131 @@
|
||||
//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the NVPTX specific subclass of TargetMachine.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
||||
#ifndef NVPTX_TARGETMACHINE_H
|
||||
#define NVPTX_TARGETMACHINE_H
|
||||
|
||||
#include "NVPTXInstrInfo.h"
|
||||
#include "NVPTXISelLowering.h"
|
||||
#include "NVPTXRegisterInfo.h"
|
||||
#include "NVPTXSubtarget.h"
|
||||
#include "NVPTXFrameLowering.h"
|
||||
#include "ManagedStringPool.h"
|
||||
#include "llvm/Target/TargetData.h"
|
||||
#include "llvm/Target/TargetFrameLowering.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetSelectionDAGInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// NVPTXTargetMachine
|
||||
///
|
||||
class NVPTXTargetMachine : public LLVMTargetMachine {
|
||||
NVPTXSubtarget Subtarget;
|
||||
const TargetData DataLayout; // Calculates type size & alignment
|
||||
NVPTXInstrInfo InstrInfo;
|
||||
NVPTXTargetLowering TLInfo;
|
||||
TargetSelectionDAGInfo TSInfo;
|
||||
|
||||
// NVPTX does not have any call stack frame, but need a NVPTX specific
|
||||
// FrameLowering class because TargetFrameLowering is abstract.
|
||||
NVPTXFrameLowering FrameLowering;
|
||||
|
||||
// Hold Strings that can be free'd all together with NVPTXTargetMachine
|
||||
ManagedStringPool ManagedStrPool;
|
||||
|
||||
//bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
|
||||
// bool DisableVerify, MCContext *&OutCtx);
|
||||
|
||||
public:
|
||||
//virtual bool addPassesToEmitFile(PassManagerBase &PM,
|
||||
// formatted_raw_ostream &Out,
|
||||
// CodeGenFileType FileType,
|
||||
// CodeGenOpt::Level OptLevel,
|
||||
// bool DisableVerify = true) ;
|
||||
|
||||
NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU,
|
||||
StringRef FS, const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OP,
|
||||
bool is64bit);
|
||||
|
||||
virtual const TargetFrameLowering *getFrameLowering() const {
|
||||
return &FrameLowering;
|
||||
}
|
||||
virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
|
||||
virtual const TargetData *getTargetData() const { return &DataLayout;}
|
||||
virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget;}
|
||||
|
||||
virtual const NVPTXRegisterInfo *getRegisterInfo() const {
|
||||
return &(InstrInfo.getRegisterInfo());
|
||||
}
|
||||
|
||||
virtual NVPTXTargetLowering *getTargetLowering() const {
|
||||
return const_cast<NVPTXTargetLowering*>(&TLInfo);
|
||||
}
|
||||
|
||||
virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
|
||||
return &TSInfo;
|
||||
}
|
||||
|
||||
//virtual bool addInstSelector(PassManagerBase &PM,
|
||||
// CodeGenOpt::Level OptLevel);
|
||||
|
||||
//virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level);
|
||||
|
||||
ManagedStringPool *getManagedStrPool() const {
|
||||
return const_cast<ManagedStringPool*>(&ManagedStrPool);
|
||||
}
|
||||
|
||||
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
|
||||
|
||||
// Emission of machine code through JITCodeEmitter is not supported.
|
||||
virtual bool addPassesToEmitMachineCode(PassManagerBase &,
|
||||
JITCodeEmitter &,
|
||||
bool = true) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Emission of machine code through MCJIT is not supported.
|
||||
virtual bool addPassesToEmitMC(PassManagerBase &,
|
||||
MCContext *&,
|
||||
raw_ostream &,
|
||||
bool = true) {
|
||||
return true;
|
||||
}
|
||||
|
||||
}; // NVPTXTargetMachine.
|
||||
|
||||
class NVPTXTargetMachine32 : public NVPTXTargetMachine {
|
||||
virtual void anchor();
|
||||
public:
|
||||
NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
|
||||
StringRef FS, const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL);
|
||||
};
|
||||
|
||||
class NVPTXTargetMachine64 : public NVPTXTargetMachine {
|
||||
virtual void anchor();
|
||||
public:
|
||||
NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
|
||||
StringRef FS, const TargetOptions &Options,
|
||||
Reloc::Model RM, CodeModel::Model CM,
|
||||
CodeGenOpt::Level OL);
|
||||
};
|
||||
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
105
lib/Target/NVPTX/NVPTXTargetObjectFile.h
Normal file
105
lib/Target/NVPTX/NVPTXTargetObjectFile.h
Normal file
@ -0,0 +1,105 @@
|
||||
//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
|
||||
#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
|
||||
|
||||
#include "NVPTXSection.h"
|
||||
#include "llvm/Target/TargetLoweringObjectFile.h"
|
||||
#include <string>
|
||||
|
||||
namespace llvm {
|
||||
class GlobalVariable;
|
||||
class Module;
|
||||
|
||||
class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
|
||||
|
||||
public:
|
||||
NVPTXTargetObjectFile() {};
|
||||
~NVPTXTargetObjectFile() {
|
||||
delete TextSection;
|
||||
delete DataSection;
|
||||
delete BSSSection;
|
||||
delete ReadOnlySection;
|
||||
|
||||
delete StaticCtorSection;
|
||||
delete StaticDtorSection;
|
||||
delete LSDASection;
|
||||
delete EHFrameSection;
|
||||
delete DwarfAbbrevSection;
|
||||
delete DwarfInfoSection;
|
||||
delete DwarfLineSection;
|
||||
delete DwarfFrameSection;
|
||||
delete DwarfPubTypesSection;
|
||||
delete DwarfDebugInlineSection;
|
||||
delete DwarfStrSection;
|
||||
delete DwarfLocSection;
|
||||
delete DwarfARangesSection;
|
||||
delete DwarfRangesSection;
|
||||
delete DwarfMacroInfoSection;
|
||||
};
|
||||
|
||||
virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
|
||||
TextSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getText());
|
||||
DataSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getDataRel());
|
||||
BSSSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getBSS());
|
||||
ReadOnlySection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getReadOnly());
|
||||
|
||||
StaticCtorSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
StaticDtorSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
LSDASection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
EHFrameSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfAbbrevSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfInfoSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfLineSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfFrameSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfPubTypesSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfDebugInlineSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfStrSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfLocSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfARangesSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
DwarfMacroInfoSection = new NVPTXSection(MCSection::SV_ELF,
|
||||
SectionKind::getMetadata());
|
||||
};
|
||||
|
||||
virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
|
||||
return ReadOnlySection;
|
||||
};
|
||||
|
||||
virtual const MCSection *
|
||||
getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
|
||||
Mangler *Mang,
|
||||
const TargetMachine &TM) const {
|
||||
return DataSection;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
#endif
|
514
lib/Target/NVPTX/NVPTXUtilities.cpp
Normal file
514
lib/Target/NVPTX/NVPTXUtilities.cpp
Normal file
@ -0,0 +1,514 @@
|
||||
//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains miscellaneous utility functions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXUtilities.h"
|
||||
#include "NVPTX.h"
|
||||
#include "llvm/GlobalVariable.h"
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/Module.h"
|
||||
#include "llvm/Constants.h"
|
||||
#include "llvm/Operator.h"
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
//#include <iostream>
|
||||
#include "llvm/Support/ManagedStatic.h"
|
||||
#include "llvm/Support/InstIterator.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
|
||||
typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
|
||||
typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
|
||||
|
||||
ManagedStatic<per_module_annot_t> annotationCache;
|
||||
|
||||
|
||||
static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
|
||||
assert(md && "Invalid mdnode for annotation");
|
||||
assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
|
||||
// start index = 1, to skip the global variable key
|
||||
// increment = 2, to skip the value for each property-value pairs
|
||||
for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
|
||||
// property
|
||||
const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
|
||||
assert(prop && "Annotation property not a string");
|
||||
|
||||
// value
|
||||
ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i+1));
|
||||
assert(Val && "Value operand not a constant int");
|
||||
|
||||
std::string keyname = prop->getString().str();
|
||||
if (retval.find(keyname) != retval.end())
|
||||
retval[keyname].push_back(Val->getZExtValue());
|
||||
else {
|
||||
std::vector<unsigned> tmp;
|
||||
tmp.push_back(Val->getZExtValue());
|
||||
retval[keyname] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
|
||||
NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
|
||||
if (!NMD)
|
||||
return;
|
||||
key_val_pair_t tmp;
|
||||
for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
|
||||
const MDNode *elem = NMD->getOperand(i);
|
||||
|
||||
Value *entity = elem->getOperand(0);
|
||||
// entity may be null due to DCE
|
||||
if (!entity)
|
||||
continue;
|
||||
if (entity != gv)
|
||||
continue;
|
||||
|
||||
// accumulate annotations for entity in tmp
|
||||
cacheAnnotationFromMD(elem, tmp);
|
||||
}
|
||||
|
||||
if (tmp.empty()) // no annotations for this gv
|
||||
return;
|
||||
|
||||
if ((*annotationCache).find(m) != (*annotationCache).end())
|
||||
(*annotationCache)[m][gv] = tmp;
|
||||
else {
|
||||
global_val_annot_t tmp1;
|
||||
tmp1[gv] = tmp;
|
||||
(*annotationCache)[m] = tmp1;
|
||||
}
|
||||
}
|
||||
|
||||
bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
|
||||
unsigned &retval) {
|
||||
const Module *m = gv->getParent();
|
||||
if ((*annotationCache).find(m) == (*annotationCache).end())
|
||||
cacheAnnotationFromMD(m, gv);
|
||||
else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
|
||||
cacheAnnotationFromMD(m, gv);
|
||||
if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
|
||||
return false;
|
||||
retval = (*annotationCache)[m][gv][prop][0];
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
|
||||
std::vector<unsigned> &retval) {
|
||||
const Module *m = gv->getParent();
|
||||
if ((*annotationCache).find(m) == (*annotationCache).end())
|
||||
cacheAnnotationFromMD(m, gv);
|
||||
else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
|
||||
cacheAnnotationFromMD(m, gv);
|
||||
if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
|
||||
return false;
|
||||
retval = (*annotationCache)[m][gv][prop];
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llvm::isTexture(const llvm::Value &val) {
|
||||
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
|
||||
unsigned annot;
|
||||
if (llvm::findOneNVVMAnnotation(gv,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
|
||||
annot)) {
|
||||
assert((annot == 1) && "Unexpected annotation on a texture symbol");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isSurface(const llvm::Value &val) {
|
||||
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
|
||||
unsigned annot;
|
||||
if (llvm::findOneNVVMAnnotation(gv,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
|
||||
annot)) {
|
||||
assert((annot == 1) && "Unexpected annotation on a surface symbol");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isSampler(const llvm::Value &val) {
|
||||
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
|
||||
unsigned annot;
|
||||
if (llvm::findOneNVVMAnnotation(gv,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
|
||||
annot)) {
|
||||
assert((annot == 1) && "Unexpected annotation on a sampler symbol");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (const Argument *arg = dyn_cast<Argument>(&val)) {
|
||||
const Function *func = arg->getParent();
|
||||
std::vector<unsigned> annot;
|
||||
if (llvm::findAllNVVMAnnotation(func,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
|
||||
annot)) {
|
||||
if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isImageReadOnly(const llvm::Value &val) {
|
||||
if (const Argument *arg = dyn_cast<Argument>(&val)) {
|
||||
const Function *func = arg->getParent();
|
||||
std::vector<unsigned> annot;
|
||||
if (llvm::findAllNVVMAnnotation(func,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
|
||||
annot)) {
|
||||
if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isImageWriteOnly(const llvm::Value &val) {
|
||||
if (const Argument *arg = dyn_cast<Argument>(&val)) {
|
||||
const Function *func = arg->getParent();
|
||||
std::vector<unsigned> annot;
|
||||
if (llvm::findAllNVVMAnnotation(func,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
|
||||
annot)) {
|
||||
if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isImage(const llvm::Value &val) {
|
||||
return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
|
||||
}
|
||||
|
||||
std::string llvm::getTextureName(const llvm::Value &val) {
|
||||
assert(val.hasName() && "Found texture variable with no name");
|
||||
return val.getName();
|
||||
}
|
||||
|
||||
std::string llvm::getSurfaceName(const llvm::Value &val) {
|
||||
assert(val.hasName() && "Found surface variable with no name");
|
||||
return val.getName();
|
||||
}
|
||||
|
||||
std::string llvm::getSamplerName(const llvm::Value &val) {
|
||||
assert(val.hasName() && "Found sampler variable with no name");
|
||||
return val.getName();
|
||||
}
|
||||
|
||||
bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X],
|
||||
x));
|
||||
}
|
||||
|
||||
bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y],
|
||||
y));
|
||||
}
|
||||
|
||||
bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z],
|
||||
z));
|
||||
}
|
||||
|
||||
bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X],
|
||||
x));
|
||||
}
|
||||
|
||||
bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y],
|
||||
y));
|
||||
}
|
||||
|
||||
bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z],
|
||||
z));
|
||||
}
|
||||
|
||||
bool llvm::getMinCTASm(const Function &F, unsigned &x) {
|
||||
return (llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM],
|
||||
x));
|
||||
}
|
||||
|
||||
bool llvm::isKernelFunction(const Function &F) {
|
||||
unsigned x = 0;
|
||||
bool retval = llvm::findOneNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION],
|
||||
x);
|
||||
if (retval == false) {
|
||||
// There is no NVVM metadata, check the calling convention
|
||||
if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
return (x==1);
|
||||
}
|
||||
|
||||
bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
|
||||
std::vector<unsigned> Vs;
|
||||
bool retval = llvm::findAllNVVMAnnotation(&F,
|
||||
llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN],
|
||||
Vs);
|
||||
if (retval == false)
|
||||
return false;
|
||||
for (int i=0, e=Vs.size(); i<e; i++) {
|
||||
unsigned v = Vs[i];
|
||||
if ( (v >> 16) == index ) {
|
||||
align = v & 0xFFFF;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
|
||||
if (MDNode *alignNode = I.getMetadata("callalign")) {
|
||||
for (int i=0, n = alignNode->getNumOperands();
|
||||
i<n; i++) {
|
||||
if (const ConstantInt *CI =
|
||||
dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
|
||||
unsigned v = CI->getZExtValue();
|
||||
if ( (v>>16) == index ) {
|
||||
align = v & 0xFFFF;
|
||||
return true;
|
||||
}
|
||||
if ( (v>>16) > index ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
|
||||
if ((id == Intrinsic::nvvm_barrier0) ||
|
||||
(id == Intrinsic::nvvm_barrier0_popc) ||
|
||||
(id == Intrinsic::nvvm_barrier0_and) ||
|
||||
(id == Intrinsic::nvvm_barrier0_or) ||
|
||||
(id == Intrinsic::cuda_syncthreads))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Interface for checking all memory space transfer related intrinsics
|
||||
bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
|
||||
if (id == Intrinsic::nvvm_ptr_local_to_gen ||
|
||||
id == Intrinsic::nvvm_ptr_shared_to_gen ||
|
||||
id == Intrinsic::nvvm_ptr_global_to_gen ||
|
||||
id == Intrinsic::nvvm_ptr_constant_to_gen ||
|
||||
id == Intrinsic::nvvm_ptr_gen_to_global ||
|
||||
id == Intrinsic::nvvm_ptr_gen_to_shared ||
|
||||
id == Intrinsic::nvvm_ptr_gen_to_local ||
|
||||
id == Intrinsic::nvvm_ptr_gen_to_constant ||
|
||||
id == Intrinsic::nvvm_ptr_gen_to_param) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// consider several special intrinsics in striping pointer casts, and
|
||||
// provide an option to ignore GEP indicies for find out the base address only
|
||||
// which could be used in simple alias disambigurate.
|
||||
const Value *llvm::skipPointerTransfer(const Value *V,
|
||||
bool ignore_GEP_indices) {
|
||||
V = V->stripPointerCasts();
|
||||
while (true) {
|
||||
if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
|
||||
if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
|
||||
V = IS->getArgOperand(0)->stripPointerCasts();
|
||||
continue;
|
||||
}
|
||||
} else if (ignore_GEP_indices)
|
||||
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
|
||||
V = GEP->getPointerOperand()->stripPointerCasts();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return V;
|
||||
}
|
||||
|
||||
// consider several special intrinsics in striping pointer casts, and
|
||||
// - ignore GEP indicies for find out the base address only, and
|
||||
// - tracking PHINode
|
||||
// which could be used in simple alias disambigurate.
|
||||
const Value *llvm::skipPointerTransfer(const Value *V,
|
||||
std::set<const Value *> &processed) {
|
||||
if (processed.find(V) != processed.end())
|
||||
return NULL;
|
||||
processed.insert(V);
|
||||
|
||||
const Value *V2 = V->stripPointerCasts();
|
||||
if (V2 != V && processed.find(V2) != processed.end())
|
||||
return NULL;
|
||||
processed.insert(V2);
|
||||
|
||||
V = V2;
|
||||
|
||||
while (true) {
|
||||
if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
|
||||
if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
|
||||
V = IS->getArgOperand(0)->stripPointerCasts();
|
||||
continue;
|
||||
}
|
||||
} else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
|
||||
V = GEP->getPointerOperand()->stripPointerCasts();
|
||||
continue;
|
||||
} else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
|
||||
if (V != V2 && processed.find(V) != processed.end())
|
||||
return NULL;
|
||||
processed.insert(PN);
|
||||
const Value *common = 0;
|
||||
for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
|
||||
const Value *pv = PN->getIncomingValue(i);
|
||||
const Value *base = skipPointerTransfer(pv, processed);
|
||||
if (base) {
|
||||
if (common == 0)
|
||||
common = base;
|
||||
else if (common != base)
|
||||
return PN;
|
||||
}
|
||||
}
|
||||
if (common == 0)
|
||||
return PN;
|
||||
V = common;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return V;
|
||||
}
|
||||
|
||||
|
||||
// The following are some useful utilities for debuggung
|
||||
|
||||
BasicBlock *llvm::getParentBlock(Value *v) {
|
||||
if (BasicBlock *B = dyn_cast<BasicBlock>(v))
|
||||
return B;
|
||||
|
||||
if (Instruction *I = dyn_cast<Instruction>(v))
|
||||
return I->getParent();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Function *llvm::getParentFunction(Value *v) {
|
||||
if (Function *F = dyn_cast<Function>(v))
|
||||
return F;
|
||||
|
||||
if (Instruction *I = dyn_cast<Instruction>(v))
|
||||
return I->getParent()->getParent();
|
||||
|
||||
if (BasicBlock *B = dyn_cast<BasicBlock>(v))
|
||||
return B->getParent();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Dump a block by name
|
||||
void llvm::dumpBlock(Value *v, char *blockName) {
|
||||
Function *F = getParentFunction(v);
|
||||
if (F == 0)
|
||||
return;
|
||||
|
||||
for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
|
||||
BasicBlock *B = it;
|
||||
if (strcmp(B->getName().data(), blockName) == 0) {
|
||||
B->dump();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find an instruction by name
|
||||
Instruction *llvm::getInst(Value *base, char *instName) {
|
||||
Function *F = getParentFunction(base);
|
||||
if (F == 0)
|
||||
return 0;
|
||||
|
||||
for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
|
||||
Instruction *I = &*it;
|
||||
if (strcmp(I->getName().data(), instName) == 0) {
|
||||
return I;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Dump an instruction by nane
|
||||
void llvm::dumpInst(Value *base, char *instName) {
|
||||
Instruction *I = getInst(base, instName);
|
||||
if (I)
|
||||
I->dump();
|
||||
}
|
||||
|
||||
// Dump an instruction and all dependent instructions
|
||||
void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
|
||||
if (Instruction *I = dyn_cast<Instruction>(v)) {
|
||||
|
||||
if (visited->find(I) != visited->end())
|
||||
return;
|
||||
|
||||
visited->insert(I);
|
||||
|
||||
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
|
||||
dumpInstRec(I->getOperand(i), visited);
|
||||
|
||||
I->dump();
|
||||
}
|
||||
}
|
||||
|
||||
// Dump an instruction and all dependent instructions
|
||||
void llvm::dumpInstRec(Value *v) {
|
||||
std::set<Instruction *> visited;
|
||||
|
||||
//BasicBlock *B = getParentBlock(v);
|
||||
|
||||
dumpInstRec(v, &visited);
|
||||
}
|
||||
|
||||
// Dump the parent for Instruction, block or function
|
||||
void llvm::dumpParent(Value *v) {
|
||||
if (Instruction *I = dyn_cast<Instruction>(v)) {
|
||||
I->getParent()->dump();
|
||||
return;
|
||||
}
|
||||
|
||||
if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
|
||||
B->getParent()->dump();
|
||||
return;
|
||||
}
|
||||
|
||||
if (Function *F = dyn_cast<Function>(v)) {
|
||||
F->getParent()->dump();
|
||||
return;
|
||||
}
|
||||
}
|
94
lib/Target/NVPTX/NVPTXUtilities.h
Normal file
94
lib/Target/NVPTX/NVPTXUtilities.h
Normal file
@ -0,0 +1,94 @@
|
||||
//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declaration of the NVVM specific utility functions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTXUTILITIES_H
|
||||
#define NVPTXUTILITIES_H
|
||||
|
||||
#include "llvm/Value.h"
|
||||
#include "llvm/GlobalVariable.h"
|
||||
#include "llvm/Function.h"
|
||||
#include "llvm/IntrinsicInst.h"
|
||||
#include <cstdarg>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
|
||||
#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
|
||||
#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
|
||||
|
||||
bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
|
||||
bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
|
||||
std::vector<unsigned> &);
|
||||
|
||||
bool isTexture(const llvm::Value &);
|
||||
bool isSurface(const llvm::Value &);
|
||||
bool isSampler(const llvm::Value &);
|
||||
bool isImage(const llvm::Value &);
|
||||
bool isImageReadOnly(const llvm::Value &);
|
||||
bool isImageWriteOnly(const llvm::Value &);
|
||||
|
||||
std::string getTextureName(const llvm::Value &);
|
||||
std::string getSurfaceName(const llvm::Value &);
|
||||
std::string getSamplerName(const llvm::Value &);
|
||||
|
||||
bool getMaxNTIDx(const llvm::Function &, unsigned &);
|
||||
bool getMaxNTIDy(const llvm::Function &, unsigned &);
|
||||
bool getMaxNTIDz(const llvm::Function &, unsigned &);
|
||||
|
||||
bool getReqNTIDx(const llvm::Function &, unsigned &);
|
||||
bool getReqNTIDy(const llvm::Function &, unsigned &);
|
||||
bool getReqNTIDz(const llvm::Function &, unsigned &);
|
||||
|
||||
bool getMinCTASm(const llvm::Function &, unsigned &);
|
||||
bool isKernelFunction(const llvm::Function &);
|
||||
|
||||
bool getAlign(const llvm::Function &, unsigned index, unsigned &);
|
||||
bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
|
||||
|
||||
bool isBarrierIntrinsic(llvm::Intrinsic::ID);
|
||||
|
||||
/// make_vector - Helper function which is useful for building temporary vectors
|
||||
/// to pass into type construction of CallInst ctors. This turns a null
|
||||
/// terminated list of pointers (or other value types) into a real live vector.
|
||||
///
|
||||
template<typename T>
|
||||
inline std::vector<T> make_vector(T A, ...) {
|
||||
va_list Args;
|
||||
va_start(Args, A);
|
||||
std::vector<T> Result;
|
||||
Result.push_back(A);
|
||||
while (T Val = va_arg(Args, T))
|
||||
Result.push_back(Val);
|
||||
va_end(Args);
|
||||
return Result;
|
||||
}
|
||||
|
||||
bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
|
||||
const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
|
||||
const Value *skipPointerTransfer(const Value *V,
|
||||
std::set<const Value *> &processed);
|
||||
BasicBlock *getParentBlock(Value *v);
|
||||
Function *getParentFunction(Value *v);
|
||||
void dumpBlock(Value *v, char *blockName);
|
||||
Instruction *getInst(Value *base, char *instName);
|
||||
void dumpInst(Value *base, char *instName);
|
||||
void dumpInstRec(Value *v, std::set<Instruction *> *visited);
|
||||
void dumpInstRec(Value *v);
|
||||
void dumpParent(Value *v);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
1481
lib/Target/NVPTX/NVPTXVector.td
Normal file
1481
lib/Target/NVPTX/NVPTXVector.td
Normal file
File diff suppressed because it is too large
Load Diff
91
lib/Target/NVPTX/NVPTXutil.cpp
Normal file
91
lib/Target/NVPTX/NVPTXutil.cpp
Normal file
@ -0,0 +1,91 @@
|
||||
//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the functions that can be used in CodeGen.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTXutil.h"
|
||||
#include "NVPTX.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace llvm {
|
||||
|
||||
bool isParamLoad(const MachineInstr *MI)
|
||||
{
|
||||
if ((MI->getOpcode() != NVPTX::LD_i32_avar) &&
|
||||
(MI->getOpcode() != NVPTX::LD_i64_avar))
|
||||
return false;
|
||||
if (MI->getOperand(2).isImm() == false)
|
||||
return false;
|
||||
if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
#define DATA_MASK 0x7f
|
||||
#define DIGIT_WIDTH 7
|
||||
#define MORE_BYTES 0x80
|
||||
|
||||
static int encode_leb128(uint64_t val, int *nbytes,
|
||||
char *space, int splen)
|
||||
{
|
||||
char *a;
|
||||
char *end = space + splen;
|
||||
|
||||
a = space;
|
||||
do {
|
||||
unsigned char uc;
|
||||
|
||||
if (a >= end)
|
||||
return 1;
|
||||
uc = val & DATA_MASK;
|
||||
val >>= DIGIT_WIDTH;
|
||||
if (val != 0)
|
||||
uc |= MORE_BYTES;
|
||||
*a = uc;
|
||||
a++;
|
||||
} while (val);
|
||||
*nbytes = a - space;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#undef DATA_MASK
|
||||
#undef DIGIT_WIDTH
|
||||
#undef MORE_BYTES
|
||||
|
||||
uint64_t encode_leb128(const char *str)
|
||||
{
|
||||
union { uint64_t x; char a[8]; } temp64;
|
||||
|
||||
temp64.x = 0;
|
||||
|
||||
for (unsigned i=0,e=strlen(str); i!=e; ++i)
|
||||
temp64.a[i] = str[e-1-i];
|
||||
|
||||
char encoded[16];
|
||||
int nbytes;
|
||||
|
||||
int retval = encode_leb128(temp64.x, &nbytes, encoded, 16);
|
||||
|
||||
assert(retval == 0 &&
|
||||
"Encoding to leb128 failed");
|
||||
|
||||
assert(nbytes <= 8 &&
|
||||
"Cannot support register names with leb128 encoding > 8 bytes");
|
||||
|
||||
temp64.x = 0;
|
||||
for (int i=0; i<nbytes; ++i)
|
||||
temp64.a[i] = encoded[i];
|
||||
|
||||
return temp64.x;
|
||||
}
|
||||
|
||||
} // end namespace llvm
|
25
lib/Target/NVPTX/NVPTXutil.h
Normal file
25
lib/Target/NVPTX/NVPTXutil.h
Normal file
@ -0,0 +1,25 @@
|
||||
//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the functions that can be used in CodeGen.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_NVPTX_UTIL_H
|
||||
#define LLVM_TARGET_NVPTX_UTIL_H
|
||||
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
|
||||
namespace llvm {
|
||||
bool isParamLoad(const MachineInstr *);
|
||||
uint64_t encode_leb128(const char *str);
|
||||
}
|
||||
|
||||
#endif
|
7
lib/Target/NVPTX/TargetInfo/CMakeLists.txt
Normal file
7
lib/Target/NVPTX/TargetInfo/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
#include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
|
||||
|
||||
add_llvm_library(LLVMNVPTXInfo
|
||||
NVPTXTargetInfo.cpp
|
||||
)
|
||||
|
||||
add_dependencies(LLVMNVPTXInfo NVPTXCommonTableGen)
|
23
lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
Normal file
23
lib/Target/NVPTX/TargetInfo/LLVMBuild.txt
Normal file
@ -0,0 +1,23 @@
|
||||
;===- ./lib/Target/NVPTX/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
|
||||
;
|
||||
; The LLVM Compiler Infrastructure
|
||||
;
|
||||
; This file is distributed under the University of Illinois Open Source
|
||||
; License. See LICENSE.TXT for details.
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
;
|
||||
; This is an LLVMBuild description file for the components in this subdirectory.
|
||||
;
|
||||
; For more information on the LLVMBuild system, please see:
|
||||
;
|
||||
; http://llvm.org/docs/LLVMBuild.html
|
||||
;
|
||||
;===------------------------------------------------------------------------===;
|
||||
|
||||
[component_0]
|
||||
type = Library
|
||||
name = NVPTXInfo
|
||||
parent = NVPTX
|
||||
required_libraries = MC Support Target
|
||||
add_to_library_groups = NVPTX
|
15
lib/Target/NVPTX/TargetInfo/Makefile
Normal file
15
lib/Target/NVPTX/TargetInfo/Makefile
Normal file
@ -0,0 +1,15 @@
|
||||
##===- lib/Target/NVPTX/TargetInfo/Makefile ----------------*- Makefile -*-===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is distributed under the University of Illinois Open Source
|
||||
# License. See LICENSE.TXT for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
LEVEL = ../../../..
|
||||
LIBRARYNAME = LLVMNVPTXInfo
|
||||
|
||||
# Hack: we need to include 'main' target directory to grab private headers
|
||||
CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
|
||||
|
||||
include $(LEVEL)/Makefile.common
|
23
lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
Normal file
23
lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "NVPTX.h"
|
||||
#include "llvm/Module.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
using namespace llvm;
|
||||
|
||||
Target llvm::TheNVPTXTarget32;
|
||||
Target llvm::TheNVPTXTarget64;
|
||||
|
||||
extern "C" void LLVMInitializeNVPTXTargetInfo() {
|
||||
RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
|
||||
"NVIDIA PTX 32-bit");
|
||||
RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
|
||||
"NVIDIA PTX 64-bit");
|
||||
}
|
1250
lib/Target/NVPTX/VectorElementize.cpp
Normal file
1250
lib/Target/NVPTX/VectorElementize.cpp
Normal file
File diff suppressed because it is too large
Load Diff
125
lib/Target/NVPTX/cl_common_defines.h
Normal file
125
lib/Target/NVPTX/cl_common_defines.h
Normal file
@ -0,0 +1,125 @@
|
||||
#ifndef __CL_COMMON_DEFINES_H__
|
||||
#define __CL_COMMON_DEFINES_H__
|
||||
// This file includes defines that are common to both kernel code and
|
||||
// the NVPTX back-end.
|
||||
|
||||
//
|
||||
// Common defines for Image intrinsics
|
||||
// Channel order
|
||||
enum {
|
||||
CLK_R = 0x10B0,
|
||||
CLK_A = 0x10B1,
|
||||
CLK_RG = 0x10B2,
|
||||
CLK_RA = 0x10B3,
|
||||
CLK_RGB = 0x10B4,
|
||||
CLK_RGBA = 0x10B5,
|
||||
CLK_BGRA = 0x10B6,
|
||||
CLK_ARGB = 0x10B7,
|
||||
|
||||
#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
|
||||
CLK_xRGB = 0x10B7,
|
||||
#endif
|
||||
|
||||
CLK_INTENSITY = 0x10B8,
|
||||
CLK_LUMINANCE = 0x10B9
|
||||
|
||||
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
|
||||
,
|
||||
CLK_Rx = 0x10BA,
|
||||
CLK_RGx = 0x10BB,
|
||||
CLK_RGBx = 0x10BC
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
typedef enum clk_channel_type {
|
||||
// valid formats for float return types
|
||||
CLK_SNORM_INT8 = 0x10D0, // four channel RGBA unorm8
|
||||
CLK_SNORM_INT16 = 0x10D1, // four channel RGBA unorm16
|
||||
CLK_UNORM_INT8 = 0x10D2, // four channel RGBA unorm8
|
||||
CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
|
||||
CLK_HALF_FLOAT = 0x10DD, // four channel RGBA half
|
||||
CLK_FLOAT = 0x10DE, // four channel RGBA float
|
||||
|
||||
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
|
||||
CLK_UNORM_SHORT_565 = 0x10D4,
|
||||
CLK_UNORM_SHORT_555 = 0x10D5,
|
||||
CLK_UNORM_INT_101010 = 0x10D6,
|
||||
#endif
|
||||
|
||||
// valid only for integer return types
|
||||
CLK_SIGNED_INT8 = 0x10D7,
|
||||
CLK_SIGNED_INT16 = 0x10D8,
|
||||
CLK_SIGNED_INT32 = 0x10D9,
|
||||
CLK_UNSIGNED_INT8 = 0x10DA,
|
||||
CLK_UNSIGNED_INT16 = 0x10DB,
|
||||
CLK_UNSIGNED_INT32 = 0x10DC,
|
||||
|
||||
// CI SPI for CPU
|
||||
__CLK_UNORM_INT8888 , // four channel ARGB unorm8
|
||||
__CLK_UNORM_INT8888R, // four channel BGRA unorm8
|
||||
|
||||
__CLK_VALID_IMAGE_TYPE_COUNT,
|
||||
__CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
|
||||
__CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
|
||||
// represent any image type
|
||||
__CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1
|
||||
}clk_channel_type;
|
||||
|
||||
typedef enum clk_sampler_type {
|
||||
__CLK_ADDRESS_BASE = 0,
|
||||
CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
|
||||
CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
|
||||
CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
|
||||
CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
|
||||
CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
|
||||
|
||||
#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
|
||||
CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
|
||||
#endif
|
||||
__CLK_ADDRESS_MASK = CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
|
||||
CLK_ADDRESS_CLAMP_TO_EDGE |
|
||||
CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
|
||||
__CLK_ADDRESS_BITS = 3, // number of bits required to
|
||||
// represent address info
|
||||
|
||||
__CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
|
||||
CLK_NORMALIZED_COORDS_FALSE = 0,
|
||||
CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
|
||||
__CLK_NORMALIZED_MASK = CLK_NORMALIZED_COORDS_FALSE |
|
||||
CLK_NORMALIZED_COORDS_TRUE,
|
||||
__CLK_NORMALIZED_BITS = 1, // number of bits required to
|
||||
// represent normalization
|
||||
|
||||
__CLK_FILTER_BASE = __CLK_NORMALIZED_BASE +
|
||||
__CLK_NORMALIZED_BITS,
|
||||
CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
|
||||
CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
|
||||
CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
|
||||
__CLK_FILTER_MASK = CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
|
||||
CLK_FILTER_ANISOTROPIC,
|
||||
__CLK_FILTER_BITS = 2, // number of bits required to
|
||||
// represent address info
|
||||
|
||||
__CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
|
||||
CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
|
||||
CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
|
||||
CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
|
||||
__CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR |
|
||||
CLK_MIP_ANISOTROPIC,
|
||||
__CLK_MIP_BITS = 2,
|
||||
|
||||
__CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
|
||||
__CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
|
||||
__CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
|
||||
|
||||
__CLK_ANISOTROPIC_RATIO_BITS = 5,
|
||||
__CLK_ANISOTROPIC_RATIO_MASK = (int) 0x80000000 >>
|
||||
(__CLK_ANISOTROPIC_RATIO_BITS-1)
|
||||
} clk_sampler_type;
|
||||
|
||||
// Memory synchronization
|
||||
#define CLK_LOCAL_MEM_FENCE (1 << 0)
|
||||
#define CLK_GLOBAL_MEM_FENCE (1 << 1)
|
||||
|
||||
#endif // __CL_COMMON_DEFINES_H__
|
202
lib/Target/NVPTX/gen-register-defs.py
Normal file
202
lib/Target/NVPTX/gen-register-defs.py
Normal file
@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
num_regs = 396
|
||||
|
||||
outFile = open('NVPTXRegisterInfo.td', 'w')
|
||||
|
||||
outFile.write('''
|
||||
//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Declarations that describe the PTX register file
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class NVPTXReg<string n> : Register<n> {
|
||||
let Namespace = "NVPTX";
|
||||
}
|
||||
|
||||
class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
|
||||
: RegisterClass <"NVPTX", regTypes, alignment, regList>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Registers
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Special Registers used as stack pointer
|
||||
def VRFrame : NVPTXReg<"%SP">;
|
||||
def VRFrameLocal : NVPTXReg<"%SPL">;
|
||||
|
||||
// Special Registers used as the stack
|
||||
def VRDepot : NVPTXReg<"%Depot">;
|
||||
''')
|
||||
|
||||
# Predicates
|
||||
outFile.write('''
|
||||
//===--- Predicate --------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i))
|
||||
|
||||
# Int8
|
||||
outFile.write('''
|
||||
//===--- 8-bit ------------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i))
|
||||
|
||||
# Int16
|
||||
outFile.write('''
|
||||
//===--- 16-bit -----------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i))
|
||||
|
||||
# Int32
|
||||
outFile.write('''
|
||||
//===--- 32-bit -----------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i))
|
||||
|
||||
# Int64
|
||||
outFile.write('''
|
||||
//===--- 64-bit -----------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i))
|
||||
|
||||
# F32
|
||||
outFile.write('''
|
||||
//===--- 32-bit float -----------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i))
|
||||
|
||||
# F64
|
||||
outFile.write('''
|
||||
//===--- 64-bit float -----------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i))
|
||||
|
||||
# Vector registers
|
||||
outFile.write('''
|
||||
//===--- Vector -----------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i))
|
||||
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i))
|
||||
|
||||
# Argument registers
|
||||
outFile.write('''
|
||||
//===--- Arguments --------------------------------------------------------===//
|
||||
''')
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def fa%d : NVPTXReg<"%%fa%d">;\n' % (i, i))
|
||||
for i in range(0, num_regs):
|
||||
outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i))
|
||||
|
||||
outFile.write('''
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Register classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
''')
|
||||
|
||||
outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('''
|
||||
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
|
||||
def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
|
||||
''')
|
||||
|
||||
outFile.write('''
|
||||
class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList,
|
||||
NVPTXRegClass sClass,
|
||||
int e,
|
||||
string n>
|
||||
: NVPTXRegClass<regTypes, alignment, regList>
|
||||
{
|
||||
NVPTXRegClass scalarClass=sClass;
|
||||
int elems=e;
|
||||
string name=n;
|
||||
}
|
||||
''')
|
||||
|
||||
|
||||
outFile.write('def V2F32Regs\n : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1))
|
||||
outFile.write('def V4F32Regs\n : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def V2I32Regs\n : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1))
|
||||
outFile.write('def V4I32Regs\n : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def V2F64Regs\n : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1))
|
||||
outFile.write('def V2I64Regs\n : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def V2I16Regs\n : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1))
|
||||
outFile.write('def V4I16Regs\n : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1))
|
||||
|
||||
outFile.write('def V2I8Regs\n : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1))
|
||||
outFile.write('def V4I8Regs\n : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1))
|
||||
|
||||
outFile.close()
|
||||
|
||||
|
||||
outFile = open('NVPTXNumRegisters.h', 'w')
|
||||
outFile.write('''
|
||||
//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef NVPTX_NUM_REGISTERS_H
|
||||
#define NVPTX_NUM_REGISTERS_H
|
||||
|
||||
namespace llvm {
|
||||
|
||||
const unsigned NVPTXNumRegisters = %d;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
''' % num_regs)
|
||||
|
||||
outFile.close()
|
@ -310,6 +310,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
|
||||
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
|
||||
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
|
||||
ptx-*) llvm_cv_target_arch="PTX" ;;
|
||||
nvptx-*) llvm_cv_target_arch="NVPTX" ;;
|
||||
*) llvm_cv_target_arch="Unknown" ;;
|
||||
esac])
|
||||
|
||||
@ -457,6 +458,7 @@ else
|
||||
Hexagon) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
MBlaze) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
PTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
NVPTX) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
*) AC_SUBST(TARGET_HAS_JIT,0) ;;
|
||||
esac
|
||||
fi
|
||||
@ -567,13 +569,13 @@ TARGETS_TO_BUILD=""
|
||||
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
|
||||
[Build specific host targets: all or target1,target2,... Valid targets are:
|
||||
host, x86, x86_64, sparc, powerpc, arm, mips, spu, hexagon,
|
||||
xcore, msp430, ptx, cbe, and cpp (default=all)]),,
|
||||
xcore, msp430, ptx, nvptx, cbe, and cpp (default=all)]),,
|
||||
enableval=all)
|
||||
if test "$enableval" = host-only ; then
|
||||
enableval=host
|
||||
fi
|
||||
case "$enableval" in
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX" ;;
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX NVPTX" ;;
|
||||
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
|
||||
case "$a_target" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -589,6 +591,7 @@ case "$enableval" in
|
||||
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
|
||||
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
|
||||
ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
host) case "$llvm_cv_target_arch" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -602,6 +605,7 @@ case "$enableval" in
|
||||
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
|
||||
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
*) AC_MSG_ERROR([Can not set target to build]) ;;
|
||||
esac ;;
|
||||
*) AC_MSG_ERROR([Unrecognized target $a_target]) ;;
|
||||
|
12
projects/sample/configure
vendored
12
projects/sample/configure
vendored
@ -1402,7 +1402,8 @@ Optional Features:
|
||||
--enable-targets Build specific host targets: all or
|
||||
target1,target2,... Valid targets are: host, x86,
|
||||
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
|
||||
xcore, msp430, ptx, cbe, and cpp (default=all)
|
||||
xcore, msp430, ptx, nvptx, cbe, and cpp
|
||||
(default=all)
|
||||
--enable-bindings Build specific language bindings:
|
||||
all,auto,none,{binding-name} (default=auto)
|
||||
--enable-libffi Check for the presence of libffi (default is NO)
|
||||
@ -3846,6 +3847,7 @@ else
|
||||
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
|
||||
mblaze-*) llvm_cv_target_arch="MBlaze" ;;
|
||||
ptx-*) llvm_cv_target_arch="PTX" ;;
|
||||
nvptx-*) llvm_cv_target_arch="NVPTX" ;;
|
||||
*) llvm_cv_target_arch="Unknown" ;;
|
||||
esac
|
||||
fi
|
||||
@ -5069,6 +5071,8 @@ else
|
||||
MBlaze) TARGET_HAS_JIT=0
|
||||
;;
|
||||
PTX) TARGET_HAS_JIT=0
|
||||
;;
|
||||
NVPTX) TARGET_HAS_JIT=0
|
||||
;;
|
||||
*) TARGET_HAS_JIT=0
|
||||
;;
|
||||
@ -5254,7 +5258,7 @@ if test "$enableval" = host-only ; then
|
||||
enableval=host
|
||||
fi
|
||||
case "$enableval" in
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX" ;;
|
||||
all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips CellSPU XCore MSP430 Hexagon CppBackend MBlaze PTX NVPTX" ;;
|
||||
*)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
|
||||
case "$a_target" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -5270,6 +5274,7 @@ case "$enableval" in
|
||||
cpp) TARGETS_TO_BUILD="CppBackend $TARGETS_TO_BUILD" ;;
|
||||
mblaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;;
|
||||
ptx) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
host) case "$llvm_cv_target_arch" in
|
||||
x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
|
||||
@ -5283,6 +5288,7 @@ case "$enableval" in
|
||||
MSP430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
|
||||
Hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;;
|
||||
PTX) TARGETS_TO_BUILD="PTX $TARGETS_TO_BUILD" ;;
|
||||
NVPTX) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;;
|
||||
*) { { echo "$as_me:$LINENO: error: Can not set target to build" >&5
|
||||
echo "$as_me: error: Can not set target to build" >&2;}
|
||||
{ (exit 1); exit 1; }; } ;;
|
||||
@ -10307,7 +10313,7 @@ else
|
||||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<EOF
|
||||
#line 10303 "configure"
|
||||
#line 10316 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
|
55
test/CodeGen/NVPTX/annotations.ll
Normal file
55
test/CodeGen/NVPTX/annotations.ll
Normal file
@ -0,0 +1,55 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
|
||||
@texture = internal addrspace(1) global i64 0, align 8
|
||||
; CHECK: .global .texref texture
|
||||
@surface = internal addrspace(1) global i64 0, align 8
|
||||
; CHECK: .global .surfref surface
|
||||
|
||||
|
||||
; CHECK: .entry kernel_func_maxntid
|
||||
define void @kernel_func_maxntid(float* %a) {
|
||||
; CHECK: .maxntid 10, 20, 30
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: .entry kernel_func_reqntid
|
||||
define void @kernel_func_reqntid(float* %a) {
|
||||
; CHECK: .reqntid 11, 22, 33
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: .entry kernel_func_minctasm
|
||||
define void @kernel_func_minctasm(float* %a) {
|
||||
; CHECK: .minnctapersm 42
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
!nvvm.annotations = !{!1, !2, !3, !4, !5, !6, !7, !8}
|
||||
|
||||
!1 = metadata !{void (float*)* @kernel_func_maxntid, metadata !"kernel", i32 1}
|
||||
!2 = metadata !{void (float*)* @kernel_func_maxntid,
|
||||
metadata !"maxntidx", i32 10,
|
||||
metadata !"maxntidy", i32 20,
|
||||
metadata !"maxntidz", i32 30}
|
||||
|
||||
!3 = metadata !{void (float*)* @kernel_func_reqntid, metadata !"kernel", i32 1}
|
||||
!4 = metadata !{void (float*)* @kernel_func_reqntid,
|
||||
metadata !"reqntidx", i32 11,
|
||||
metadata !"reqntidy", i32 22,
|
||||
metadata !"reqntidz", i32 33}
|
||||
|
||||
!5 = metadata !{void (float*)* @kernel_func_minctasm, metadata !"kernel", i32 1}
|
||||
!6 = metadata !{void (float*)* @kernel_func_minctasm,
|
||||
metadata !"minctasm", i32 42}
|
||||
|
||||
!7 = metadata !{i64 addrspace(1)* @texture, metadata !"texture", i32 1}
|
||||
!8 = metadata !{i64 addrspace(1)* @surface, metadata !"surface", i32 1}
|
72
test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
Normal file
72
test/CodeGen/NVPTX/arithmetic-fp-sm10.ll
Normal file
@ -0,0 +1,72 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
|
||||
;; These tests should run for all targets
|
||||
|
||||
;;===-- Basic instruction selection tests ---------------------------------===;;
|
||||
|
||||
|
||||
;;; f64
|
||||
|
||||
define double @fadd_f64(double %a, double %b) {
|
||||
; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fadd double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fsub_f64(double %a, double %b) {
|
||||
; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fsub double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fmul_f64(double %a, double %b) {
|
||||
; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fmul double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fdiv_f64(double %a, double %b) {
|
||||
; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fdiv double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;; PTX does not have a floating-point rem instruction
|
||||
|
||||
|
||||
;;; f32
|
||||
|
||||
define float @fadd_f32(float %a, float %b) {
|
||||
; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fadd float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fsub_f32(float %a, float %b) {
|
||||
; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fsub float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fmul_f32(float %a, float %b) {
|
||||
; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fmul float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fdiv_f32(float %a, float %b) {
|
||||
; CHECK: div.full.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fdiv float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;; PTX does not have a floating-point rem instruction
|
72
test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
Normal file
72
test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
Normal file
@ -0,0 +1,72 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
;; These tests should run for all targets
|
||||
|
||||
;;===-- Basic instruction selection tests ---------------------------------===;;
|
||||
|
||||
|
||||
;;; f64
|
||||
|
||||
define double @fadd_f64(double %a, double %b) {
|
||||
; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fadd double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fsub_f64(double %a, double %b) {
|
||||
; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fsub double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fmul_f64(double %a, double %b) {
|
||||
; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fmul double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define double @fdiv_f64(double %a, double %b) {
|
||||
; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fdiv double %a, %b
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;; PTX does not have a floating-point rem instruction
|
||||
|
||||
|
||||
;;; f32
|
||||
|
||||
define float @fadd_f32(float %a, float %b) {
|
||||
; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fadd float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fsub_f32(float %a, float %b) {
|
||||
; CHECK: sub.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fsub float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fmul_f32(float %a, float %b) {
|
||||
; CHECK: mul.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fmul float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define float @fdiv_f32(float %a, float %b) {
|
||||
; CHECK: div.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = fdiv float %a, %b
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
;; PTX does not have a floating-point rem instruction
|
295
test/CodeGen/NVPTX/arithmetic-int.ll
Normal file
295
test/CodeGen/NVPTX/arithmetic-int.ll
Normal file
@ -0,0 +1,295 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
;; These tests should run for all targets
|
||||
|
||||
;;===-- Basic instruction selection tests ---------------------------------===;;
|
||||
|
||||
|
||||
;;; i64
|
||||
|
||||
define i64 @add_i64(i64 %a, i64 %b) {
|
||||
; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = add i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @sub_i64(i64 %a, i64 %b) {
|
||||
; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sub i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @mul_i64(i64 %a, i64 %b) {
|
||||
; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = mul i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @sdiv_i64(i64 %a, i64 %b) {
|
||||
; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sdiv i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @udiv_i64(i64 %a, i64 %b) {
|
||||
; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = udiv i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @srem_i64(i64 %a, i64 %b) {
|
||||
; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = srem i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @urem_i64(i64 %a, i64 %b) {
|
||||
; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = urem i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @and_i64(i64 %a, i64 %b) {
|
||||
; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = and i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @or_i64(i64 %a, i64 %b) {
|
||||
; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = or i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @xor_i64(i64 %a, i64 %b) {
|
||||
; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = xor i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @shl_i64(i64 %a, i64 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = shl i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @ashr_i64(i64 %a, i64 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = ashr i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @lshr_i64(i64 %a, i64 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = lshr i64 %a, %b
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
|
||||
;;; i32
|
||||
|
||||
define i32 @add_i32(i32 %a, i32 %b) {
|
||||
; CHECK: add.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = add i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @sub_i32(i32 %a, i32 %b) {
|
||||
; CHECK: sub.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sub i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @mul_i32(i32 %a, i32 %b) {
|
||||
; CHECK: mul.lo.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = mul i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @sdiv_i32(i32 %a, i32 %b) {
|
||||
; CHECK: div.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sdiv i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @udiv_i32(i32 %a, i32 %b) {
|
||||
; CHECK: div.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = udiv i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @srem_i32(i32 %a, i32 %b) {
|
||||
; CHECK: rem.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = srem i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @urem_i32(i32 %a, i32 %b) {
|
||||
; CHECK: rem.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = urem i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @and_i32(i32 %a, i32 %b) {
|
||||
; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = and i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @or_i32(i32 %a, i32 %b) {
|
||||
; CHECK: or.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = or i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @xor_i32(i32 %a, i32 %b) {
|
||||
; CHECK: xor.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = xor i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @shl_i32(i32 %a, i32 %b) {
|
||||
; CHECK: shl.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = shl i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @ashr_i32(i32 %a, i32 %b) {
|
||||
; CHECK: shr.s32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = ashr i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @lshr_i32(i32 %a, i32 %b) {
|
||||
; CHECK: shr.u32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = lshr i32 %a, %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;; i16
|
||||
|
||||
define i16 @add_i16(i16 %a, i16 %b) {
|
||||
; CHECK: add.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = add i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @sub_i16(i16 %a, i16 %b) {
|
||||
; CHECK: sub.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sub i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @mul_i16(i16 %a, i16 %b) {
|
||||
; CHECK: mul.lo.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = mul i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @sdiv_i16(i16 %a, i16 %b) {
|
||||
; CHECK: div.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = sdiv i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @udiv_i16(i16 %a, i16 %b) {
|
||||
; CHECK: div.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = udiv i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @srem_i16(i16 %a, i16 %b) {
|
||||
; CHECK: rem.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = srem i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @urem_i16(i16 %a, i16 %b) {
|
||||
; CHECK: rem.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = urem i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @and_i16(i16 %a, i16 %b) {
|
||||
; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = and i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @or_i16(i16 %a, i16 %b) {
|
||||
; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = or i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @xor_i16(i16 %a, i16 %b) {
|
||||
; CHECK: xor.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = xor i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @shl_i16(i16 %a, i16 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shl.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = shl i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @ashr_i16(i16 %a, i16 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shr.s16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = ashr i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @lshr_i16(i16 %a, i16 %b) {
|
||||
; PTX requires 32-bit shift amount
|
||||
; CHECK: shr.u16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%ret = lshr i16 %a, %b
|
||||
ret i16 %ret
|
||||
}
|
32
test/CodeGen/NVPTX/calling-conv.ll
Normal file
32
test/CodeGen/NVPTX/calling-conv.ll
Normal file
@ -0,0 +1,32 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
|
||||
;; Kernel function using ptx_kernel calling conv
|
||||
|
||||
; CHECK: .entry kernel_func
|
||||
define ptx_kernel void @kernel_func(float* %a) {
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
;; Device function
|
||||
; CHECK: .func device_func
|
||||
define void @device_func(float* %a) {
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
;; Kernel function using NVVM metadata
|
||||
; CHECK: .entry metadata_kernel
|
||||
define void @metadata_kernel(float* %a) {
|
||||
; CHECK: ret
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
!nvvm.annotations = !{!1}
|
||||
|
||||
!1 = metadata !{void (float*)* @metadata_kernel, metadata !"kernel", i32 1}
|
389
test/CodeGen/NVPTX/compare-int.ll
Normal file
389
test/CodeGen/NVPTX/compare-int.ll
Normal file
@ -0,0 +1,389 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
;; These tests should run for all targets
|
||||
|
||||
;;===-- Basic instruction selection tests ---------------------------------===;;
|
||||
|
||||
|
||||
;;; i64
|
||||
|
||||
define i64 @icmp_eq_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp eq i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_ne_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ne i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ugt i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_uge_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp uge i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_ult_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ult i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_ule_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ule i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sgt i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_sge_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sge i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_slt_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp slt i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
define i64 @icmp_sle_i64(i64 %a, i64 %b) {
|
||||
; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sle i64 %a, %b
|
||||
%ret = zext i1 %cmp to i64
|
||||
ret i64 %ret
|
||||
}
|
||||
|
||||
;;; i32
|
||||
|
||||
define i32 @icmp_eq_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.eq.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp eq i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_ne_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.ne.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ne i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_ugt_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ugt i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_uge_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp uge i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_ult_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ult i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_ule_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ule i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_sgt_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sgt i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_sge_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sge i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_slt_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp slt i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @icmp_sle_i32(i32 %a, i32 %b) {
|
||||
; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sle i32 %a, %b
|
||||
%ret = zext i1 %cmp to i32
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;; i16
|
||||
|
||||
define i16 @icmp_eq_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp eq i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_ne_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ne i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_ugt_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ugt i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_uge_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp uge i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_ult_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ult i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_ule_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ule i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_sgt_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sgt i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_sge_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sge i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_slt_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp slt i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
define i16 @icmp_sle_i16(i16 %a, i16 %b) {
|
||||
; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sle i16 %a, %b
|
||||
%ret = zext i1 %cmp to i16
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
|
||||
;;; i8
|
||||
|
||||
define i8 @icmp_eq_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp eq i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_ne_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ne i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_ugt_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ugt i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_uge_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp uge i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_ult_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ult i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_ule_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp ule i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_sgt_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sgt i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_sge_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sge i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_slt_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp slt i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
define i8 @icmp_sle_i8(i8 %a, i8 %b) {
|
||||
; Comparison happens in 16-bit
|
||||
; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}}
|
||||
; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]]
|
||||
; CHECK: ret
|
||||
%cmp = icmp sle i8 %a, %b
|
||||
%ret = zext i1 %cmp to i8
|
||||
ret i8 %ret
|
||||
}
|
146
test/CodeGen/NVPTX/convert-fp.ll
Normal file
146
test/CodeGen/NVPTX/convert-fp.ll
Normal file
@ -0,0 +1,146 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
|
||||
define i16 @cvt_i16_f32(float %x) {
|
||||
; CHECK: cvt.rzi.u16.f32 %rs{{[0-9]+}}, %f{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui float %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i16 @cvt_i16_f64(double %x) {
|
||||
; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui double %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i32 @cvt_i32_f32(float %x) {
|
||||
; CHECK: cvt.rzi.u32.f32 %r{{[0-9]+}}, %f{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui float %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
define i32 @cvt_i32_f64(double %x) {
|
||||
; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui double %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
|
||||
define i64 @cvt_i64_f32(float %x) {
|
||||
; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui float %x to i64
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define i64 @cvt_i64_f64(double %x) {
|
||||
; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptoui double %x to i64
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_i16(i16 %x) {
|
||||
; CHECK: cvt.rn.f32.u16 %f{{[0-9]+}}, %rs{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i16 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_i32(i32 %x) {
|
||||
; CHECK: cvt.rn.f32.u32 %f{{[0-9]+}}, %r{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i32 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_i64(i64 %x) {
|
||||
; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i64 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_f64(double %x) {
|
||||
; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fptrunc double %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_s16(i16 %x) {
|
||||
; CHECK: cvt.rn.f32.s16 %f{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i16 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_s32(i32 %x) {
|
||||
; CHECK: cvt.rn.f32.s32 %f{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i32 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @cvt_f32_s64(i64 %x) {
|
||||
; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i64 %x to float
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_i16(i16 %x) {
|
||||
; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i16 %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_i32(i32 %x) {
|
||||
; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i32 %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_i64(i64 %x) {
|
||||
; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = uitofp i64 %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_f32(float %x) {
|
||||
; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fpext float %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_s16(i16 %x) {
|
||||
; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i16 %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_s32(i32 %x) {
|
||||
; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i32 %x to double
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @cvt_f64_s64(i64 %x) {
|
||||
; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = sitofp i64 %x to double
|
||||
ret double %a
|
||||
}
|
55
test/CodeGen/NVPTX/convert-int-sm10.ll
Normal file
55
test/CodeGen/NVPTX/convert-int-sm10.ll
Normal file
@ -0,0 +1,55 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
|
||||
|
||||
; i16
|
||||
|
||||
define i16 @cvt_i16_i32(i32 %x) {
|
||||
; CHECK: cvt.u16.u32 %rs{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = trunc i32 %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i16 @cvt_i16_i64(i64 %x) {
|
||||
; CHECK: cvt.u16.u64 %rs{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
|
||||
|
||||
; i32
|
||||
|
||||
define i32 @cvt_i32_i16(i16 %x) {
|
||||
; CHECK: cvt.u32.u16 %r{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
define i32 @cvt_i32_i64(i64 %x) {
|
||||
; CHECK: cvt.u32.u64 %r{{[0-9]+}}, %rl{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
|
||||
|
||||
; i64
|
||||
|
||||
define i64 @cvt_i64_i16(i16 %x) {
|
||||
; CHECK: cvt.u64.u16 %rl{{[0-9]+}}, %rs{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i64
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define i64 @cvt_i64_i32(i32 %x) {
|
||||
; CHECK: cvt.u64.u32 %rl{{[0-9]+}}, %r{{[0-9]+}}
|
||||
; CHECK: ret
|
||||
%a = zext i32 %x to i64
|
||||
ret i64 %a
|
||||
}
|
64
test/CodeGen/NVPTX/convert-int-sm20.ll
Normal file
64
test/CodeGen/NVPTX/convert-int-sm20.ll
Normal file
@ -0,0 +1,64 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
|
||||
;; Integer conversions happen inplicitly by loading/storing the proper types
|
||||
|
||||
|
||||
; i16
|
||||
|
||||
define i16 @cvt_i16_i32(i32 %x) {
|
||||
; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i32 %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i16 @cvt_i16_i64(i64 %x) {
|
||||
; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i16
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
|
||||
|
||||
; i32
|
||||
|
||||
define i32 @cvt_i32_i16(i16 %x) {
|
||||
; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
define i32 @cvt_i32_i64(i64 %x) {
|
||||
; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i32
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
|
||||
|
||||
; i64
|
||||
|
||||
define i64 @cvt_i64_i16(i16 %x) {
|
||||
; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i64
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define i64 @cvt_i64_i32(i32 %x) {
|
||||
; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i32 %x to i64
|
||||
ret i64 %a
|
||||
}
|
24
test/CodeGen/NVPTX/fma-disable.ll
Normal file
24
test/CodeGen/NVPTX/fma-disable.ll
Normal file
@ -0,0 +1,24 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=1 | FileCheck %s -check-prefix=FMA
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -nvptx-fma-level=0 | FileCheck %s -check-prefix=MUL
|
||||
|
||||
define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
|
||||
entry:
|
||||
; FMA: fma.rn.f32
|
||||
; MUL: mul.rn.f32
|
||||
; MUL: add.rn.f32
|
||||
%a = fmul float %x, %y
|
||||
%b = fadd float %a, %z
|
||||
ret float %b
|
||||
}
|
||||
|
||||
define ptx_device double @test_mul_add_d(double %x, double %y, double %z) {
|
||||
entry:
|
||||
; FMA: fma.rn.f64
|
||||
; MUL: mul.rn.f64
|
||||
; MUL: add.rn.f64
|
||||
%a = fmul double %x, %y
|
||||
%b = fadd double %a, %z
|
||||
ret double %b
|
||||
}
|
17
test/CodeGen/NVPTX/fma.ll
Normal file
17
test/CodeGen/NVPTX/fma.ll
Normal file
@ -0,0 +1,17 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
define ptx_device float @t1_f32(float %x, float %y, float %z) {
|
||||
; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fmul float %x, %y
|
||||
%b = fadd float %a, %z
|
||||
ret float %b
|
||||
}
|
||||
|
||||
define ptx_device double @t1_f64(double %x, double %y, double %z) {
|
||||
; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
|
||||
; CHECK: ret;
|
||||
%a = fmul double %x, %y
|
||||
%b = fadd double %a, %z
|
||||
ret double %b
|
||||
}
|
284
test/CodeGen/NVPTX/intrinsic-old.ll
Normal file
284
test/CodeGen/NVPTX/intrinsic-old.ll
Normal file
@ -0,0 +1,284 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
define ptx_device i32 @test_tid_x() {
|
||||
; CHECK: mov.u32 %r0, %tid.x;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.tid.x()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_tid_y() {
|
||||
; CHECK: mov.u32 %r0, %tid.y;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.tid.y()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_tid_z() {
|
||||
; CHECK: mov.u32 %r0, %tid.z;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.tid.z()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_tid_w() {
|
||||
; CHECK: mov.u32 %r0, %tid.w;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.tid.w()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ntid_x() {
|
||||
; CHECK: mov.u32 %r0, %ntid.x;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ntid.x()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ntid_y() {
|
||||
; CHECK: mov.u32 %r0, %ntid.y;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ntid.y()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ntid_z() {
|
||||
; CHECK: mov.u32 %r0, %ntid.z;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ntid.z()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ntid_w() {
|
||||
; CHECK: mov.u32 %r0, %ntid.w;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ntid.w()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_laneid() {
|
||||
; CHECK: mov.u32 %r0, %laneid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.laneid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_warpid() {
|
||||
; CHECK: mov.u32 %r0, %warpid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.warpid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nwarpid() {
|
||||
; CHECK: mov.u32 %r0, %nwarpid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nwarpid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ctaid_x() {
|
||||
; CHECK: mov.u32 %r0, %ctaid.x;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ctaid.x()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ctaid_y() {
|
||||
; CHECK: mov.u32 %r0, %ctaid.y;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ctaid.y()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ctaid_z() {
|
||||
; CHECK: mov.u32 %r0, %ctaid.z;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ctaid.z()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_ctaid_w() {
|
||||
; CHECK: mov.u32 %r0, %ctaid.w;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.ctaid.w()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nctaid_x() {
|
||||
; CHECK: mov.u32 %r0, %nctaid.x;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nctaid.x()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nctaid_y() {
|
||||
; CHECK: mov.u32 %r0, %nctaid.y;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nctaid.y()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nctaid_z() {
|
||||
; CHECK: mov.u32 %r0, %nctaid.z;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nctaid.z()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nctaid_w() {
|
||||
; CHECK: mov.u32 %r0, %nctaid.w;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nctaid.w()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_smid() {
|
||||
; CHECK: mov.u32 %r0, %smid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.smid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_nsmid() {
|
||||
; CHECK: mov.u32 %r0, %nsmid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.nsmid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_gridid() {
|
||||
; CHECK: mov.u32 %r0, %gridid;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.gridid()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_lanemask_eq() {
|
||||
; CHECK: mov.u32 %r0, %lanemask_eq;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.lanemask.eq()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_lanemask_le() {
|
||||
; CHECK: mov.u32 %r0, %lanemask_le;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.lanemask.le()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_lanemask_lt() {
|
||||
; CHECK: mov.u32 %r0, %lanemask_lt;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.lanemask.lt()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_lanemask_ge() {
|
||||
; CHECK: mov.u32 %r0, %lanemask_ge;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.lanemask.ge()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_lanemask_gt() {
|
||||
; CHECK: mov.u32 %r0, %lanemask_gt;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.lanemask.gt()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_clock() {
|
||||
; CHECK: mov.u32 %r0, %clock;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.clock()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i64 @test_clock64() {
|
||||
; CHECK: mov.u64 %rl0, %clock64;
|
||||
; CHECK: ret;
|
||||
%x = call i64 @llvm.ptx.read.clock64()
|
||||
ret i64 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_pm0() {
|
||||
; CHECK: mov.u32 %r0, %pm0;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.pm0()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_pm1() {
|
||||
; CHECK: mov.u32 %r0, %pm1;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.pm1()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_pm2() {
|
||||
; CHECK: mov.u32 %r0, %pm2;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.pm2()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device i32 @test_pm3() {
|
||||
; CHECK: mov.u32 %r0, %pm3;
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.ptx.read.pm3()
|
||||
ret i32 %x
|
||||
}
|
||||
|
||||
define ptx_device void @test_bar_sync() {
|
||||
; CHECK: bar.sync 0
|
||||
; CHECK: ret;
|
||||
call void @llvm.ptx.bar.sync(i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.ptx.read.tid.x()
|
||||
declare i32 @llvm.ptx.read.tid.y()
|
||||
declare i32 @llvm.ptx.read.tid.z()
|
||||
declare i32 @llvm.ptx.read.tid.w()
|
||||
declare i32 @llvm.ptx.read.ntid.x()
|
||||
declare i32 @llvm.ptx.read.ntid.y()
|
||||
declare i32 @llvm.ptx.read.ntid.z()
|
||||
declare i32 @llvm.ptx.read.ntid.w()
|
||||
|
||||
declare i32 @llvm.ptx.read.laneid()
|
||||
declare i32 @llvm.ptx.read.warpid()
|
||||
declare i32 @llvm.ptx.read.nwarpid()
|
||||
|
||||
declare i32 @llvm.ptx.read.ctaid.x()
|
||||
declare i32 @llvm.ptx.read.ctaid.y()
|
||||
declare i32 @llvm.ptx.read.ctaid.z()
|
||||
declare i32 @llvm.ptx.read.ctaid.w()
|
||||
declare i32 @llvm.ptx.read.nctaid.x()
|
||||
declare i32 @llvm.ptx.read.nctaid.y()
|
||||
declare i32 @llvm.ptx.read.nctaid.z()
|
||||
declare i32 @llvm.ptx.read.nctaid.w()
|
||||
|
||||
declare i32 @llvm.ptx.read.smid()
|
||||
declare i32 @llvm.ptx.read.nsmid()
|
||||
declare i32 @llvm.ptx.read.gridid()
|
||||
|
||||
declare i32 @llvm.ptx.read.lanemask.eq()
|
||||
declare i32 @llvm.ptx.read.lanemask.le()
|
||||
declare i32 @llvm.ptx.read.lanemask.lt()
|
||||
declare i32 @llvm.ptx.read.lanemask.ge()
|
||||
declare i32 @llvm.ptx.read.lanemask.gt()
|
||||
|
||||
declare i32 @llvm.ptx.read.clock()
|
||||
declare i64 @llvm.ptx.read.clock64()
|
||||
|
||||
declare i32 @llvm.ptx.read.pm0()
|
||||
declare i32 @llvm.ptx.read.pm1()
|
||||
declare i32 @llvm.ptx.read.pm2()
|
||||
declare i32 @llvm.ptx.read.pm3()
|
||||
|
||||
declare void @llvm.ptx.bar.sync(i32 %i)
|
173
test/CodeGen/NVPTX/ld-addrspace.ll
Normal file
173
test/CodeGen/NVPTX/ld-addrspace.ll
Normal file
@ -0,0 +1,173 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
|
||||
|
||||
|
||||
;; i8
|
||||
define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i8 addrspace(1)* %ptr
|
||||
ret i8 %a
|
||||
}
|
||||
|
||||
define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i8 addrspace(3)* %ptr
|
||||
ret i8 %a
|
||||
}
|
||||
|
||||
define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i8 addrspace(5)* %ptr
|
||||
ret i8 %a
|
||||
}
|
||||
|
||||
;; i16
|
||||
define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i16 addrspace(1)* %ptr
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i16 addrspace(3)* %ptr
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i16 addrspace(5)* %ptr
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
;; i32
|
||||
define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i32 addrspace(1)* %ptr
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i32 addrspace(3)* %ptr
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i32 addrspace(5)* %ptr
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
;; i64
|
||||
define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i64 addrspace(1)* %ptr
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i64 addrspace(3)* %ptr
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i64 addrspace(5)* %ptr
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
;; f32
|
||||
define float @ld_global_f32(float addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load float addrspace(1)* %ptr
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @ld_shared_f32(float addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load float addrspace(3)* %ptr
|
||||
ret float %a
|
||||
}
|
||||
|
||||
define float @ld_local_f32(float addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load float addrspace(5)* %ptr
|
||||
ret float %a
|
||||
}
|
||||
|
||||
;; f64
|
||||
define double @ld_global_f64(double addrspace(1)* %ptr) {
|
||||
; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load double addrspace(1)* %ptr
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @ld_shared_f64(double addrspace(3)* %ptr) {
|
||||
; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load double addrspace(3)* %ptr
|
||||
ret double %a
|
||||
}
|
||||
|
||||
define double @ld_local_f64(double addrspace(5)* %ptr) {
|
||||
; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load double addrspace(5)* %ptr
|
||||
ret double %a
|
||||
}
|
63
test/CodeGen/NVPTX/ld-generic.ll
Normal file
63
test/CodeGen/NVPTX/ld-generic.ll
Normal file
@ -0,0 +1,63 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
|
||||
|
||||
|
||||
;; i8
|
||||
define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
|
||||
; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i8 addrspace(0)* %ptr
|
||||
ret i8 %a
|
||||
}
|
||||
|
||||
;; i16
|
||||
define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
|
||||
; PTX32: ld.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i16 addrspace(0)* %ptr
|
||||
ret i16 %a
|
||||
}
|
||||
|
||||
;; i32
|
||||
define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
|
||||
; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i32 addrspace(0)* %ptr
|
||||
ret i32 %a
|
||||
}
|
||||
|
||||
;; i64
|
||||
define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
|
||||
; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load i64 addrspace(0)* %ptr
|
||||
ret i64 %a
|
||||
}
|
||||
|
||||
;; f32
|
||||
define float @ld_global_f32(float addrspace(0)* %ptr) {
|
||||
; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load float addrspace(0)* %ptr
|
||||
ret float %a
|
||||
}
|
||||
|
||||
;; f64
|
||||
define double @ld_global_f64(double addrspace(0)* %ptr) {
|
||||
; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
|
||||
; PTX32: ret
|
||||
; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
|
||||
; PTX64: ret
|
||||
%a = load double addrspace(0)* %ptr
|
||||
ret double %a
|
||||
}
|
5
test/CodeGen/NVPTX/lit.local.cfg
Normal file
5
test/CodeGen/NVPTX/lit.local.cfg
Normal file
@ -0,0 +1,5 @@
|
||||
config.suffixes = ['.ll', '.c', '.cpp']
|
||||
|
||||
targets = set(config.root.targets_to_build.split())
|
||||
if not 'NVPTX' in targets:
|
||||
config.unsupported = True
|
179
test/CodeGen/NVPTX/st-addrspace.ll
Normal file
179
test/CodeGen/NVPTX/st-addrspace.ll
Normal file
@ -0,0 +1,179 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s --check-prefix=PTX64
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
|
||||
|
||||
|
||||
;; i8
|
||||
|
||||
define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
|
||||
; PTX32: st.global.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i8 %a, i8 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
|
||||
; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i8 %a, i8 addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
|
||||
; PTX32: st.local.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i8 %a, i8 addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i16
|
||||
|
||||
define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
|
||||
; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i16 %a, i16 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
|
||||
; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i16 %a, i16 addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
|
||||
; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i16 %a, i16 addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i32
|
||||
|
||||
define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
|
||||
; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i32 %a, i32 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
|
||||
; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i32 %a, i32 addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
|
||||
; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i32 %a, i32 addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i64
|
||||
|
||||
define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) {
|
||||
; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i64 %a, i64 addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) {
|
||||
; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i64 %a, i64 addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
|
||||
; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i64 %a, i64 addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; f32
|
||||
|
||||
define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
|
||||
; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store float %a, float addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
|
||||
; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store float %a, float addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
|
||||
; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store float %a, float addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; f64
|
||||
|
||||
define void @st_global_f64(double addrspace(1)* %ptr, double %a) {
|
||||
; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store double %a, double addrspace(1)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_shared_f64(double addrspace(3)* %ptr, double %a) {
|
||||
; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store double %a, double addrspace(3)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @st_local_f64(double addrspace(5)* %ptr, double %a) {
|
||||
; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store double %a, double addrspace(5)* %ptr
|
||||
ret void
|
||||
}
|
69
test/CodeGen/NVPTX/st-generic.ll
Normal file
69
test/CodeGen/NVPTX/st-generic.ll
Normal file
@ -0,0 +1,69 @@
|
||||
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
|
||||
|
||||
|
||||
;; i8
|
||||
|
||||
define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
|
||||
; PTX32: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i8 %a, i8 addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i16
|
||||
|
||||
define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
|
||||
; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i16 %a, i16 addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i32
|
||||
|
||||
define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
|
||||
; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i32 %a, i32 addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; i64
|
||||
|
||||
define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
|
||||
; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store i64 %a, i64 addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; f32
|
||||
|
||||
define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
|
||||
; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store float %a, float addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;; f64
|
||||
|
||||
define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
|
||||
; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX32: ret
|
||||
; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
|
||||
; PTX64: ret
|
||||
store double %a, double addrspace(0)* %ptr
|
||||
ret void
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user