diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index b1ed2df7a17..32bb279f0e7 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2062,61 +2062,33 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
   //
   //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
   //
-  // Since we load an i8 value, the matching logic above will have selected an
-  // LDG instruction that reads i8 and stores it in an i16 register (NVPTX does
-  // not expose 8-bit registers):
-  //
-  //   i16,ch = INT_PTX_LDG_GLOBAL_i8areg64 t7, t0
-  //
-  // To get the correct type in this case, truncate back to i8 and then extend
-  // to the original load type.
+  // In this case, the matching logic above will select a load for the original
+  // memory type (in this case, i8) and our types will not match (the node needs
+  // to return an i32 in this case). Our LDG/LDU nodes do not support the
+  // concept of sign-/zero-extension, so emulate it here by adding an explicit
+  // CVT instruction. Ptxas should clean up any redundancies here.
+
   EVT OrigType = N->getValueType(0);
-  LoadSDNode *LDSD = dyn_cast<LoadSDNode>(N);
-  if (LDSD && EltVT == MVT::i8 && OrigType.getScalarSizeInBits() >= 32) {
-    unsigned CvtOpc = 0;
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
 
-    switch (LDSD->getExtensionType()) {
-    default:
-      llvm_unreachable("An extension is required for i8 loads");
-      break;
-    case ISD::SEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_s32_s8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_s64_s8;
-        break;
-      }
-      break;
-    case ISD::EXTLOAD:
-    case ISD::ZEXTLOAD:
-      switch (OrigType.getSimpleVT().SimpleTy) {
-      default:
-        llvm_unreachable("Unhandled integer load type");
-        break;
-      case MVT::i32:
-        CvtOpc = NVPTX::CVT_u32_u8;
-        break;
-      case MVT::i64:
-        CvtOpc = NVPTX::CVT_u64_u8;
-        break;
-      }
-      break;
-    }
+  if (OrigType != EltVT && LdNode) {
+    // We have an extending-load. The instruction we selected operates on the
+    // smaller type, but the SDNode we are replacing has the larger type. We
+    // need to emit a CVT to make the types match.
+    bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD;
+    unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(),
+                                       EltVT.getSimpleVT(), IsSigned);
 
-    // For each output value, truncate to i8 (since the upper 8 bits are
-    // undefined) and then extend to the desired type.
+    // For each output value, apply the manual sign/zero-extension and make sure
+    // all users of the load go through that CVT.
     for (unsigned i = 0; i != NumElts; ++i) {
       SDValue Res(LD, i);
       SDValue OrigVal(N, i);
 
       SDNode *CvtNode = CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
+          CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                    DL, MVT::i32));
       ReplaceUses(OrigVal, SDValue(CvtNode, 0));
     }
   }
@@ -5199,3 +5171,57 @@ bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
   }
   return true;
 }
+
+/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
+/// conversion from \p SrcTy to \p DestTy.
+unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
+                                             bool IsSigned) {
+  switch (SrcTy.SimpleTy) {
+  default:
+    llvm_unreachable("Unhandled source type");
+  case MVT::i8:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
+    }
+  case MVT::i16:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
+    }
+  case MVT::i32:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
+    case MVT::i64:
+      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
+    }
+  case MVT::i64:
+    switch (DestTy.SimpleTy) {
+    default:
+      llvm_unreachable("Unhandled dest type");
+    case MVT::i8:
+      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
+    case MVT::i16:
+      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
+    case MVT::i32:
+      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
+    }
+  }
+}
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b0fb63f7fe9..d62cc304e3c 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -93,6 +93,7 @@ private:
 
   bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
 
+  static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, bool IsSigned);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 50d9ea05545..c158cc6cdab 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -377,6 +377,8 @@ let hasSideEffects = 0 in {
   }
 
   // Generate cvts from all types to all types.
+  defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
+  defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
   defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
   defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
   defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
diff --git a/test/CodeGen/NVPTX/bug26185-2.ll b/test/CodeGen/NVPTX/bug26185-2.ll
new file mode 100644
index 00000000000..55e9dad96c0
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug26185-2.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+
+; Verify that we correctly emit code for extending ldg/ldu. We do not expose
+; extending variants in the backend, but the ldg/ldu selection code may pick
+; extending loads as candidates. We do want to support this, so make sure we
+; emit the necessary cvt.* instructions to implement the extension and let ptxas
+; emit the real extending loads.
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: spam
+define ptx_kernel void @spam(i8 addrspace(1)* noalias nocapture readonly %arg, i8 addrspace(1)* noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+bb:
+  %tmp = bitcast i8 addrspace(1)* %arg to i16 addrspace(1)*
+  %tmp4 = bitcast i8 addrspace(1)* %arg1 to i64 addrspace(1)*
+  %tmp5 = add nsw i64 %arg3, 8
+  %tmp6 = getelementptr i16, i16 addrspace(1)* %tmp, i64 %tmp5
+; CHECK: ld.global.nc.u16
+  %tmp7 = load i16, i16 addrspace(1)* %tmp6, align 2
+; CHECK: cvt.s32.s16
+  %tmp8 = sext i16 %tmp7 to i64
+  %tmp9 = mul nsw i64 %tmp8, %tmp8
+  %tmp10 = load i64, i64 addrspace(1)* %tmp4, align 8
+  %tmp11 = add nsw i64 %tmp9, %tmp10
+  store i64 %tmp11, i64 addrspace(1)* %tmp4, align 8
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "polly.skip.fn" }
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i64, i64)* @spam, !"maxntidx", i64 1, !"maxntidy", i64 1, !"maxntidz", i64 1}
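
For reference, here is a minimal sketch (not part of the patch) of the
zero-extending i8 case that the SelectLDGLDU comment above describes, written
in the same style as bug26185-2.ll. The file name, kernel name, and value
names are hypothetical; the expected cvt.u32.u8 follows from
GetConvertOpcode(MVT::i32, MVT::i8, /*IsSigned=*/false), and ldg selection is
assumed to fire under the same conditions as in the test above (a kernel with
a noalias readonly global pointer argument):

; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s

target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: zext_i8
define ptx_kernel void @zext_i8(i8 addrspace(1)* noalias nocapture readonly %in, i32 addrspace(1)* noalias nocapture %out) {
bb:
; The i8 ldg loads into a 16-bit register and cannot zero-extend by itself,
; so the explicit CVT added by this patch performs the extension to i32.
; CHECK: ld.global.nc.u8
  %val = load i8, i8 addrspace(1)* %in, align 1
; CHECK: cvt.u32.u8
  %ext = zext i8 %val to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

As the comment in SelectLDGLDU says, ptxas is expected to clean up the
resulting ld.global.nc.u8/cvt.u32.u8 pair, folding it back into a single
extending load where profitable.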