
[NVPTX] Honor alignment on vector loads/stores

We were not considering the stated alignment on vector loads/stores,
leading us to generate vector instructions even when we do not have
sufficient alignment.

Now, for IR like:

  %1 = load <4 x float>, <4 x float>* %ptr, align 4

we will generate correct, conservative PTX like:

  ld.f32 ... [%ptr]
  ld.f32 ... [%ptr+4]
  ld.f32 ... [%ptr+8]
  ld.f32 ... [%ptr+12]

Or if we have an alignment of 8 (for example), we can
generate code like:

  ld.v2.f32 ... [%ptr]
  ld.v2.f32 ... [%ptr+8]

llvm-svn: 213186
commit 35f9408e7f (parent d7eaaec8e8)
Justin Holewinski, 2014-07-16 19:45:35 +00:00
2 changed files with 108 additions and 5 deletions
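
The retry cascade described in the commit message can be modeled in a few lines of standalone C++. This is a sketch only: emitAccesses and its parameters are hypothetical names, the model ignores address spaces, and (as test t1 further below shows) the real backend additionally splits under-aligned scalar accesses into byte accesses:

  #include <cstdio>

  // Standalone model of the splitting cascade (illustrative; not LLVM code).
  // A piece of NumElts elements of EltBytes each stays a single vector
  // access only when the stated alignment covers the piece's natural
  // (preferred) alignment; otherwise the legalizer splits it in half and
  // retries, bottoming out at scalar accesses.
  static void emitAccesses(unsigned Offset, unsigned NumElts,
                           unsigned EltBytes, unsigned Align) {
    unsigned PieceBytes = NumElts * EltBytes; // natural alignment of piece
    if (NumElts == 1 || Align >= PieceBytes) {
      if (NumElts == 1)
        std::printf("  ld.f32    [%%ptr+%u]\n", Offset);
      else
        std::printf("  ld.v%u.f32 [%%ptr+%u]\n", NumElts, Offset);
      return;
    }
    // Align < PrefAlign: bail out of vector lowering; the type legalizer
    // retries with two half-width pieces, e.g. <4 x float> -> 2 x <2 x float>.
    emitAccesses(Offset, NumElts / 2, EltBytes, Align);
    emitAccesses(Offset + PieceBytes / 2, NumElts / 2, EltBytes, Align);
  }

  int main() {
    std::printf("align 4:\n");
    emitAccesses(0, 4, 4, 4);  // four scalar ld.f32 at +0, +4, +8, +12
    std::printf("align 8:\n");
    emitAccesses(0, 4, 4, 8);  // two ld.v2.f32 at +0 and +8
    std::printf("align 16:\n");
    emitAccesses(0, 4, 4, 16); // one ld.v4.f32: naturally aligned
    return 0;
  }

Running it prints the three access patterns quoted in the commit message above.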

lib/Target/NVPTX/NVPTXISelLowering.cpp

@@ -1494,6 +1494,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       break;
     }
 
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+    const DataLayout *TD = getDataLayout();
+
+    unsigned Align = MemSD->getAlignment();
+    unsigned PrefAlign =
+        TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+    if (Align < PrefAlign) {
+      // This store is not sufficiently aligned, so bail out and let this vector
+      // store be scalarized. Note that we may still be able to emit smaller
+      // vector stores. For example, if we are storing a <4 x float> with an
+      // alignment of 8, this check will fail but the legalizer will try again
+      // with 2 x <2 x float>, which will succeed with an alignment of 8.
+      return SDValue();
+    }
+
     unsigned Opcode = 0;
     EVT EltVT = ValVT.getVectorElementType();
     unsigned NumElts = ValVT.getVectorNumElements();
@@ -1536,8 +1551,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       Ops.push_back(N->getOperand(i));
     }
 
-    MemSDNode *MemSD = cast<MemSDNode>(N);
-
     SDValue NewSt = DAG.getMemIntrinsicNode(
         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
         MemSD->getMemoryVT(), MemSD->getMemOperand());
@@ -3046,6 +3059,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
 
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              const DataLayout *TD,
                               SmallVectorImpl<SDValue> &Results) {
   EVT ResVT = N->getValueType(0);
   SDLoc DL(N);
@@ -3073,6 +3087,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     break;
   }
 
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized. Note that we may still be able to emit smaller
+    // vector loads. For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
   EVT EltVT = ResVT.getVectorElementType();
   unsigned NumElts = ResVT.getVectorNumElements();
@@ -3109,8 +3137,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
     OtherOps.push_back(N->getOperand(i));
 
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
@@ -3283,7 +3309,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   default:
     report_fatal_error("Unhandled custom legalization");
   case ISD::LOAD:
-    ReplaceLoadVector(N, DAG, Results);
+    ReplaceLoadVector(N, DAG, getDataLayout(), Results);
     return;
   case ISD::INTRINSIC_W_CHAIN:
     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);

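Both hunks add the same two-line comparison: once to the store path (LowerSTOREVector) and once to the load path (ReplaceLoadVector). As a sketch of a possible shared helper (a hypothetical refactoring, not part of this commit; it reuses the same 2014-era LLVM calls that appear in the diff above):

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  #include "llvm/IR/DataLayout.h"

  using namespace llvm;

  // Hypothetical helper: true when the access's stated alignment reaches the
  // preferred alignment of the in-memory vector type, i.e. when the vector
  // form of the load/store may be kept.
  static bool hasSufficientAlignment(const MemSDNode *MemSD, EVT VT,
                                     const DataLayout *TD, SelectionDAG &DAG) {
    unsigned PrefAlign =
        TD->getPrefTypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
    return MemSD->getAlignment() >= PrefAlign;
  }

The commit instead repeats the check in both functions, keeping each lowering path self-contained at the cost of a little duplication.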
test/CodeGen/NVPTX/misaligned-vector-ldst.ll

@@ -0,0 +1,77 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: t1
define <4 x float> @t1(i8* %p1) {
; CHECK-NOT: ld.v4
; CHECK-NOT: ld.v2
; CHECK-NOT: ld.f32
; CHECK: ld.u8
  %cast = bitcast i8* %p1 to <4 x float>*
  %r = load <4 x float>* %cast, align 1
  ret <4 x float> %r
}

; CHECK-LABEL: t2
define <4 x float> @t2(i8* %p1) {
; CHECK-NOT: ld.v4
; CHECK-NOT: ld.v2
; CHECK: ld.f32
  %cast = bitcast i8* %p1 to <4 x float>*
  %r = load <4 x float>* %cast, align 4
  ret <4 x float> %r
}

; CHECK-LABEL: t3
define <4 x float> @t3(i8* %p1) {
; CHECK-NOT: ld.v4
; CHECK: ld.v2
  %cast = bitcast i8* %p1 to <4 x float>*
  %r = load <4 x float>* %cast, align 8
  ret <4 x float> %r
}

; CHECK-LABEL: t4
define <4 x float> @t4(i8* %p1) {
; CHECK: ld.v4
  %cast = bitcast i8* %p1 to <4 x float>*
  %r = load <4 x float>* %cast, align 16
  ret <4 x float> %r
}

; CHECK-LABEL: s1
define void @s1(<4 x float>* %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
; CHECK-NOT: st.v2
; CHECK-NOT: st.f32
; CHECK: st.u8
  store <4 x float> %v, <4 x float>* %p1, align 1
  ret void
}

; CHECK-LABEL: s2
define void @s2(<4 x float>* %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
; CHECK-NOT: st.v2
; CHECK: st.f32
  store <4 x float> %v, <4 x float>* %p1, align 4
  ret void
}

; CHECK-LABEL: s3
define void @s3(<4 x float>* %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
  store <4 x float> %v, <4 x float>* %p1, align 8
  ret void
}

; CHECK-LABEL: s4
define void @s4(<4 x float>* %p1, <4 x float> %v) {
; CHECK: st.v4
  store <4 x float> %v, <4 x float>* %p1, align 16
  ret void
}