mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 03:33:20 +01:00
[NVPTX] Honor alignment on vector loads/stores
We were not considering the stated alignment on vector loads/stores, leading us to generate vector instructions even when we do not have sufficient alignment.

Now, for IR like:

  %1 = load <4 x float>, <4 x float>* %ptr, align 4

we will generate correct, conservative PTX like:

  ld.f32 ... [%ptr]
  ld.f32 ... [%ptr+4]
  ld.f32 ... [%ptr+8]
  ld.f32 ... [%ptr+12]

Or if we have an alignment of 8 (for example), we can generate code like:

  ld.v2.f32 ... [%ptr]
  ld.v2.f32 ... [%ptr+8]

llvm-svn: 213186
This commit is contained in:
parent
d7eaaec8e8
commit
35f9408e7f
@ -1494,6 +1494,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
break;
|
||||
}
|
||||
|
||||
MemSDNode *MemSD = cast<MemSDNode>(N);
|
||||
const DataLayout *TD = getDataLayout();
|
||||
|
||||
unsigned Align = MemSD->getAlignment();
|
||||
unsigned PrefAlign =
|
||||
TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
|
||||
if (Align < PrefAlign) {
|
||||
// This store is not sufficiently aligned, so bail out and let this vector
|
||||
// store be scalarized. Note that we may still be able to emit smaller
|
||||
// vector stores. For example, if we are storing a <4 x float> with an
|
||||
// alignment of 8, this check will fail but the legalizer will try again
|
||||
// with 2 x <2 x float>, which will succeed with an alignment of 8.
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
unsigned Opcode = 0;
|
||||
EVT EltVT = ValVT.getVectorElementType();
|
||||
unsigned NumElts = ValVT.getVectorNumElements();
|
||||
@ -1536,8 +1551,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
Ops.push_back(N->getOperand(i));
|
||||
}
|
||||
|
||||
MemSDNode *MemSD = cast<MemSDNode>(N);
|
||||
|
||||
SDValue NewSt = DAG.getMemIntrinsicNode(
|
||||
Opcode, DL, DAG.getVTList(MVT::Other), Ops,
|
||||
MemSD->getMemoryVT(), MemSD->getMemOperand());
|
||||
@ -3046,6 +3059,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
|
||||
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
|
||||
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
const DataLayout *TD,
|
||||
SmallVectorImpl<SDValue> &Results) {
|
||||
EVT ResVT = N->getValueType(0);
|
||||
SDLoc DL(N);
|
||||
@ -3073,6 +3087,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
break;
|
||||
}
|
||||
|
||||
LoadSDNode *LD = cast<LoadSDNode>(N);
|
||||
|
||||
unsigned Align = LD->getAlignment();
|
||||
unsigned PrefAlign =
|
||||
TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
|
||||
if (Align < PrefAlign) {
|
||||
// This load is not sufficiently aligned, so bail out and let this vector
|
||||
// load be scalarized. Note that we may still be able to emit smaller
|
||||
// vector loads. For example, if we are loading a <4 x float> with an
|
||||
// alignment of 8, this check will fail but the legalizer will try again
|
||||
// with 2 x <2 x float>, which will succeed with an alignment of 8.
|
||||
return;
|
||||
}
|
||||
|
||||
EVT EltVT = ResVT.getVectorElementType();
|
||||
unsigned NumElts = ResVT.getVectorNumElements();
|
||||
|
||||
@ -3109,8 +3137,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
|
||||
OtherOps.push_back(N->getOperand(i));
|
||||
|
||||
LoadSDNode *LD = cast<LoadSDNode>(N);
|
||||
|
||||
// The select routine does not have access to the LoadSDNode instance, so
|
||||
// pass along the extension information
|
||||
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
|
||||
@ -3283,7 +3309,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
|
||||
default:
|
||||
report_fatal_error("Unhandled custom legalization");
|
||||
case ISD::LOAD:
|
||||
ReplaceLoadVector(N, DAG, Results);
|
||||
ReplaceLoadVector(N, DAG, getDataLayout(), Results);
|
||||
return;
|
||||
case ISD::INTRINSIC_W_CHAIN:
|
||||
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
|
||||
|
77
test/CodeGen/NVPTX/misaligned-vector-ldst.ll
Normal file
77
test/CodeGen/NVPTX/misaligned-vector-ldst.ll
Normal file
@ -0,0 +1,77 @@
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
; t1: a <4 x float> load with only 1-byte alignment. No vector load and no f32 load may be emitted; the output is expected to contain byte-sized loads.
; CHECK-LABEL: t1
|
||||
define <4 x float> @t1(i8* %p1) {
|
||||
; CHECK-NOT: ld.v4
|
||||
; CHECK-NOT: ld.v2
|
||||
; CHECK-NOT: ld.f32
|
||||
; CHECK: ld.u8
|
||||
%cast = bitcast i8* %p1 to <4 x float>*
|
||||
%r = load <4 x float>* %cast, align 1
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
; t2: a <4 x float> load with 4-byte (element) alignment. No vector load may be emitted; scalar f32 loads are expected instead.
; CHECK-LABEL: t2
|
||||
define <4 x float> @t2(i8* %p1) {
|
||||
; CHECK-NOT: ld.v4
|
||||
; CHECK-NOT: ld.v2
|
||||
; CHECK: ld.f32
|
||||
%cast = bitcast i8* %p1 to <4 x float>*
|
||||
%r = load <4 x float>* %cast, align 4
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
; t3: a <4 x float> load with 8-byte alignment. A 4-wide vector load is not allowed, but 2-wide vector loads are expected.
; CHECK-LABEL: t3
|
||||
define <4 x float> @t3(i8* %p1) {
|
||||
; CHECK-NOT: ld.v4
|
||||
; CHECK: ld.v2
|
||||
%cast = bitcast i8* %p1 to <4 x float>*
|
||||
%r = load <4 x float>* %cast, align 8
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
; t4: a <4 x float> load with 16-byte alignment. A 4-wide vector load is expected.
; CHECK-LABEL: t4
|
||||
define <4 x float> @t4(i8* %p1) {
|
||||
; CHECK: ld.v4
|
||||
%cast = bitcast i8* %p1 to <4 x float>*
|
||||
%r = load <4 x float>* %cast, align 16
|
||||
ret <4 x float> %r
|
||||
}
|
||||
|
||||
|
||||
; s1: a <4 x float> store with only 1-byte alignment. No vector store and no f32 store may be emitted; the output is expected to contain byte-sized stores.
; CHECK-LABEL: s1
|
||||
define void @s1(<4 x float>* %p1, <4 x float> %v) {
|
||||
; CHECK-NOT: st.v4
|
||||
; CHECK-NOT: st.v2
|
||||
; CHECK-NOT: st.f32
|
||||
; CHECK: st.u8
|
||||
store <4 x float> %v, <4 x float>* %p1, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
; s2: a <4 x float> store with 4-byte (element) alignment. No vector store may be emitted; scalar f32 stores are expected instead.
; CHECK-LABEL: s2
|
||||
define void @s2(<4 x float>* %p1, <4 x float> %v) {
|
||||
; CHECK-NOT: st.v4
|
||||
; CHECK-NOT: st.v2
|
||||
; CHECK: st.f32
|
||||
store <4 x float> %v, <4 x float>* %p1, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; s3: a <4 x float> store with 8-byte alignment. Only the absence of a 4-wide vector store is verified here; unlike the matching load test, no particular replacement instruction is required.
; CHECK-LABEL: s3
|
||||
define void @s3(<4 x float>* %p1, <4 x float> %v) {
|
||||
; CHECK-NOT: st.v4
|
||||
store <4 x float> %v, <4 x float>* %p1, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; s4: a <4 x float> store with 16-byte alignment. A 4-wide vector store is expected.
; CHECK-LABEL: s4
|
||||
define void @s4(<4 x float>* %p1, <4 x float> %v) {
|
||||
; CHECK: st.v4
|
||||
store <4 x float> %v, <4 x float>* %p1, align 16
|
||||
ret void
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user