mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[X86] Enable interleaved memory access by default
This lets the loop vectorizer generate interleaved memory accesses on x86. Differential Revision: https://reviews.llvm.org/D25350 llvm-svn: 284779
This commit is contained in:
parent
aabe0b98f9
commit
ae8887d98b
@ -1767,3 +1767,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
|
||||
// correct.
|
||||
return (CallerBits & CalleeBits) == CalleeBits;
|
||||
}
|
||||
|
||||
// Returns false when the subtarget ST reports isAtom() or isSLM(), and true
// otherwise. Per the commit description, this is what lets the loop
// vectorizer generate interleaved memory accesses on x86.
// NOTE(review): the TODO below only mentions Atom, but the predicate also
// excludes isSLM() subtargets (presumably Silvermont -- confirm).
bool X86TTIImpl::enableInterleavedAccessVectorization() {
|
||||
// TODO: We expect this to be beneficial regardless of arch,
|
||||
// but there are currently some unexplained performance artifacts on Atom.
|
||||
// As a temporary solution, disable on Atom.
|
||||
return !(ST->isAtom() || ST->isSLM());
|
||||
}
|
||||
|
@ -93,6 +93,8 @@ public:
|
||||
bool isLegalMaskedScatter(Type *DataType);
|
||||
bool areInlineCompatible(const Function *Caller,
|
||||
const Function *Callee) const;
|
||||
|
||||
bool enableInterleavedAccessVectorization();
|
||||
private:
|
||||
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
|
||||
unsigned Alignment, unsigned AddressSpace);
|
||||
|
@ -67,7 +67,7 @@ for:
|
||||
%t2 = load float, float* %arrayidx3, align 4
|
||||
%add = fadd fast float %t1, %s.02
|
||||
%add4 = fadd fast float %add, %t2
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
|
||||
%cmp1 = icmp slt i64 %indvars.iv.next, %t0
|
||||
br i1 %cmp1, label %for, label %loopexit
|
||||
|
||||
|
@ -85,7 +85,7 @@ for.end: ; preds = %for.cond
|
||||
; The source code
|
||||
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
|
||||
;
|
||||
; for (int i=0; i<SIZE; ++i) {
|
||||
; for (int i=0; i<SIZE; i += 16) {
|
||||
; if (trigger[i] > 0) {
|
||||
; out[i] = in[i].b + (float) 0.5;
|
||||
; }
|
||||
@ -95,9 +95,9 @@ for.end: ; preds = %for.cond
|
||||
%struct.In = type { float, float }
|
||||
|
||||
;AVX512-LABEL: @foo2
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.gather.v16f32
|
||||
;AVX512: llvm.masked.store.v16f32
|
||||
;AVX512: llvm.masked.scatter.v16f32
|
||||
;AVX512: ret void
|
||||
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
|
||||
entry:
|
||||
@ -147,7 +147,7 @@ if.end: ; preds = %if.then, %for.body
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %9, 1
|
||||
%inc = add nsw i32 %9, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
@ -162,7 +162,7 @@ for.end: ; preds = %for.cond
|
||||
;};
|
||||
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
|
||||
;
|
||||
; for (int i=0; i<SIZE; ++i) {
|
||||
; for (int i=0; i<SIZE; i += 16) {
|
||||
; if (trigger[i] > 0) {
|
||||
; out[i].b = in[i].b + (float) 0.5;
|
||||
; }
|
||||
@ -170,10 +170,10 @@ for.end: ; preds = %for.cond
|
||||
;}
|
||||
|
||||
;AVX512-LABEL: @foo3
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
|
||||
;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.gather.v16f32
|
||||
;AVX512: fadd <16 x float>
|
||||
;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
|
||||
;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
|
||||
;AVX512: llvm.masked.scatter.v16f32
|
||||
;AVX512: ret void
|
||||
|
||||
@ -226,7 +226,7 @@ if.end: ; preds = %if.then, %for.body
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%9 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %9, 1
|
||||
%inc = add nsw i32 %9, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
|
35
test/Transforms/LoopVectorize/X86/interleaving.ll
Normal file
35
test/Transforms/LoopVectorize/X86/interleaving.ll
Normal file
@ -0,0 +1,35 @@
|
||||
; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
|
||||
; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
|
||||
|
||||
; NORMAL-LABEL: foo
|
||||
; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
|
||||
; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
|
||||
|
||||
; ATOM-LABEL: foo
|
||||
; ATOM: load i32
|
||||
; ATOM: load i32
|
||||
; ATOM: store i32
|
||||
; Test input: each iteration loads b[2*i] and b[2*i + 1] (the index is formed
; with shl-by-1, then or-with-1), adds the two values, and stores the sum to
; a[i]. The loop exits once the incremented induction variable equals 1024.
define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%0 = shl nsw i64 %indvars.iv, 1
|
||||
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %0
|
||||
%1 = load i32, i32* %arrayidx, align 4
|
||||
%2 = or i64 %0, 1
|
||||
%arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2
|
||||
%3 = load i32, i32* %arrayidx3, align 4
|
||||
%add4 = add nsw i32 %3, %1
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
|
||||
store i32 %add4, i32* %arrayidx6, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 1024
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
@ -341,7 +341,7 @@ for.end: ; preds = %for.cond
|
||||
;
|
||||
;void foo4(double *A, double *B, int *trigger) {
|
||||
;
|
||||
; for (int i=0; i<10000; i++) {
|
||||
; for (int i=0; i<10000; i += 16) {
|
||||
; if (trigger[i] < 100) {
|
||||
; A[i] = B[i*2] + trigger[i]; << non-consecutive access
|
||||
; }
|
||||
@ -410,7 +410,7 @@ if.end: ; preds = %if.then, %for.body
|
||||
|
||||
for.inc: ; preds = %if.end
|
||||
%12 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %12, 1
|
||||
%inc = add nsw i32 %12, 16
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user