From fbe1366788870f9d58225b897cb8240ec99e3671 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Wed, 3 Sep 2014 17:40:30 +0000
Subject: [PATCH] Preserve IR flags (nsw, nuw, exact, fast-math) in SLP
 vectorizer (PR20802).

The SLP vectorizer should propagate IR-level optimization hints/flags
(nsw, nuw, exact, fast-math) when converting scalar instructions into
vectors. But this isn't a simple copy - we need to take the intersection
(the logical 'and') of the sets of flags on the scalars.

The solution is further complicated because we can have non-uniform
(non-SIMD) vector ops after:
http://reviews.llvm.org/D4015
http://llvm.org/viewvc/llvm-project?view=revision&revision=211339

The vast majority of changed files are existing tests that were not
propagating IR flags, but I've also added a new test file for focused
testing of IR flag possibilities.

Differential Revision: http://reviews.llvm.org/D5172

llvm-svn: 217051
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    |  35 +-
 .../SLPVectorizer/AArch64/commute.ll          |   8 +-
 test/Transforms/SLPVectorizer/ARM/sroa.ll     |   4 +-
 test/Transforms/SLPVectorizer/X86/addsub.ll   |  12 +-
 .../Transforms/SLPVectorizer/X86/cycle_dup.ll |   2 +-
 test/Transforms/SLPVectorizer/X86/hoist.ll    |   2 +-
 .../SLPVectorizer/X86/horizontal.ll           |   8 +-
 .../SLPVectorizer/X86/loopinvariant.ll        |   4 +-
 .../SLPVectorizer/X86/multi_user.ll           |   2 +-
 .../Transforms/SLPVectorizer/X86/powof2div.ll |   2 +-
 .../SLPVectorizer/X86/propagate_ir_flags.ll   | 350 ++++++++++++++++++
 test/Transforms/SLPVectorizer/X86/saxpy.ll    |   2 +-
 .../SLPVectorizer/X86/scheduling.ll           |   2 +-
 13 files changed, 404 insertions(+), 29 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 65987626edd..76b46779535 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -166,6 +166,23 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   return Opcode;
 }
 
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+      // Intersection is initialized to the 0th scalar,
+      // so start counting from index '1'.
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+          Intersection->andIRFlags(Scalar);
+      }
+      VecOp->copyIRFlags(Intersection);
+    }
+  }
+}
+
 /// \returns \p I after propagating metadata from \p VL.
 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
   Instruction *I0 = cast<Instruction>(VL[0]);
@@ -2031,6 +2048,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
       E->VectorizedValue = V;
+      propagateIRFlags(E->VectorizedValue, E->Scalars);
       ++NumVectorInstructions;
 
       if (Instruction *I = dyn_cast<Instruction>(V))
@@ -2194,18 +2212,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
 
-      // Create appropriate shuffle to take alternative operations from
-      // the vector.
-      std::vector<Constant *> Mask(E->Scalars.size());
+      // Create shuffle to take alternate operations from the vector.
+      // Also, gather up odd and even scalar ops to propagate IR flags to
+      // each vector operation.
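+      // With e = 4, for example, Mask below becomes <0, 5, 2, 7>: even
+      // lanes take even-index scalars from V0 and odd lanes take odd-index
+      // scalars from V1, so each vector op only needs the flags that are
+      // common to its own half of the scalars.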
+      ValueList OddScalars, EvenScalars;
       unsigned e = E->Scalars.size();
+      SmallVector<Constant *, 8> Mask(e);
       for (unsigned i = 0; i < e; ++i) {
-        if (i & 1)
+        if (i & 1) {
           Mask[i] = Builder.getInt32(e + i);
-        else
+          OddScalars.push_back(E->Scalars[i]);
+        } else {
           Mask[i] = Builder.getInt32(i);
+          EvenScalars.push_back(E->Scalars[i]);
+        }
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
+      propagateIRFlags(V0, EvenScalars);
+      propagateIRFlags(V1, OddScalars);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       E->VectorizedValue = V;
diff --git a/test/Transforms/SLPVectorizer/AArch64/commute.ll b/test/Transforms/SLPVectorizer/AArch64/commute.ll
index d1300eec666..4ee91a5ed4c 100644
--- a/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -10,8 +10,8 @@ define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK:   %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
 ; CHECK:   %3 = bitcast float* %arrayidx4 to <2 x float>*
 ; CHECK:   %4 = load <2 x float>* %3, align 4
-; CHECK:   %5 = fsub <2 x float> %2, %4
-; CHECK:   %6 = fmul <2 x float> %5, %5
+; CHECK:   %5 = fsub fast <2 x float> %2, %4
+; CHECK:   %6 = fmul fast <2 x float> %5, %5
 ; CHECK:   %7 = extractelement <2 x float> %6, i32 0
 ; CHECK:   %8 = extractelement <2 x float> %6, i32 1
 ; CHECK:   %add = fadd fast float %7, %8
@@ -45,8 +45,8 @@ define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK:   %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
 ; CHECK:   %3 = bitcast float* %arrayidx4 to <2 x float>*
 ; CHECK:   %4 = load <2 x float>* %3, align 4
-; CHECK:   %5 = fsub <2 x float> %2, %4
-; CHECK:   %6 = fmul <2 x float> %5, %5
+; CHECK:   %5 = fsub fast <2 x float> %2, %4
+; CHECK:   %6 = fmul fast <2 x float> %5, %5
 ; CHECK:   %7 = extractelement <2 x float> %6, i32 0
 ; CHECK:   %8 = extractelement <2 x float> %6, i32 1
 ; CHECK:   %add = fadd fast float %8, %7
diff --git a/test/Transforms/SLPVectorizer/ARM/sroa.ll b/test/Transforms/SLPVectorizer/ARM/sroa.ll
index e0c75b147f6..899cfb1f82c 100644
--- a/test/Transforms/SLPVectorizer/ARM/sroa.ll
+++ b/test/Transforms/SLPVectorizer/ARM/sroa.ll
@@ -5,11 +5,11 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 
 %class.Complex = type { double, double }
 
 ; Code like this is the result of SROA. Make sure we don't vectorize this
-; because the in the scalar version of this the shl/or are handled by the
+; because the scalar versions of the shl/or are handled by the
 ; backend and disappear, the vectorized code stays.
 ; CHECK-LABEL: SROAed
-; CHECK-NOT: shl <2 x i64>
+; CHECK-NOT: shl nuw <2 x i64>
 ; CHECK-NOT: or <2 x i64>
 
 define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
diff --git a/test/Transforms/SLPVectorizer/X86/addsub.ll b/test/Transforms/SLPVectorizer/X86/addsub.ll
index 8303bc8181e..174d4004684 100644
--- a/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -12,9 +12,9 @@ target triple = "x86_64-unknown-linux-gnu"
 @fa = common global [4 x float] zeroinitializer, align 16
 
 ; CHECK-LABEL: @addsub
-; CHECK: %5 = add <4 x i32> %3, %4
-; CHECK: %6 = add <4 x i32> %2, %5
-; CHECK: %7 = sub <4 x i32> %2, %5
+; CHECK: %5 = add nsw <4 x i32> %3, %4
+; CHECK: %6 = add nsw <4 x i32> %2, %5
+; CHECK: %7 = sub nsw <4 x i32> %2, %5
 ; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable
@@ -56,9 +56,9 @@ entry:
 }
 
 ; CHECK-LABEL: @subadd
-; CHECK: %5 = add <4 x i32> %3, %4
-; CHECK: %6 = sub <4 x i32> %2, %5
-; CHECK: %7 = add <4 x i32> %2, %5
+; CHECK: %5 = add nsw <4 x i32> %3, %4
+; CHECK: %6 = sub nsw <4 x i32> %2, %5
+; CHECK: %7 = add nsw <4 x i32> %2, %5
 ; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable
diff --git a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
index fba35499fb7..bac2c3c0df3 100644
--- a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
+++ b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@@ -15,7 +15,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK: bitcast i32* %A to <4 x i32>*
 ;CHECK-NEXT: load <4 x i32>
 ;CHECK: phi <4 x i32>
-;CHECK-NEXT: mul <4 x i32>
+;CHECK-NEXT: mul nsw <4 x i32>
 ;CHECK-NOT: mul
 ;CHECK: phi <4 x i32>
 ;CHECK: bitcast i32* %A to <4 x i32>*
diff --git a/test/Transforms/SLPVectorizer/X86/hoist.ll b/test/Transforms/SLPVectorizer/X86/hoist.ll
index 5074ceaaabd..78c58f1d84e 100644
--- a/test/Transforms/SLPVectorizer/X86/hoist.ll
+++ b/test/Transforms/SLPVectorizer/X86/hoist.ll
@@ -21,7 +21,7 @@ target triple = "i386-apple-macosx10.9.0"
 ; loop body:
 ;CHECK: phi
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 8f919512ff8..18360471c19 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -148,7 +148,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -250,7 +250,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: chain_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -317,7 +317,7 @@ for.end:
 ; }
 
 ; CHECK-LABEL: store_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
@@ -379,7 +379,7 @@ for.end:
 ; }
 
 ; STORE-LABEL: store_red_double
-; STORE: fmul <2 x double>
+; STORE: fmul fast <2 x double>
 ; STORE: extractelement <2 x double>
 ; STORE: extractelement <2 x double>
diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index aef2479dd52..bc12926e3fe 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -5,10 +5,10 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) #0 {
diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll
index cab99945e29..63a77e4b673 100644
--- a/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-apple-macosx10.7.0"
 ;CHECK-LABEL: @foo(
 ;CHECK: insertelement <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) {
diff --git a/test/Transforms/SLPVectorizer/X86/powof2div.ll b/test/Transforms/SLPVectorizer/X86/powof2div.ll
index b82cd4dd236..7aa1efde6f0 100644
--- a/test/Transforms/SLPVectorizer/X86/powof2div.ll
+++ b/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK-LABEL: @powof2div(
 ;CHECK: load <4 x i32>*
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: sdiv <4 x i32>
 define void @powof2div(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
 entry:
diff --git a/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
new file mode 100644
index 00000000000..3843ef7f62c
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@@ -0,0 +1,350 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check propagation of optional IR flags (PR20802). For a flag to
+; propagate from scalar instructions to their vector replacement,
+; *all* scalar instructions must have the flag.
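+; For example, in @nnan below, the four scalar fadds carry the flag sets
+; 'fast nnan', 'nnan ninf', 'nsz nnan', and 'arcp nnan'; the only flag
+; common to all four is 'nnan', so that is the only flag the vector
+; fadd keeps.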
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; CHECK-LABEL: @exact(
+; CHECK: lshr exact <4 x i32>
+define void @exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr exact i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_exact(
+; CHECK: lshr <4 x i32>
+define void @not_exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nsw(
+; CHECK: add nsw <4 x i32>
+define void @nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nsw(
+; CHECK: add <4 x i32>
+define void @not_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nuw(
+; CHECK: add nuw <4 x i32>
+define void @nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add nuw i32 %load2, 1
+  %op3 = add nuw i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nuw(
+; CHECK: add <4 x i32>
+define void @not_nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add i32 %load2, 1
+  %op3 = add i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nnan(
+; CHECK: fadd nnan <4 x float>
+define void @nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd nnan ninf float %load2, 1.0
+  %op3 = fadd nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nnan(
+; CHECK: fadd <4 x float>
+define void @not_nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd nnan float %load1, 1.0
+  %op2 = fadd ninf float %load2, 1.0
+  %op3 = fadd nsz float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @only_fast(
+; CHECK: fadd fast <4 x float>
+define void @only_fast(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd fast nnan ninf float %load2, 1.0
+  %op3 = fadd fast nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan fast float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @only_arcp(
+; CHECK: fadd arcp <4 x float>
+define void @only_arcp(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast float %load1, 1.0
+  %op2 = fadd fast float %load2, 1.0
+  %op3 = fadd fast float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_all_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+define void @addsub_all_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_some_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_some_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_no_nsw
+; CHECK: add <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_no_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index 46263416a90..4b39d46f89d 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
 ;CHECK: SAXPY
-;CHECK: mul <4 x i32>
+;CHECK: mul nsw <4 x i32>
 ;CHECK: ret
 
 define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) {
diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll
index 2ff32739439..3b3bd804e05 100644
--- a/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK-LABEL: @foo
 ;CHECK: load <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: %[[S1:.+]] = add <4 x i32>
+;CHECK: %[[S1:.+]] = add nsw <4 x i32>
 ;CHECK-DAG: store <4 x i32> %[[S1]]
 ;CHECK-DAG: %[[A1:.+]] = add nsw i32
 ;CHECK-DAG: %[[A2:.+]] = add nsw i32 %[[A1]]
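
For readers who want to experiment with the flag-intersection rule outside the
vectorizer, the following is a minimal standalone sketch, not part of the
patch. It assumes only the andIRFlags()/copyIRFlags() helpers that
propagateIRFlags() relies on above; the function and value names (f, s1, s2, v)
are made up for illustration.

// Illustrative sketch: build two scalar adds with different wrap flags,
// intersect the flags, and copy the result onto a third instruction that
// stands in for the vectorized op.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("flags", Ctx);
  IRBuilder<> B(Ctx);
  FunctionType *FTy =
      FunctionType::get(B.getInt32Ty(), {B.getInt32Ty(), B.getInt32Ty()},
                        /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));
  Function::arg_iterator Args = F->arg_begin();
  Value *X = &*Args++;
  Value *Y = &*Args;

  // Two "scalars": one is 'add nuw nsw', the other only 'add nsw'.
  auto *S1 = cast<BinaryOperator>(
      B.CreateAdd(X, Y, "s1", /*HasNUW=*/true, /*HasNSW=*/true));
  auto *S2 = cast<BinaryOperator>(
      B.CreateAdd(X, Y, "s2", /*HasNUW=*/false, /*HasNSW=*/true));
  // Stand-in for the new vector instruction (created with no flags).
  auto *V = cast<BinaryOperator>(B.CreateAdd(X, Y, "v"));
  B.CreateRet(V);

  // Fold S2's flags into S1, then copy the surviving flags to V.
  // V prints as 'add nsw': nuw drops out because S2 never had it.
  S1->andIRFlags(S2);
  V->copyIRFlags(S1);
  M.print(outs(), nullptr);
  return 0;
}

Like propagateIRFlags() above, the sketch intersects flags into the first
scalar in place; in the vectorizer that mutation is harmless because the
scalar ops are erased once the vector op replaces them.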