diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4803a9d038c..4bb8c436564 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2635,7 +2635,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
   assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
          " into one vector.");
-
+
   unsigned VF = MaxVectorSize;
 
   // If we optimize the program for size, avoid creating the tail loop.
@@ -2697,17 +2697,23 @@ unsigned LoopVectorizationCostModel::getWidestType() {
 
     // For each instruction in the loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      if (Legal->isUniformAfterVectorization(it))
-        continue;
-
       Type *T = it->getType();
+      // Only examine Loads, Stores and PHINodes.
+      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+        continue;
+
+      // Examine PHI nodes that are reduction variables.
+      if (PHINode *PN = dyn_cast<PHINode>(it))
+        if (!Legal->getReductionVars()->count(PN))
+          continue;
+
+      // Examine the stored values.
       if (StoreInst *ST = dyn_cast<StoreInst>(it))
         T = ST->getValueOperand()->getType();
 
-      // PHINodes and pointers are difficult to analyze, but we catch all other
-      // uses of the types in other instructions.
-      if (isa<PHINode>(it) || T->isPointerTy() || T->isVoidTy())
+      // Ignore stored/loaded pointer types.
+      if (T->isPointerTy())
         continue;
 
       MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 00000000000..c0795b6a79a
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F64
+;CHECK: <2 x double>
+;CHECK:ret
+define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds double* %A, i64 %indvars.iv
+  %3 = load double* %2, align 8
+  %4 = fmul fast double %prod.01, %3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret double %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i8* %A, i64 %indvars.iv
+  %3 = load i8* %2, align 1
+  %4 = xor i8 %3, %red.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i8 %red.0.lcssa
+}