llvm-mirror/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll

; Vectorizes the scalar loop in @loop with a forced vector width of 4 and scalable
; vectorization switched on, then cleans the result up with instcombine. The first
; invocation uses an interleave factor of 1 and the second an interleave factor of 2;
; each output is verified by FileCheck under its own check prefix.
; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF1
; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s | FileCheck %s --check-prefix=CHECKUF2

; CHECKUF1: for.body.preheader:
; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count

; CHECKUF1: vector.ph:
; CHECKUF1-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1-DAG:  %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
; CHECKUF1:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf

; CHECKUF1: vector.body:
; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]
; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5


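; For an interleave factor of 2, the trip-count checks and the induction step use vscale * 8 instead of vscale * 4 (so vscale is shifted left by 3 instead of 2).
; The vector body also contains a second, interleaved copy of each load and store, addressed one vector further on: e.g. instead of indexing at IDXB, it indexes at IDXB + vscale * 4.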

; CHECKUF2: for.body.preheader:
; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count

; CHECKUF2: vector.ph:
; CHECKUF2-DAG:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2-DAG:  %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2-DAG:  %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
; CHECKUF2:      %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf

; CHECKUF2: vector.body:
; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5

; Scalar loop that is fed to the vectorizer. Equivalent C:
;   for (i = 0; i < N; i++) a[i] = b[i] + 1.0;
; %b is only read (readonly) and %a is only written. The loop back-edge carries
; the !llvm.loop !1 metadata.
; NOTE: %wide.trip.count, the argument names and the for.body.preheader label are
; matched by name in the FileCheck patterns earlier in this file; do not rename them.
define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
entry:
  ; Guard: the loop is entered only when N is strictly positive (signed compare).
  %cmp7 = icmp sgt i32 %N, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Trip count widened to i64; N is already known to be > 0 on this path.
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Induction variable: starts at 0 and advances by 1 per iteration.
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; a[i] = b[i] + 1.0
  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
  %0 = load double, double* %arrayidx, align 8
  %add = fadd double %0, 1.000000e+00
  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
  store double %add, double* %arrayidx2, align 8
  ; Leave the loop once the incremented index equals the trip count.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
}

; Loop metadata referenced by the back-edge branch of for.body via !llvm.loop !1:
; !1 is a distinct, self-referential loop ID whose single property (!2) sets
; llvm.loop.vectorize.scalable.enable to true.
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
[LV] Add -scalable-vectorization=<option> flag. This patch adds a new option to the LoopVectorizer to control how scalable vectors can be used. Initially, this suggests three levels to control scalable vectorization, although other more aggressive options can be added in the future. The possible options are: - Disabled: Disables vectorization with scalable vectors. - Enabled: Vectorize loops using scalable vectors or fixed-width vectors, but favors fixed-width vectors when the cost is a tie. - Preferred: Like 'Enabled', but favoring scalable vectors when the cost-model is inconclusive. Reviewed By: paulwalker-arm, vkmr Differential Revision: https://reviews.llvm.org/D101945 2021-04-08 13:19:44 +02:00			`; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s \| FileCheck %s --check-prefix=CHECKUF1`
			`; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 -force-vector-width=4 -force-target-supports-scalable-vectors=true -scalable-vectorization=on < %s \| FileCheck %s --check-prefix=CHECKUF2`
[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. * Steps are scaled by `vscale`, a runtime value. * Changes to circumvent the cost-model for now (temporary) so that the cost-model can be implemented separately. This can vectorize the following loop [1]: void loop(int N, double *a, double *b) { #pragma clang loop vectorize_width(4, scalable) for (int i = 0; i < N; i++) { a[i] = b[i] + 1.0; } } [1] This source-level example is based on the pragma proposed separately in D89031. This patch only implements the LLVM part. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D91077 2020-12-08 15:20:04 +01:00
			`; CHECKUF1: for.body.preheader:`
			`; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64`
			`; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2`
			`; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count`

			`; CHECKUF1: vector.ph:`
			`; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2`
			`; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]`
			`; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf`

			`; CHECKUF1: vector.body:`
			`; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]`
			`; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index`
			`; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*`
			`; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0`
[IR] Let IRBuilder's CreateVectorSplat/CreateShuffleVector use poison as placeholder This patch updates IRBuilder to create insertelement/shufflevector using poison as a placeholder. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D93793 2020-12-24 01:33:58 +01:00			`; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)`
[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. * Steps are scaled by `vscale`, a runtime value. * Changes to circumvent the cost-model for now (temporary) so that the cost-model can be implemented separately. This can vectorize the following loop [1]: void loop(int N, double *a, double *b) { #pragma clang loop vectorize_width(4, scalable) for (int i = 0; i < N; i++) { a[i] = b[i] + 1.0; } } [1] This source-level example is based on the pragma proposed separately in D89031. This patch only implements the LLVM part. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D91077 2020-12-08 15:20:04 +01:00			`; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index`
			`; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*`
			`; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0`
			`; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2`
[LV] Mark increment of main vector loop induction variable as NUW. This patch marks the induction increment of the main induction variable of the vector loop as NUW when not folding the tail. If the tail is not folded, we know that End - Start >= Step (either statically or through the minimum iteration checks). We also know that both Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + %Step == %End. Hence we must exit the loop before %IV + %Step unsigned overflows and we can mark the induction increment as NUW. This should make SCEV return more precise bounds for the created vector loops, used by later optimizations, like late unrolling. At the moment quite a few tests still need to be updated, but before doing so I'd like to get initial feedback to make sure I am not missing anything. Note that this could probably be further improved by using information from the original IV. Attempt of modeling of the assumption in Alive2: https://alive2.llvm.org/ce/z/H_DL_g Part of a set of fixes required for PR50412. Reviewed By: mkazantsev Differential Revision: https://reviews.llvm.org/D103255 2021-06-07 10:24:27 +02:00			`; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]`
[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. * Steps are scaled by `vscale`, a runtime value. * Changes to circumvent the cost-model for now (temporary) so that the cost-model can be implemented separately. This can vectorize the following loop [1]: void loop(int N, double *a, double *b) { #pragma clang loop vectorize_width(4, scalable) for (int i = 0; i < N; i++) { a[i] = b[i] + 1.0; } } [1] This source-level example is based on the pragma proposed separately in D89031. This patch only implements the LLVM part. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D91077 2020-12-08 15:20:04 +01:00			`; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec`
			`; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5`


			`; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).`
			`; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.`

			`; CHECKUF2: for.body.preheader:`
			`; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64`
			`; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3`
			`; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count`

			`; CHECKUF2: vector.ph:`
			`; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3`
			`; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]`
			`; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf`

			`; CHECKUF2: vector.body:`
			`; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]`
			`; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index`
			`; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*`
			`; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0`
			`; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()`
			`; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2`
			`; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64`
			`; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]`
			`; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*`
			`; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0`
[IR] Let IRBuilder's CreateVectorSplat/CreateShuffleVector use poison as placeholder This patch updates IRBuilder to create insertelement/shufflevector using poison as a placeholder. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D93793 2020-12-24 01:33:58 +01:00			`; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)`
			`; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)`
[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. * Steps are scaled by `vscale`, a runtime value. * Changes to circumvent the cost-model for now (temporary) so that the cost-model can be implemented separately. This can vectorize the following loop [1]: void loop(int N, double *a, double *b) { #pragma clang loop vectorize_width(4, scalable) for (int i = 0; i < N; i++) { a[i] = b[i] + 1.0; } } [1] This source-level example is based on the pragma proposed separately in D89031. This patch only implements the LLVM part. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D91077 2020-12-08 15:20:04 +01:00			`; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index`
			`; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*`
			`; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0`
			`; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()`
			`; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2`
			`; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64`
			`; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]`
			`; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*`
			`; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0`
			`; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()`
			`; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3`
[LV] Mark increment of main vector loop induction variable as NUW. This patch marks the induction increment of the main induction variable of the vector loop as NUW when not folding the tail. If the tail is not folded, we know that End - Start >= Step (either statically or through the minimum iteration checks). We also know that both Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV + %Step == %End. Hence we must exit the loop before %IV + %Step unsigned overflows and we can mark the induction increment as NUW. This should make SCEV return more precise bounds for the created vector loops, used by later optimizations, like late unrolling. At the moment quite a few tests still need to be updated, but before doing so I'd like to get initial feedback to make sure I am not missing anything. Note that this could probably be further improved by using information from the original IV. Attempt of modeling of the assumption in Alive2: https://alive2.llvm.org/ce/z/H_DL_g Part of a set of fixes required for PR50412. Reviewed By: mkazantsev Differential Revision: https://reviews.llvm.org/D103255 2021-06-07 10:24:27 +02:00			`; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]`
[LoopVectorizer][SVE] Vectorize a simple loop with a scalable VF. * Steps are scaled by `vscale`, a runtime value. * Changes to circumvent the cost-model for now (temporary) so that the cost-model can be implemented separately. This can vectorize the following loop [1]: void loop(int N, double *a, double *b) { #pragma clang loop vectorize_width(4, scalable) for (int i = 0; i < N; i++) { a[i] = b[i] + 1.0; } } [1] This source-level example is based on the pragma proposed separately in D89031. This patch only implements the LLVM part. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D91077 2020-12-08 15:20:04 +01:00			`; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec`
			`; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5`

			`define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {`
			`entry:`
			`%cmp7 = icmp sgt i32 %N, 0`
			`br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup`

			`for.body.preheader: ; preds = %entry`
			`%wide.trip.count = zext i32 %N to i64`
			`br label %for.body`

			`for.cond.cleanup: ; preds = %for.body, %entry`
			`ret void`

			`for.body: ; preds = %for.body.preheader, %for.body`
			`%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]`
			`%arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv`
			`%0 = load double, double* %arrayidx, align 8`
			`%add = fadd double %0, 1.000000e+00`
			`%arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv`
			`store double %add, double* %arrayidx2, align 8`
			`%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1`
			`%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count`
			`br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1`
			`}`

[LV] Legalize scalable VF hints In the following loop: void foo(int *a, int *b, int N) { for (int i=0; i<N; ++i) a[i + 4] = a[i] + b[i]; } The loop dependence constrains the VF to a maximum of (4, fixed), which would mean using <4 x i32> as the vector type in vectorization. Extending this to scalable vectorization, a VF of (4, scalable) implies a vector type of <vscale x 4 x i32>. To determine if this is legal vscale must be taken into account. For this example, unless max(vscale)=1, it's unsafe to vectorize. For SVE, the number of bits in an SVE register is architecturally defined to be a multiple of 128 bits with a maximum of 2048 bits, thus the maximum vscale is 16. In the loop above it is therefore unfeasible to vectorize with SVE. However, in this loop: void foo(int *a, int *b, int N) { #pragma clang loop vectorize_width(X, scalable) for (int i=0; i<N; ++i) a[i + 32] = a[i] + b[i]; } As long as max(vscale) multiplied by the number of lanes 'X' doesn't exceed the dependence distance, it is safe to vectorize. For SVE a VF of (2, scalable) is within this constraint, since a vector of <16 x 2 x 32> will have no dependencies between lanes. For any number of lanes larger than this it would be unsafe to vectorize. This patch extends 'computeFeasibleMaxVF' to legalize scalable VFs specified as loop hints, implementing the following behaviour: * If the backend does not support scalable vectors, ignore the hint. * If scalable vectorization is unfeasible given the loop dependence, like in the first example above for SVE, then use a fixed VF. * Accept scalable VFs if it's safe to do so. * Otherwise, clamp scalable VFs that exceed the maximum safe VF. Reviewed By: sdesmalen, fhahn, david-arm Differential Revision: https://reviews.llvm.org/D91718 2020-11-16 12:02:14 +01:00			`!1 = distinct !{!1, !2}`
			`!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}`