; llvm-mirror/test/Transforms/LoopStrengthReduce/X86/ivchain-stress-X86.ll

; REQUIRES: asserts
; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -stress-ivchain | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -stress-ivchain | FileCheck %s -check-prefix=X32

; @sharedidx is an unrolled variant of this loop:
;  for (unsigned long i = 0; i < len; i += s) {
;    c[i] = a[i] + b[i];
;  }
; where 's' cannot be folded into the addressing mode.
;
; This is not quite profitable to chain. But with -stress-ivchain, we
; can form three address chains in place of the shared induction
; variable.

; X64: sharedidx:
; X64: %for.body.preheader
; X64-NOT: leal ({{.*}},4)
; X64: %for.body.1

; X32: sharedidx:
; X32: %for.body.2
; X32: add
; X32: add
; X32: add
; X32: add
; X32: add
; X32: %for.body.3
; sharedidx(a, b, c, s, len): byte-wise c[i] = a[i] + b[i] for i = 0, s, 2*s, ...
; for as long as i < len (unsigned compare). The loop body is written out four
; times (for.body, for.body.1, for.body.2, for.body.3); each copy ends with its
; own exit test, and the last copy branches back to the first.
;
; Within a copy, all three pointers are indexed by the same i32 value, and each
; copy's index is the previous copy's index plus %s.
;
; NOTE: the check lines preceding this function match on these block labels in
; the llc output, so the labels and the overall shape of the IR should be left
; unchanged.
define void @sharedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c, i32 %s, i32 %len) nounwind ssp {
entry:
  ; Skip the loop entirely when len == 0.
  %cmp8 = icmp eq i32 %len, 0
  br i1 %cmp8, label %for.end, label %for.body

; Unrolled copy 0: the loop header. The index is 0 on entry and %add5.3 when
; arriving over the back edge from for.body.3.
for.body:                                         ; preds = %entry, %for.body.3
  %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
  ; Load a[i] and b[i] as bytes and zero-extend both to i32.
  %arrayidx = getelementptr inbounds i8* %a, i32 %i.09
  %0 = load i8* %arrayidx, align 1
  %conv6 = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8* %b, i32 %i.09
  %1 = load i8* %arrayidx1, align 1
  %conv27 = zext i8 %1 to i32
  ; Add in i32, then truncate so only the low 8 bits are stored to c[i].
  %add = add nsw i32 %conv27, %conv6
  %conv3 = trunc i32 %add to i8
  %arrayidx4 = getelementptr inbounds i8* %c, i32 %i.09
  store i8 %conv3, i8* %arrayidx4, align 1
  ; Advance the shared index by the stride %s; leave the loop once it is no
  ; longer below len (unsigned).
  %add5 = add i32 %i.09, %s
  %cmp = icmp ult i32 %add5, %len
  br i1 %cmp, label %for.body.1, label %for.end

; Common exit, reached from the entry guard and from every unrolled copy.
for.end:                                          ; preds = %for.body, %for.body.1, %for.body.2, %for.body.3, %entry
  ret void

; Unrolled copy 1: same body, indexed by %add5 (copy 0's index + %s).
for.body.1:                                       ; preds = %for.body
  %arrayidx.1 = getelementptr inbounds i8* %a, i32 %add5
  %2 = load i8* %arrayidx.1, align 1
  %conv6.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8* %b, i32 %add5
  %3 = load i8* %arrayidx1.1, align 1
  %conv27.1 = zext i8 %3 to i32
  %add.1 = add nsw i32 %conv27.1, %conv6.1
  %conv3.1 = trunc i32 %add.1 to i8
  %arrayidx4.1 = getelementptr inbounds i8* %c, i32 %add5
  store i8 %conv3.1, i8* %arrayidx4.1, align 1
  %add5.1 = add i32 %add5, %s
  %cmp.1 = icmp ult i32 %add5.1, %len
  br i1 %cmp.1, label %for.body.2, label %for.end

; Unrolled copy 2: same body, indexed by %add5.1 (copy 1's index + %s).
for.body.2:                                       ; preds = %for.body.1
  %arrayidx.2 = getelementptr inbounds i8* %a, i32 %add5.1
  %4 = load i8* %arrayidx.2, align 1
  %conv6.2 = zext i8 %4 to i32
  %arrayidx1.2 = getelementptr inbounds i8* %b, i32 %add5.1
  %5 = load i8* %arrayidx1.2, align 1
  %conv27.2 = zext i8 %5 to i32
  %add.2 = add nsw i32 %conv27.2, %conv6.2
  %conv3.2 = trunc i32 %add.2 to i8
  %arrayidx4.2 = getelementptr inbounds i8* %c, i32 %add5.1
  store i8 %conv3.2, i8* %arrayidx4.2, align 1
  %add5.2 = add i32 %add5.1, %s
  %cmp.2 = icmp ult i32 %add5.2, %len
  br i1 %cmp.2, label %for.body.3, label %for.end

; Unrolled copy 3: same body, indexed by %add5.2 (copy 2's index + %s). Its
; incremented index %add5.3 feeds the phi in for.body over the back edge.
for.body.3:                                       ; preds = %for.body.2
  %arrayidx.3 = getelementptr inbounds i8* %a, i32 %add5.2
  %6 = load i8* %arrayidx.3, align 1
  %conv6.3 = zext i8 %6 to i32
  %arrayidx1.3 = getelementptr inbounds i8* %b, i32 %add5.2
  %7 = load i8* %arrayidx1.3, align 1
  %conv27.3 = zext i8 %7 to i32
  %add.3 = add nsw i32 %conv27.3, %conv6.3
  %conv3.3 = trunc i32 %add.3 to i8
  %arrayidx4.3 = getelementptr inbounds i8* %c, i32 %add5.2
  store i8 %conv3.3, i8* %arrayidx4.3, align 1
  %add5.3 = add i32 %add5.2, %s
  %cmp.3 = icmp ult i32 %add5.3, %len
  br i1 %cmp.3, label %for.body, label %for.end
}
Adding IV chain generation to LSR.

After collecting chains, check if any should be materialized. If so, hide the chained IV users from the LSR solver. LSR will only solve for the head of the chain. GenerateIVChains will then materialize the chained IV users by computing the IV relative to its previous value in the chain.

In theory, chained IV users could be exposed to LSR's solver. This would be considerably complicated to implement, and I'm not aware of a case where we need it. In practice, it's more important to intelligently prune the search space of nontrivial loops before running the solver; otherwise, the solver is often forced to prune the best solutions. Hiding the chained users does this well, so that LSR is more likely to find the best IV for the chain as a whole.

llvm-svn: 147801
2012-01-09 22:18:52 +01:00
			`; REQUIRES: asserts`
			`; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -stress-ivchain \| FileCheck %s -check-prefix=X64`
			`; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -stress-ivchain \| FileCheck %s -check-prefix=X32`

			`; @sharedidx is an unrolled variant of this loop:`
			`; for (unsigned long i = 0; i < len; i += s) {`
			`; c[i] = a[i] + b[i];`
			`; }`
			`; where 's' cannot be folded into the addressing mode.`
			`;`
			`; This is not quite profitable to chain. But with -stress-ivchain, we`
			`; can form three address chains in place of the shared induction`
			`; variable.`

			`; X64: sharedidx:`
			`; X64: %for.body.preheader`
			`; X64-NOT: leal ({{.*}},4)`
			`; X64: %for.body.1`

			`; X32: sharedidx:`
			`; X32: %for.body.2`
			`; X32: add`
			`; X32: add`
			`; X32: add`
			`; X32: add`
			`; X32: add`
			`; X32: %for.body.3`
			`define void @sharedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c, i32 %s, i32 %len) nounwind ssp {`
			`entry:`
			`%cmp8 = icmp eq i32 %len, 0`
			`br i1 %cmp8, label %for.end, label %for.body`

			`for.body: ; preds = %entry, %for.body.3`
			`%i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]`
			`%arrayidx = getelementptr inbounds i8* %a, i32 %i.09`
			`%0 = load i8* %arrayidx, align 1`
			`%conv6 = zext i8 %0 to i32`
			`%arrayidx1 = getelementptr inbounds i8* %b, i32 %i.09`
			`%1 = load i8* %arrayidx1, align 1`
			`%conv27 = zext i8 %1 to i32`
			`%add = add nsw i32 %conv27, %conv6`
			`%conv3 = trunc i32 %add to i8`
			`%arrayidx4 = getelementptr inbounds i8* %c, i32 %i.09`
			`store i8 %conv3, i8* %arrayidx4, align 1`
			`%add5 = add i32 %i.09, %s`
			`%cmp = icmp ult i32 %add5, %len`
			`br i1 %cmp, label %for.body.1, label %for.end`

			`for.end: ; preds = %for.body, %for.body.1, %for.body.2, %for.body.3, %entry`
			`ret void`

			`for.body.1: ; preds = %for.body`
			`%arrayidx.1 = getelementptr inbounds i8* %a, i32 %add5`
			`%2 = load i8* %arrayidx.1, align 1`
			`%conv6.1 = zext i8 %2 to i32`
			`%arrayidx1.1 = getelementptr inbounds i8* %b, i32 %add5`
			`%3 = load i8* %arrayidx1.1, align 1`
			`%conv27.1 = zext i8 %3 to i32`
			`%add.1 = add nsw i32 %conv27.1, %conv6.1`
			`%conv3.1 = trunc i32 %add.1 to i8`
			`%arrayidx4.1 = getelementptr inbounds i8* %c, i32 %add5`
			`store i8 %conv3.1, i8* %arrayidx4.1, align 1`
			`%add5.1 = add i32 %add5, %s`
			`%cmp.1 = icmp ult i32 %add5.1, %len`
			`br i1 %cmp.1, label %for.body.2, label %for.end`

			`for.body.2: ; preds = %for.body.1`
			`%arrayidx.2 = getelementptr inbounds i8* %a, i32 %add5.1`
			`%4 = load i8* %arrayidx.2, align 1`
			`%conv6.2 = zext i8 %4 to i32`
			`%arrayidx1.2 = getelementptr inbounds i8* %b, i32 %add5.1`
			`%5 = load i8* %arrayidx1.2, align 1`
			`%conv27.2 = zext i8 %5 to i32`
			`%add.2 = add nsw i32 %conv27.2, %conv6.2`
			`%conv3.2 = trunc i32 %add.2 to i8`
			`%arrayidx4.2 = getelementptr inbounds i8* %c, i32 %add5.1`
			`store i8 %conv3.2, i8* %arrayidx4.2, align 1`
			`%add5.2 = add i32 %add5.1, %s`
			`%cmp.2 = icmp ult i32 %add5.2, %len`
			`br i1 %cmp.2, label %for.body.3, label %for.end`

			`for.body.3: ; preds = %for.body.2`
			`%arrayidx.3 = getelementptr inbounds i8* %a, i32 %add5.2`
			`%6 = load i8* %arrayidx.3, align 1`
			`%conv6.3 = zext i8 %6 to i32`
			`%arrayidx1.3 = getelementptr inbounds i8* %b, i32 %add5.2`
			`%7 = load i8* %arrayidx1.3, align 1`
			`%conv27.3 = zext i8 %7 to i32`
			`%add.3 = add nsw i32 %conv27.3, %conv6.3`
			`%conv3.3 = trunc i32 %add.3 to i8`
			`%arrayidx4.3 = getelementptr inbounds i8* %c, i32 %add5.2`
			`store i8 %conv3.3, i8* %arrayidx4.3, align 1`
			`%add5.3 = add i32 %add5.2, %s`
			`%cmp.3 = icmp ult i32 %add5.3, %len`
			`br i1 %cmp.3, label %for.body, label %for.end`
			`}`