llvm-mirror/test/CodeGen/X86/fold-and-shift.ll

; RUN: llc < %s -march=x86 | FileCheck %s

define i32 @t1(i8* %X, i32 %i) {
; Byte-table variant: the offset (%i << 2) & 1020 is the same value as
; (%i & 255) << 2. The checks below expect no explicit mask instruction;
; instead a zero-extending byte move supplies the mask and the shift by two
; is absorbed into the scale-4 addressing mode of the load.
; CHECK: t1:
; CHECK-NOT: and
; CHECK: movzbl
; CHECK: movl (%{{...}},%{{...}},4),
; CHECK: ret

entry:
  %scaled = shl i32 %i, 2
  %offset = and i32 %scaled, 1020
  %byte.addr = getelementptr i8* %X, i32 %offset
  %word.addr = bitcast i8* %byte.addr to i32*
  %word = load i32* %word.addr
  ret i32 %word
}

define i32 @t2(i16* %X, i32 %i) {
; 16-bit variant: the index (%i << 1) & 131070 equals (%i & 65535) << 1, and
; indexing an i16 pointer doubles it again, giving a byte offset of
; (%i & 65535) << 2. The checks below expect a zero-extending word move in
; place of the mask, with the remaining shift folded into a scale-4 address.
; CHECK: t2:
; CHECK-NOT: and
; CHECK: movzwl
; CHECK: movl (%{{...}},%{{...}},4),
; CHECK: ret

entry:
  %doubled = shl i32 %i, 1
  %index = and i32 %doubled, 131070
  %half.addr = getelementptr i16* %X, i32 %index
  %word.addr = bitcast i16* %half.addr to i32*
  %word = load i32* %word.addr
  ret i32 %word
}

define i32 @t3(i16* %i.ptr, i32* %arr) {
; The tricky case. Indexing an i32 array with the result of an lshr lowers to
; an lshr followed by an 'and' that clears the low bits. The better form is to
; shift right by the larger constant and let the addressing-mode scale multiply
; the index back up. What makes this harder is that the zero-extended loaded
; value is used twice in the function (once shifted as the index, once directly
; in the add), so the DAG may try to share both forms, which obscures the mask
; and makes the pattern harder to match.
; CHECK: t3:
; CHECK-NOT: and
; CHECK: shrl
; CHECK: addl (%{{...}},%{{...}},4),
; CHECK: ret

entry:
  %narrow = load i16* %i.ptr
  %wide = zext i16 %narrow to i32
  %slot = lshr i32 %wide, 11
  %slot.addr = getelementptr inbounds i32* %arr, i32 %slot
  %entry.val = load i32* %slot.addr
  %result = add i32 %entry.val, %wide
  ret i32 %result
}

define i32 @t4(i16* %i.ptr, i32* %arr) {
; Same idea as @t3, but with an additional zero extension (the index is widened
; to i64 before the address computation) and more re-use of intermediate values
; (the shifted index is also added into the result). This exercises slightly
; different bits of canonicalization.
; CHECK: t4:
; CHECK-NOT: and
; CHECK: shrl
; CHECK: addl (%{{...}},%{{...}},4),
; CHECK: ret

entry:
  %narrow = load i16* %i.ptr
  %wide = zext i16 %narrow to i32
  %slot = lshr i32 %wide, 11
  %slot.wide = zext i32 %slot to i64
  %slot.addr = getelementptr inbounds i32* %arr, i64 %slot.wide
  %entry.val = load i32* %slot.addr
  %partial = add i32 %entry.val, %wide
  %result = add i32 %partial, %slot
  ret i32 %result
}
Cleanup and FileCheck-ize a test. llvm-svn: 147772 2012-01-09 10:44:26 +01:00			`; RUN: llc < %s -march=x86 \| FileCheck %s`
Fold some and + shift in x86 addressing mode. llvm-svn: 44970 2007-12-13 01:43:27 +01:00
			`define i32 @t1(i8* %X, i32 %i) {`
Cleanup and FileCheck-ize a test. llvm-svn: 147772 2012-01-09 10:44:26 +01:00			`; CHECK: t1:`
			`; CHECK-NOT: and`
			`; CHECK: movzbl`
			`; CHECK: movl (%{{...}},%{{...}},4),`
			`; CHECK: ret`

Fold some and + shift in x86 addressing mode. llvm-svn: 44970 2007-12-13 01:43:27 +01:00			`entry:`
Cleanup and FileCheck-ize a test. llvm-svn: 147772 2012-01-09 10:44:26 +01:00			`%tmp2 = shl i32 %i, 2`
			`%tmp4 = and i32 %tmp2, 1020`
			`%tmp7 = getelementptr i8* %X, i32 %tmp4`
			`%tmp78 = bitcast i8* %tmp7 to i32*`
			`%tmp9 = load i32* %tmp78`
			`ret i32 %tmp9`
Fold some and + shift in x86 addressing mode. llvm-svn: 44970 2007-12-13 01:43:27 +01:00			`}`

			`define i32 @t2(i16* %X, i32 %i) {`
Cleanup and FileCheck-ize a test. llvm-svn: 147772 2012-01-09 10:44:26 +01:00			`; CHECK: t2:`
			`; CHECK-NOT: and`
			`; CHECK: movzwl`
			`; CHECK: movl (%{{...}},%{{...}},4),`
			`; CHECK: ret`

Fold some and + shift in x86 addressing mode. llvm-svn: 44970 2007-12-13 01:43:27 +01:00			`entry:`
Cleanup and FileCheck-ize a test. llvm-svn: 147772 2012-01-09 10:44:26 +01:00			`%tmp2 = shl i32 %i, 1`
			`%tmp4 = and i32 %tmp2, 131070`
			`%tmp7 = getelementptr i16* %X, i32 %tmp4`
			`%tmp78 = bitcast i16* %tmp7 to i32*`
			`%tmp9 = load i32* %tmp78`
			`ret i32 %tmp9`
Fold some and + shift in x86 addressing mode. llvm-svn: 44970 2007-12-13 01:43:27 +01:00			`}`
Teach the X86 instruction selection to do some heroic transforms to detect a pattern which can be implemented with a small 'shl' embedded in the addressing mode scale. This happens in real code as follows: unsigned x = my_accelerator_table[input >> 11]; Here we have some lookup table that we look into using the high bits of 'input'. Each entity in the table is 4-bytes, which means this implicitly gets turned into (once lowered out of a GEP): (unsigned)((char*)my_accelerator_table + ((input >> 11) << 2)); The shift right followed by a shift left is canonicalized to a smaller shift right and masking off the low bits. That hides the shift right which x86 has an addressing mode designed to support. We now detect masks of this form, and produce the longer shift right followed by the proper addressing mode. In addition to saving a (rather large) instruction, this also reduces stalls in Intel chips on benchmarks I've measured. In order for all of this to work, one part of the DAG needs to be canonicalized *still further* than it currently is. This involves removing pointless 'trunc' nodes between a zextload and a zext. Without that, we end up generating spurious masks and hiding the pattern. llvm-svn: 147936 2012-01-11 09:41:08 +01:00
			`define i32 @t3(i16* %i.ptr, i32* %arr) {`
			`; This case is tricky. The lshr followed by a gep will produce a lshr followed`
			`; by an and to remove the low bits. This can be simplified by doing the lshr by`
			`; a greater constant and using the addressing mode to scale the result back up.`
			`; To make matters worse, because of the two-phase zext of %i and their reuse in`
			`; the function, the DAG can get confusing trying to re-use both of them and`
			`; prevent easy analysis of the mask in order to match this.`
Revert r147945 which disabled an addressing mode transformation. I had hoped this would revive one of the llvm-gcc selfhost build bots, but it didn't so it doesn't appear that my transform is the culprit. If anyone else is seeing failures, please let me know! llvm-svn: 147957 2012-01-11 19:36:12 +01:00			`; CHECK: t3:`
			`; CHECK-NOT: and`
			`; CHECK: shrl`
			`; CHECK: addl (%{{...}},%{{...}},4),`
			`; CHECK: ret`
Teach the X86 instruction selection to do some heroic transforms to detect a pattern which can be implemented with a small 'shl' embedded in the addressing mode scale. This happens in real code as follows: unsigned x = my_accelerator_table[input >> 11]; Here we have some lookup table that we look into using the high bits of 'input'. Each entity in the table is 4-bytes, which means this implicitly gets turned into (once lowered out of a GEP): (unsigned)((char*)my_accelerator_table + ((input >> 11) << 2)); The shift right followed by a shift left is canonicalized to a smaller shift right and masking off the low bits. That hides the shift right which x86 has an addressing mode designed to support. We now detect masks of this form, and produce the longer shift right followed by the proper addressing mode. In addition to saving a (rather large) instruction, this also reduces stalls in Intel chips on benchmarks I've measured. In order for all of this to work, one part of the DAG needs to be canonicalized *still further* than it currently is. This involves removing pointless 'trunc' nodes between a zextload and a zext. Without that, we end up generating spurious masks and hiding the pattern. llvm-svn: 147936 2012-01-11 09:41:08 +01:00
			`entry:`
			`%i = load i16* %i.ptr`
			`%i.zext = zext i16 %i to i32`
			`%index = lshr i32 %i.zext, 11`
			`%val.ptr = getelementptr inbounds i32* %arr, i32 %index`
			`%val = load i32* %val.ptr`
			`%sum = add i32 %val, %i.zext`
			`ret i32 %sum`
			`}`

			`define i32 @t4(i16* %i.ptr, i32* %arr) {`
			`; A version of @t3 that has more zero extends and more re-use of intermediate`
			`; values. This exercises slightly different bits of canonicalization.`
Revert r147945 which disabled an addressing mode transformation. I had hoped this would revive one of the llvm-gcc selfhost build bots, but it didn't so it doesn't appear that my transform is the culprit. If anyone else is seeing failures, please let me know! llvm-svn: 147957 2012-01-11 19:36:12 +01:00			`; CHECK: t4:`
			`; CHECK-NOT: and`
			`; CHECK: shrl`
			`; CHECK: addl (%{{...}},%{{...}},4),`
			`; CHECK: ret`
Teach the X86 instruction selection to do some heroic transforms to detect a pattern which can be implemented with a small 'shl' embedded in the addressing mode scale. This happens in real code as follows: unsigned x = my_accelerator_table[input >> 11]; Here we have some lookup table that we look into using the high bits of 'input'. Each entity in the table is 4-bytes, which means this implicitly gets turned into (once lowered out of a GEP): (unsigned)((char*)my_accelerator_table + ((input >> 11) << 2)); The shift right followed by a shift left is canonicalized to a smaller shift right and masking off the low bits. That hides the shift right which x86 has an addressing mode designed to support. We now detect masks of this form, and produce the longer shift right followed by the proper addressing mode. In addition to saving a (rather large) instruction, this also reduces stalls in Intel chips on benchmarks I've measured. In order for all of this to work, one part of the DAG needs to be canonicalized *still further* than it currently is. This involves removing pointless 'trunc' nodes between a zextload and a zext. Without that, we end up generating spurious masks and hiding the pattern. llvm-svn: 147936 2012-01-11 09:41:08 +01:00
			`entry:`
			`%i = load i16* %i.ptr`
			`%i.zext = zext i16 %i to i32`
			`%index = lshr i32 %i.zext, 11`
			`%index.zext = zext i32 %index to i64`
			`%val.ptr = getelementptr inbounds i32* %arr, i64 %index.zext`
			`%val = load i32* %val.ptr`
			`%sum.1 = add i32 %val, %i.zext`
			`%sum.2 = add i32 %sum.1, %index`
			`ret i32 %sum.2`
			`}`