llvm-mirror/test/CodeGen/R600/operand-folding.ll

; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s

; The add of %fold and the value returned by @llvm.r600.read.tidig.x is
; expected to be emitted as a v_add_i32_e32 whose first source operand starts
; with "s" (an SGPR, going by the name of this test).
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, s
define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
entry:
  ; Only take the storing path when %fold is non-zero.
  %nonzero = icmp ne i32 %fold, 0
  br i1 %nonzero, label %do_store, label %done

do_store:
  ; Index %out by (%fold + id) and write a zero to that element.
  %tid = call i32 @llvm.r600.read.tidig.x()
  %index = add i32 %fold, %tid
  %slot = getelementptr i32, i32 addrspace(1)* %out, i32 %index
  store i32 0, i32 addrspace(1)* %slot
  br label %done

done:
  ret void
}

; %fold is the constant expression 3 + 2. The OR in the %if block is expected
; to take the resulting value 5 directly as its first source operand.
; CHECK-LABEL: {{^}}fold_imm:
; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
entry:
  %fold = add i32 3, 2
  ; The OR is only executed when %cmp is non-zero.
  %tmp0 = icmp ne i32 %cmp, 0
  br i1 %tmp0, label %if, label %endif

if:
  ; OR the value returned by the intrinsic with the constant and store it.
  %id = call i32 @llvm.r600.read.tidig.x()
  %val = or i32 %id, %fold
  store i32 %val, i32 addrspace(1)* %out
  br label %endif

endif:
  ret void
}

; Adds the constant 1 to a 64-bit argument and stores the result. No
; s_mov_b64 is expected ahead of the scalar add/add-with-carry pair, whose
; second source operands are 1 and 0; both result halves are then copied to
; VGPRs that feed the 64-bit store.
; FIXME: It would be better if we could use v_add here and drop the extra
; v_mov_b32 instructions.
; CHECK-LABEL: {{^}}fold_64bit_constant_add:
; CHECK-NOT: s_mov_b64
; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},

define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
entry:
  ; %cmp is not referenced anywhere in this body.
  %sum = add i64 %val, 1
  store i64 %sum, i64 addrspace(1)* %out
  ret void
}

; Inline constants should always be folded: each of the four xors produced
; for the <4 x i32> operation is expected to carry the literal 5 as its first
; source operand.

; CHECK-LABEL: {{^}}vector_inline:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}

define void @vector_inline(<4 x i32> addrspace(1)* %out) {
entry:
  ; Build the vector <id, id+1, id+2, id+3> from the intrinsic's result.
  %tid = call i32 @llvm.r600.read.tidig.x()
  %lane1 = add i32 %tid, 1
  %lane2 = add i32 %tid, 2
  %lane3 = add i32 %tid, 3
  %v0 = insertelement <4 x i32> undef, i32 %tid, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %lane1, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %lane2, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %lane3, i32 3
  ; The same constant 5 is xor-ed into every lane.
  %result = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %v3
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; Immediates with one use should be folded: the constant 100 (0x64) has a
; single user here and is expected to appear as a literal operand of the xor.
; CHECK-LABEL: {{^}}imm_one_use:
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}

define void @imm_one_use(i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.r600.read.tidig.x()
  ; Single use of the constant 100.
  %flipped = xor i32 %tid, 100
  store i32 %flipped, i32 addrspace(1)* %out
  ret void
}
; The constant 100 (0x64) is used by all four lanes of the vector xor. It is
; expected to be materialized once into an SGPR with s_movk_i32, and that
; register is then the first source operand of every xor.
; CHECK-LABEL: {{^}}vector_imm:
; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}
; CHECK: v_xor_b32_e32 v{{[0-9]+}}, [[IMM]], v{{[0-9]+}}

define void @vector_imm(<4 x i32> addrspace(1)* %out) {
entry:
  ; Build the vector <id, id+1, id+2, id+3> from the intrinsic's result.
  %tmp0 = call i32 @llvm.r600.read.tidig.x()
  %tmp1 = add i32 %tmp0, 1
  %tmp2 = add i32 %tmp0, 2
  %tmp3 = add i32 %tmp0, 3
  %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3
  ; The same constant 100 is xor-ed into every lane (four uses in total).
  %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3
  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out
  ret void
}

; Declaration of the intrinsic called by the test functions in this file; it
; takes no arguments and returns an i32. Attribute group #0 marks it readnone.
; NOTE(review): presumably yields the work-item id in the x dimension -- confirm.
declare i32 @llvm.r600.read.tidig.x() #0
attributes #0 = { readnone }
R600/SI: Add a stub GCNTargetMachine This is equivalent to the AMDGPUTargetMachine now, but it is the starting point for separating R600 and GCN functionality into separate targets. It is recommended that users start using the gcn triple for GCN-based GPUs, because using the r600 triple for these GPUs will be deprecated in the future. llvm-svn: 225277 2015-01-06 19:00:21 +01:00			`; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs \| FileCheck %s`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-21 23:06:37 +01:00
			`; CHECK-LABEL: {{^}}fold_sgpr:`
			`; CHECK: v_add_i32_e32 v{{[0-9]+}}, s`
			`define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {`
			`entry:`
			`%tmp0 = icmp ne i32 %fold, 0`
			`br i1 %tmp0, label %if, label %endif`

			`if:`
			`%id = call i32 @llvm.r600.read.tidig.x()`
			`%offset = add i32 %fold, %id`
[opaque pointer type] Add textual IR support for explicit type parameter to getelementptr instruction One of several parallel first steps to remove the target type of pointers, replacing them with a single opaque pointer type. This adds an explicit type parameter to the gep instruction so that when the first parameter becomes an opaque pointer type, the type to gep through is still available to the instructions. * This doesn't modify gep operators, only instructions (operators will be handled separately) * Textual IR changes only. Bitcode (including upgrade) and changing the in-memory representation will be in separate changes. * geps of vectors are transformed as: getelementptr <4 x float> %x, ... ->getelementptr float, <4 x float> %x, ... Then, once the opaque pointer type is introduced, this will ultimately look like: getelementptr float, <4 x ptr> %x with the unambiguous interpretation that it is a vector of pointers to float. * address spaces remain on the pointer, not the type: getelementptr float addrspace(1)* %x ->getelementptr float, float addrspace(1)* %x Then, eventually: getelementptr float, ptr addrspace(1) %x Importantly, the massive amount of test case churn has been automated by same crappy python code. I had to manually update a few test cases that wouldn't fit the script's model (r228970,r229196,r229197,r229198). The python script just massages stdin and writes the result to stdout, I then wrapped that in a shell script to handle replacing files, then using the usual find+xargs to migrate all the files. 
update.py: import fileinput import sys import re ibrep = re.compile(r"(^.?[^%\w]getelementptr inbounds )(((?:<\d x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") normrep = re.compile( r"(^.?[^%\w]getelementptr )(((?:<\d* x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") def conv(match, line): if not match: return line line = match.groups()[0] if len(match.groups()[5]) == 0: line += match.groups()[2] line += match.groups()[3] line += ", " line += match.groups()[1] line += "\n" return line for line in sys.stdin: if line.find("getelementptr ") == line.find("getelementptr inbounds"): if line.find("getelementptr inbounds") != line.find("getelementptr inbounds ("): line = conv(re.match(ibrep, line), line) elif line.find("getelementptr ") != line.find("getelementptr ("): line = conv(re.match(normrep, line), line) sys.stdout.write(line) apply.sh: for name in "$@" do python3 `dirname "$0"`/update.py < "$name" > "$name.tmp" && mv "$name.tmp" "$name" rm -f "$name.tmp" done The actual commands: From llvm/src: find test/ -name .ll \| xargs ./apply.sh From llvm/src/tools/clang: find test/ -name .mm -o -name .m -o -name .cpp -o -name .c \| xargs -I '{}' ../../apply.sh "{}" From llvm/src/tools/polly: find test/ -name *.ll \| xargs ./apply.sh After that, check-all (with llvm, clang, clang-tools-extra, lld, compiler-rt, and polly all checked out). The extra 'rm' in the apply.sh script is due to a few files in clang's test suite using interesting unicode stuff that my python script was throwing exceptions on. None of those files needed to be migrated, so it seemed sufficient to ignore those cases. 
Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7636 llvm-svn: 230786 2015-02-27 20:29:02 +01:00			`%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset`
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-21 23:06:37 +01:00			`store i32 0, i32 addrspace(1)* %tmp1`
			`br label %endif`

			`endif:`
			`ret void`
			`}`

			`; CHECK-LABEL: {{^}}fold_imm:`
			`; CHECK v_or_i32_e32 v{{[0-9]+}}, 5`
			`define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {`
			`entry:`
			`%fold = add i32 3, 2`
			`%tmp0 = icmp ne i32 %cmp, 0`
			`br i1 %tmp0, label %if, label %endif`

			`if:`
			`%id = call i32 @llvm.r600.read.tidig.x()`
			`%val = or i32 %id, %fold`
			`store i32 %val, i32 addrspace(1)* %out`
			`br label %endif`

			`endif:`
			`ret void`
			`}`

R600/SI: Teach SIFoldOperands to split 64-bit constants when folding This allows folding of sequences like: s[0:1] = s_mov_b64 4 v_add_i32 v0, s0, v0 v_addc_u32 v1, s1, v1 into v_add_i32 v0, 4, v0 v_add_i32 v1, 0, v1 llvm-svn: 225369 2015-01-07 20:56:17 +01:00			`; CHECK-LABEL: {{^}}fold_64bit_constant_add:`
			`; CHECK-NOT: s_mov_b64`
			`; FIXME: It would be better if we could use v_add here and drop the extra`
			`; v_mov_b32 instructions.`
			`; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1`
			`; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0`
			`; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]`
			`; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]`
			`; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},`

			`define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {`
			`entry:`
			`%tmp0 = add i64 %val, 1`
			`store i64 %tmp0, i64 addrspace(1)* %out`
			`ret void`
			`}`

R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-07 23:18:27 +01:00			`; Inline constants should always be folded.`

			`; CHECK-LABEL: {{^}}vector_inline:`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}`

			`define void @vector_inline(<4 x i32> addrspace(1)* %out) {`
			`entry:`
			`%tmp0 = call i32 @llvm.r600.read.tidig.x()`
			`%tmp1 = add i32 %tmp0, 1`
			`%tmp2 = add i32 %tmp0, 2`
			`%tmp3 = add i32 %tmp0, 3`
			`%vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0`
			`%vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1`
			`%vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2`
			`%vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3`
			`%tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3`
			`store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out`
			`ret void`
			`}`

			`; Immediates with one use should be folded`
			`; CHECK-LABEL: {{^}}imm_one_use:`
			`; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}`

			`define void @imm_one_use(i32 addrspace(1)* %out) {`
			`entry:`
			`%tmp0 = call i32 @llvm.r600.read.tidig.x()`
			`%tmp1 = xor i32 %tmp0, 100`
			`store i32 %tmp1, i32 addrspace(1)* %out`
			`ret void`
			`}`
R600/SI: Remove SIISelLowering::legalizeOperands() Its functionality has been replaced by calling SIInstrInfo::legalizeOperands() from SIISelLowering::AdjustInstrPostInstrSelection() and running the SIFoldOperands and SIShrinkInstructions passes. llvm-svn: 225445 2015-01-08 16:08:17 +01:00			`; CHECK-LABEL: {{^}}vector_imm:`
			`; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`
			`; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}`

			`define void @vector_imm(<4 x i32> addrspace(1)* %out) {`
			`entry:`
			`%tmp0 = call i32 @llvm.r600.read.tidig.x()`
			`%tmp1 = add i32 %tmp0, 1`
			`%tmp2 = add i32 %tmp0, 2`
			`%tmp3 = add i32 %tmp0, 3`
			`%vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0`
			`%vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1`
			`%vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2`
			`%vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3`
			`%tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3`
			`store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out`
			`ret void`
			`}`
R600/SI: Only fold immediates that have one use Folding the same immediate into multiple instruction will increase program size, which can hurt performance. llvm-svn: 225405 2015-01-07 23:18:27 +01:00
R600/SI: Add SIFoldOperands pass This pass attempts to fold the source operands of mov and copy instructions into their uses. llvm-svn: 222581 2014-11-21 23:06:37 +01:00			`declare i32 @llvm.r600.read.tidig.x() #0`
			`attributes #0 = { readnone }`