llvm-mirror/test/CodeGen/PowerPC/bdzlr.ll

; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-crbits | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-CRB
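; The first RUN line disables CR-bit tracking of i1 values (-mattr=-crbits) and is
; checked with the default prefix; the second keeps the pwr7 default (CR bits
; enabled) and is checked with the CHECK-CRB prefix.
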
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
%struct.lua_TValue.17.692 = type { %union.Value.16.691, i32 }
%union.Value.16.691 = type { %union.GCObject.15.690* }
%union.GCObject.15.690 = type { %struct.lua_State.14.689 }
%struct.lua_State.14.689 = type { %union.GCObject.15.690*, i8, i8, i8, %struct.lua_TValue.17.692*, %struct.lua_TValue.17.692*, %struct.global_State.10.685*, %struct.CallInfo.11.686*, i32*, %struct.lua_TValue.17.692*, %struct.lua_TValue.17.692*, %struct.CallInfo.11.686*, %struct.CallInfo.11.686*, i32, i32, i16, i16, i8, i8, i32, i32, void (%struct.lua_State.14.689*, %struct.lua_Debug.12.687*)*, %struct.lua_TValue.17.692, %struct.lua_TValue.17.692, %union.GCObject.15.690*, %union.GCObject.15.690*, %struct.lua_longjmp.13.688*, i64 }
%struct.global_State.10.685 = type { %struct.stringtable.0.675, i8* (i8*, i8*, i64, i64)*, i8*, i8, i8, i32, %union.GCObject.15.690*, %union.GCObject.15.690**, %union.GCObject.15.690*, %union.GCObject.15.690*, %union.GCObject.15.690*, %union.GCObject.15.690*, %struct.Mbuffer.1.676, i64, i64, i64, i64, i32, i32, i32 (%struct.lua_State.14.689*)*, %struct.lua_TValue.17.692, %struct.lua_State.14.689*, %struct.UpVal.3.678, [9 x %struct.Table.7.682*], [17 x %union.TString.9.684*] }
%struct.stringtable.0.675 = type { %union.GCObject.15.690**, i32, i32 }
%struct.Mbuffer.1.676 = type { i8*, i64, i64 }
%struct.UpVal.3.678 = type { %union.GCObject.15.690*, i8, i8, %struct.lua_TValue.17.692*, %union.anon.2.677 }
%union.anon.2.677 = type { %struct.lua_TValue.17.692 }
%struct.Table.7.682 = type { %union.GCObject.15.690*, i8, i8, i8, i8, %struct.Table.7.682*, %struct.lua_TValue.17.692*, %struct.Node.6.681*, %struct.Node.6.681*, %union.GCObject.15.690*, i32 }
%struct.Node.6.681 = type { %struct.lua_TValue.17.692, %union.TKey.5.680 }
%union.TKey.5.680 = type { %struct.anon.0.4.679 }
%struct.anon.0.4.679 = type { %union.Value.16.691, i32, %struct.Node.6.681* }
%union.TString.9.684 = type { %struct.anon.1.8.683 }
%struct.anon.1.8.683 = type { %union.GCObject.15.690*, i8, i8, i8, i32, i64 }
%struct.CallInfo.11.686 = type { %struct.lua_TValue.17.692*, %struct.lua_TValue.17.692*, %struct.lua_TValue.17.692*, i32*, i32, i32 }
%struct.lua_Debug.12.687 = type { i32, i8*, i8*, i8*, i8*, i32, i32, i32, i32, [60 x i8], i32 }
%struct.lua_longjmp.13.688 = type opaque

define void @lua_xmove(i32 signext %n) #0 {
entry:
  br i1 undef, label %for.end, label %if.end

if.end:                                           ; preds = %entry
  br i1 undef, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %if.end
  br label %for.body

for.body:                                         ; preds = %for.body.for.body_crit_edge, %for.body.lr.ph
  %0 = phi %struct.lua_TValue.17.692* [ undef, %for.body.lr.ph ], [ %.pre, %for.body.for.body_crit_edge ]
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
  %tt = getelementptr inbounds %struct.lua_TValue.17.692, %struct.lua_TValue.17.692* %0, i64 %indvars.iv, i32 1
  %1 = load i32, i32* %tt, align 4
  store i32 %1, i32* undef, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge

for.body.for.body_crit_edge:                      ; preds = %for.body
  %.pre = load %struct.lua_TValue.17.692*, %struct.lua_TValue.17.692** undef, align 8
  br label %for.body

for.end:                                          ; preds = %for.body, %if.end, %entry
  ret void
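
; The two guard branches in %entry and %if.end should be lowered to conditional
; returns, and the loop should become a CTR-based loop whose final exit is a
; decrement-and-return (bdzlr) with no plain blr emitted after it.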
; CHECK: @lua_xmove
; CHECK: bnelr
; CHECK: bnelr
; CHECK: bdzlr
; CHECK-NOT: blr
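
; With CR-bit tracking enabled, the same conditional returns test an individual
; CR bit directly ("bclr 12," branches to LR when the named bit is set), while
; the loop exit is still expected to be bdzlr.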
; CHECK-CRB: @lua_xmove
; CHECK-CRB: bclr 12,
; CHECK-CRB: bclr 12,
; CHECK-CRB: bdzlr
; CHECK-CRB-NOT: blr
}

attributes #0 = { nounwind }