; llvm-mirror/test/CodeGen/AArch64/falkor-hwpf.ll

; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF

; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.

; CHECK-LABEL: @hwpf1(
; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2, align 4, !falkor.strided.access !0

; NOHWPF-LABEL: @hwpf1(
; NOHWPF: load i32, i32* %gep, align 4{{$}}
; NOHWPF: load i32, i32* %gep2, align 4{{$}}
; Single loop containing two loads from different base pointers (%p and %p2),
; both addressed by the same loop counter.
define void @hwpf1(i32* %p, i32* %p2) {
entry:
  br label %body

body:
  ; Counter starts at 0 and advances by 1 on every trip.
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %body ]

  ; Both addresses are the counter scaled into an i32 array.
  ; The names %gep and %gep2 are kept because the FileCheck patterns above
  ; match on them.
  %gep = getelementptr inbounds i32, i32* %p, i32 %idx
  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %idx

  ; Load order matters: the patterns expect the %gep load before the %gep2 load.
  %val = load i32, i32* %gep
  %val2 = load i32, i32* %gep2

  ; Leave the loop once the incremented counter reaches 1024.
  %idx.next = add i32 %idx, 1
  %done = icmp uge i32 %idx.next, 1024
  br i1 %done, label %exit, label %body

exit:
  ret void
}

; Check that a strided load in the outer loop isn't marked.
; CHECK-LABEL: @hwpf2(
; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2, align 4{{$}}

; NOHWPF-LABEL: @hwpf2(
; NOHWPF: load i32, i32* %gep, align 4{{$}}
; NOHWPF: load i32, i32* %gep2, align 4{{$}}
; Two-level loop nest over %p.
;   - loop2 (inner) loads through %gep, indexed by the inner counter %iv2.
;   - exit2 (outer loop only) loads through %gep2, indexed by the outer
;     counter %iv1, once per outer trip.
; The FileCheck patterns above expect only the inner-loop load to carry the
; strided-access metadata in the Falkor run.
define void @hwpf2(i32* %p) {
entry:
  br label %loop1

loop1:
  ; Outer counter, plus the running sum handed back from the inner loop's
  ; %sum phi via the latch.
  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
  %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
  br label %loop2.header

loop2.header:
  br label %loop2

loop2:
  ; Inner counter restarts at 0 for each outer trip; the sum continues from
  ; the value carried by the outer loop.
  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
  %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
  %gep = getelementptr inbounds i32, i32* %p, i32 %iv2
  %load = load i32, i32* %gep
  %sum.inc = add i32 %sum, %load
  %inc2 = add i32 %iv2, 1
  %exitcnd2 = icmp uge i32 %inc2, 1024
  br i1 %exitcnd2, label %exit2, label %loop2

exit2:
  ; This load sits outside loop2 and is addressed by the outer counter.
  %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1
  %load2 = load i32, i32* %gep2
  br label %loop1.latch

loop1.latch:
  %inc1 = add i32 %iv1, 1
  %exitcnd1 = icmp uge i32 %inc1, 1024
  ; Branch on the outer counter test. The inner test %exitcnd2 is always true
  ; on this path (exit2 is only reached when it holds), so using it here would
  ; end the outer loop after its first trip and leave %exitcnd1 unused.
  br i1 %exitcnd1, label %exit, label %loop1

exit:
  ret void
}


; Check that a non-strided load isn't marked.
; CHECK-LABEL: @hwpf3(
; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0
; CHECK: load i32, i32* %gep2, align 4{{$}}

; NOHWPF-LABEL: @hwpf3(
; NOHWPF: load i32, i32* %gep, align 4{{$}}
; NOHWPF: load i32, i32* %gep2, align 4{{$}}
; Single loop with two loads: the first is addressed by the loop counter, the
; second by the value the first load produced.
define void @hwpf3(i32* %p, i32* %p2) {
entry:
  br label %body

body:
  ; Counter starts at 0 and advances by 1 on every trip.
  %i = phi i32 [ 0, %entry ], [ %i.next, %body ]

  ; First access: %p indexed directly by the counter.
  ; The names %gep and %gep2 are kept because the FileCheck patterns above
  ; match on them.
  %gep = getelementptr inbounds i32, i32* %p, i32 %i
  %fetched = load i32, i32* %gep

  ; Second access: %p2 indexed by the loaded value, not by the counter.
  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %fetched
  %fetched2 = load i32, i32* %gep2

  ; Leave the loop once the incremented counter reaches 1024.
  %i.next = add i32 %i, 1
  %done = icmp uge i32 %i.next, 1024
  br i1 %done, label %exit, label %body

exit:
  ret void
}
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00			`; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor \| FileCheck %s`
			`; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 \| FileCheck %s --check-prefix=NOHWPF`

			`; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.`

			`; CHECK-LABEL: @hwpf1(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0`
			`; CHECK: load i32, i32* %gep2, align 4, !falkor.strided.access !0`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00
			`; NOHWPF-LABEL: @hwpf1(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; NOHWPF: load i32, i32* %gep, align 4{{$}}`
			`; NOHWPF: load i32, i32* %gep2, align 4{{$}}`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00			`define void @hwpf1(i32* %p, i32* %p2) {`
			`entry:`
			`br label %loop`

			`loop:`
			`%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]`

			`%gep = getelementptr inbounds i32, i32* %p, i32 %iv`
			`%load = load i32, i32* %gep`

			`%gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv`
			`%load2 = load i32, i32* %gep2`

			`%inc = add i32 %iv, 1`
			`%exitcnd = icmp uge i32 %inc, 1024`
			`br i1 %exitcnd, label %exit, label %loop`

			`exit:`
			`ret void`
			`}`

			`; Check that outer loop strided load isn't marked.`
			`; CHECK-LABEL: @hwpf2(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0`
			`; CHECK: load i32, i32* %gep2, align 4{{$}}`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00
			`; NOHWPF-LABEL: @hwpf2(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; NOHWPF: load i32, i32* %gep, align 4{{$}}`
			`; NOHWPF: load i32, i32* %gep2, align 4{{$}}`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00			`define void @hwpf2(i32* %p) {`
			`entry:`
			`br label %loop1`

			`loop1:`
			`%iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]`
			`%outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]`
			`br label %loop2.header`

			`loop2.header:`
			`br label %loop2`

			`loop2:`
			`%iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]`
			`%sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]`
			`%gep = getelementptr inbounds i32, i32* %p, i32 %iv2`
			`%load = load i32, i32* %gep`
			`%sum.inc = add i32 %sum, %load`
			`%inc2 = add i32 %iv2, 1`
			`%exitcnd2 = icmp uge i32 %inc2, 1024`
			`br i1 %exitcnd2, label %exit2, label %loop2`

			`exit2:`
			`%gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1`
			`%load2 = load i32, i32* %gep2`
			`br label %loop1.latch`

			`loop1.latch:`
			`%inc1 = add i32 %iv1, 1`
			`%exitcnd1 = icmp uge i32 %inc1, 1024`
			`br i1 %exitcnd2, label %exit, label %loop1`

			`exit:`
			`ret void`
			`}`


			`; Check that non-strided load isn't marked.`
			`; CHECK-LABEL: @hwpf3(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; CHECK: load i32, i32* %gep, align 4, !falkor.strided.access !0`
			`; CHECK: load i32, i32* %gep2, align 4{{$}}`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00
			`; NOHWPF-LABEL: @hwpf3(`
Infer alignment of unmarked loads in IR/bitcode parsing. For IR generated by a compiler, this is really simple: you just take the datalayout from the beginning of the file, and apply it to all the IR later in the file. For optimization testcases that don't care about the datalayout, this is also really simple: we just use the default datalayout. The complexity here comes from the fact that some LLVM tools allow overriding the datalayout: some tools have an explicit flag for this, some tools will infer a datalayout based on the code generation target. Supporting this properly required plumbing through a bunch of new machinery: we want to allow overriding the datalayout after the datalayout is parsed from the file, but before we use any information from it. Therefore, IR/bitcode parsing now has a callback to allow tools to compute the datalayout at the appropriate time. Not sure if I covered all the LLVM tools that want to use the callback. (clang? lli? Misc IR manipulation tools like llvm-link?). But this is at least enough for all the LLVM regression tests, and IR without a datalayout is not something frontends should generate. This change had some sort of weird effects for certain CodeGen regression tests: if the datalayout is overridden with a datalayout with a different program or stack address space, we now parse IR based on the overridden datalayout, instead of the one written in the file (or the default one, if none is specified). This broke a few AVR tests, and one AMDGPU test. Outside the CodeGen tests I mentioned, the test changes are all just fixing CHECK lines and moving around datalayout lines in weird places. Differential Revision: https://reviews.llvm.org/D78403 2020-05-14 21:59:45 +02:00			`; NOHWPF: load i32, i32* %gep, align 4{{$}}`
			`; NOHWPF: load i32, i32* %gep2, align 4{{$}}`
[AArch64][Falkor] Avoid HW prefetcher tag collisions (step 1) Summary: This patch is the first step in reducing HW prefetcher instruction tag collisions in inner loops for Falkor. It adds a pass that annotates IR loads with metadata to indicate that they are known to be strided loads, and adds a target lowering hook that translates this metadata to a target-specific MachineMemOperand flag. A follow on change will use this MachineMemOperand flag to re-write instructions to reduce tag collisions. Reviewers: mcrosier, t.p.northover Subscribers: aemerson, rengolin, mgorny, javed.absar, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D34963 llvm-svn: 308059 2017-07-14 23:44:12 +02:00			`define void @hwpf3(i32* %p, i32* %p2) {`
			`entry:`
			`br label %loop`

			`loop:`
			`%iv = phi i32 [ 0, %entry ], [ %inc, %loop ]`

			`%gep = getelementptr inbounds i32, i32* %p, i32 %iv`
			`%load = load i32, i32* %gep`

			`%gep2 = getelementptr inbounds i32, i32* %p2, i32 %load`
			`%load2 = load i32, i32* %gep2`

			`%inc = add i32 %iv, 1`
			`%exitcnd = icmp uge i32 %inc, 1024`
			`br i1 %exitcnd, label %exit, label %loop`

			`exit:`
			`ret void`
			`}`