llvm-mirror/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll

; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s

; VI-LABEL: {{^}}dpp_test:
; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
; VI: v_mov_b32_e32 v1, s{{[0-9]+}}
; VI-OPT: s_nop 1
; VI-NOOPT: s_nop 0
; VI-NOOPT: s_nop 0
; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1) #0
  store i32 %tmp0, i32 addrspace(1)* %out
  ret void
}

; VI-LABEL: {{^}}dpp_test1:
; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
; VI-NEXT: s_nop 0
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp
  %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp4 = add nsw i32 %tmp3, %tmp3
  %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
  %tmp6 = add nsw i32 %tmp5, %tmp4
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  store i32 %tmp6, i32* %tmp7, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }
[AMDGPU]: Turn on the DPP combiner by default Differential revision: https://reviews.llvm.org/D55314 llvm-svn: 348371 2018-12-05 16:21:17 +01:00			`; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefix=VI -check-prefix=VI-OPT %s`
			`; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-08 20:52:22 +02:00
			`; VI-LABEL: {{^}}dpp_test:`
			`; VI: v_mov_b32_e32 v0, s{{[0-9]+}}`
			`; VI: v_mov_b32_e32 v1, s{{[0-9]+}}`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 12:02:41 +02:00			`; VI-OPT: s_nop 1`
			`; VI-NOOPT: s_nop 0`
			`; VI-NOOPT: s_nop 0`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-08 20:52:22 +02:00			`; VI: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]`
			`define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {`
			`%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 1) #0`
			`store i32 %tmp0, i32 addrspace(1)* %out`
			`ret void`
			`}`

run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 12:02:41 +02:00			`; VI-LABEL: {{^}}dpp_test1:`
[AMDGPU] Divergence driven instruction selection. Part 1. Summary: This change is the first part of the AMDGPU target description change. The aim of it is the effective splitting the vector and scalar flows at the selection stage. Selection uses predicate functions based on the framework implemented earlier - https://reviews.llvm.org/D35267 Differential revision: https://reviews.llvm.org/D52019 Reviewers: rampitec llvm-svn: 342719 2018-09-21 12:31:22 +02:00			`; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 12:02:41 +02:00			`; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0`
			`; VI-NEXT: s_nop 0`
			`; VI-NEXT: s_nop 0`
Relax fast register allocator related test cases; NFC - Relex hard coded registers and stack frame sizes - Some test cleanups - Change phi-dbg.ll to match on mir output after phi elimination instead of going through the whole codegen pipeline. This is in preparation for https://reviews.llvm.org/D52010 I'm committing all the test changes upfront that work before and after independently. llvm-svn: 345532 2018-10-29 21:10:42 +01:00			`; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf`
run post-RA hazard recognizer pass late Memory legalizer, waitcnt, and shrink passes can perturb the instructions, which means that the post-RA hazard recognizer pass should run after them. Otherwise, one of those passes may invalidate the work done by the hazard recognizer. Note that this has adverse side-effect that any consecutive S_NOP 0's, emitted by the hazard recognizer, will not be shrunk into a single S_NOP <N>. This should be addressed in a follow-on patch. Differential Revision: https://reviews.llvm.org/D49288 llvm-svn: 337154 2018-07-16 12:02:41 +02:00			`@0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4`
			`define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {`
			`bb:`
			`%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()`
			`%tmp1 = zext i32 %tmp to i64`
			`%tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp`
			`%tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4`
			`fence syncscope("workgroup") release`
			`tail call void @llvm.amdgcn.s.barrier()`
			`fence syncscope("workgroup") acquire`
			`%tmp4 = add nsw i32 %tmp3, %tmp3`
			`%tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)`
			`%tmp6 = add nsw i32 %tmp5, %tmp4`
			`%tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp1`
			`store i32 %tmp6, i32* %tmp7, align 4`
			`ret void`
			`}`

			`declare i32 @llvm.amdgcn.workitem.id.x()`
			`declare void @llvm.amdgcn.s.barrier()`
[AMDGPU] Add llvm.amdgpu.update.dpp intrinsic Summary: Now that we've made all the necessary backend changes, we can add a new intrinsic which exposes the new capabilities to IR producers. Since llvm.amdgpu.update.dpp is a strict superset of llvm.amdgpu.mov.dpp, we should deprecate the former. We also add tests for all the functionality that was added in previous changes, now that we can access it via an IR construct. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34718 llvm-svn: 310399 2017-08-08 20:52:22 +02:00			`declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0`

			`attributes #0 = { nounwind readnone convergent }`