mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:43:36 +01:00
a7f581c4d0
Currently undef is used as a don’t-care vector when constructing a vector using a series of insertelement. However, this is problematic because undef isn’t undefined enough. Especially, a sequence of insertelement can be optimized to shufflevector, but using undef as its placeholder makes shufflevector a poison-blocking instruction because undef cannot be optimized to poison. This makes a few straightforward optimizations incorrect, such as: ``` ; https://bugs.llvm.org/show_bug.cgi?id=44185 define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) { %xv = insertelement <4 x float> %q, float %x, i32 2 %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> { 0, 6, 2, undef } ret <4 x float> %r ; %r[3] is undef } => define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) { %r = insertelement <4 x float> %y, float %x, i32 1 ret <4 x float> %r ; %r[3] = %y[3], incorrect if %y[3] = poison } Transformation doesn't verify! ERROR: Target is more poisonous than source ``` I’d like to suggest 1. Using poison as insertelement’s placeholder value (IRBuilder::CreateVectorSplat should be patched too) 2. Updating shufflevector’s semantics to return poison element if mask is undef Note that poison is currently lowered into UNDEF in SelDag, so codegen part is okay. m_Undef() matches PoisonValue as well, so existing optimizations will still fire. The only concern is hidden miscompilations that will go incorrect when poison constant is given. A conservative way is copying all tests having `insertelement undef` & replacing it with `insertelement poison` & run Alive2 on it, but it will create many tests and people won’t like it. :( Instead, I’ll simply locally maintain the tests and run Alive2. If there is any bug found, I’ll report it. Relevant links: https://bugs.llvm.org/show_bug.cgi?id=43958 , http://lists.llvm.org/pipermail/llvm-dev/2019-November/137242.html Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D93586
466 lines
19 KiB
LLVM
466 lines
19 KiB
LLVM
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
|
|
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
|
|
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
|
|
|
|
target datalayout = "A5"
|
|
|
|
; OPT-LABEL: @vector_read_alloca_bitcast(
|
|
; OPT-NOT: alloca
|
|
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
|
; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4
|
|
|
|
; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
|
|
; GCN-ALLOCA-COUNT-4: buffer_store_dword
|
|
; GCN-ALLOCA: buffer_load_dword
|
|
|
|
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
|
|
; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
|
|
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
|
|
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
|
|
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) {
|
|
entry:
|
|
%tmp = alloca [4 x i32], addrspace(5)
|
|
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
|
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
|
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
|
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
|
store i32 0, i32 addrspace(5)* %x
|
|
store i32 1, i32 addrspace(5)* %y
|
|
store i32 2, i32 addrspace(5)* %z
|
|
store i32 3, i32 addrspace(5)* %w
|
|
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
|
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
|
store i32 %tmp2, i32 addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_write_alloca_bitcast(
|
|
; OPT-NOT: alloca
|
|
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
|
|
; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index
|
|
; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align
|
|
|
|
; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
|
|
; GCN-ALLOCA-COUNT-5: buffer_store_dword
|
|
; GCN-ALLOCA: buffer_load_dword
|
|
|
|
; GCN-PROMOTE-COUNT-7: v_cndmask
|
|
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
|
entry:
|
|
%tmp = alloca [4 x i32], addrspace(5)
|
|
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
|
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
|
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
|
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
|
store i32 0, i32 addrspace(5)* %x
|
|
store i32 0, i32 addrspace(5)* %y
|
|
store i32 0, i32 addrspace(5)* %z
|
|
store i32 0, i32 addrspace(5)* %w
|
|
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
|
|
store i32 1, i32 addrspace(5)* %tmp1
|
|
%tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
|
|
%tmp3 = load i32, i32 addrspace(5)* %tmp2
|
|
store i32 %tmp3, i32 addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_write_read_bitcast_to_float(
|
|
; OPT-NOT: alloca
|
|
; OPT: bb2:
|
|
; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
|
|
; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10
|
|
; OPT: .preheader:
|
|
; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
|
|
; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
|
|
|
|
; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
|
|
; GCN-ALLOCA: buffer_store_dword
|
|
|
|
; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
|
|
; GCN-PROMOTE-COUNT-6: v_cndmask
|
|
|
|
; GCN: s_cbranch
|
|
|
|
; GCN-ALLOCA: buffer_load_dword
|
|
|
|
; GCN-PROMOTE: v_cmp_eq_u16
|
|
; GCN-PROMOTE: v_cndmask
|
|
; GCN-PROMOTE: v_cmp_eq_u16
|
|
; GCN-PROMOTE: v_cndmask
|
|
; GCN-PROMOTE: v_cmp_eq_u16
|
|
; GCN-PROMOTE: v_cndmask
|
|
; GCN-PROMOTE: v_cmp_eq_u16
|
|
; GCN-PROMOTE: v_cndmask
|
|
; GCN-PROMOTE: v_cmp_eq_u16
|
|
; GCN-PROMOTE: v_cndmask
|
|
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) {
|
|
bb:
|
|
%tmp = alloca [6 x float], align 4, addrspace(5)
|
|
%tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)*
|
|
call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
|
|
br label %bb2
|
|
|
|
bb2: ; preds = %bb2, %bb
|
|
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
|
|
%tmp4 = zext i32 %tmp3 to i64
|
|
%tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4
|
|
%tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)*
|
|
%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
|
|
%tmp8 = trunc i32 %tmp3 to i16
|
|
%tmp9 = urem i16 %tmp8, 6
|
|
%tmp10 = zext i16 %tmp9 to i32
|
|
%tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10
|
|
%tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)*
|
|
store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4
|
|
%tmp13 = add nuw nsw i32 %tmp3, 1
|
|
%tmp14 = icmp eq i32 %tmp13, 1000
|
|
br i1 %tmp14, label %.preheader, label %bb2
|
|
|
|
bb15: ; preds = %.preheader
|
|
call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
|
|
ret void
|
|
|
|
.preheader: ; preds = %.preheader, %bb2
|
|
%tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
|
|
%tmp17 = trunc i32 %tmp16 to i16
|
|
%tmp18 = urem i16 %tmp17, 6
|
|
%tmp19 = sub nuw nsw i16 5, %tmp18
|
|
%tmp20 = zext i16 %tmp19 to i32
|
|
%tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20
|
|
%tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)*
|
|
%tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4
|
|
%tmp24 = zext i32 %tmp16 to i64
|
|
%tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24
|
|
%tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)*
|
|
store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4
|
|
%tmp27 = add nuw nsw i32 %tmp16, 1
|
|
%tmp28 = icmp eq i32 %tmp27, 1000
|
|
br i1 %tmp28, label %bb15, label %.preheader
|
|
}
|
|
|
|
; OPT-LABEL: @vector_write_read_bitcast_to_double(
|
|
; OPT-NOT: alloca
|
|
; OPT: bb2:
|
|
; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
|
|
; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10
|
|
; OPT: .preheader:
|
|
; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
|
|
; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
|
|
|
|
; GCN-LABEL: {{^}}vector_write_read_bitcast_to_double:
|
|
|
|
; GCN-ALLOCA-COUNT-2: buffer_store_dword
|
|
; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32
|
|
|
|
; GCN: s_cbranch
|
|
|
|
; GCN-ALLOCA-COUNT-2: buffer_load_dword
|
|
; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32
|
|
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) {
|
|
bb:
|
|
%tmp = alloca [6 x double], align 8, addrspace(5)
|
|
%tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)*
|
|
call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
|
br label %bb2
|
|
|
|
bb2: ; preds = %bb2, %bb
|
|
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
|
|
%tmp4 = zext i32 %tmp3 to i64
|
|
%tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4
|
|
%tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)*
|
|
%tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8
|
|
%tmp8 = trunc i32 %tmp3 to i16
|
|
%tmp9 = urem i16 %tmp8, 6
|
|
%tmp10 = zext i16 %tmp9 to i32
|
|
%tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10
|
|
%tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)*
|
|
store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8
|
|
%tmp13 = add nuw nsw i32 %tmp3, 1
|
|
%tmp14 = icmp eq i32 %tmp13, 1000
|
|
br i1 %tmp14, label %.preheader, label %bb2
|
|
|
|
bb15: ; preds = %.preheader
|
|
call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
|
ret void
|
|
|
|
.preheader: ; preds = %.preheader, %bb2
|
|
%tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ]
|
|
%tmp17 = trunc i32 %tmp16 to i16
|
|
%tmp18 = urem i16 %tmp17, 6
|
|
%tmp19 = sub nuw nsw i16 5, %tmp18
|
|
%tmp20 = zext i16 %tmp19 to i32
|
|
%tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20
|
|
%tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)*
|
|
%tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8
|
|
%tmp24 = zext i32 %tmp16 to i64
|
|
%tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24
|
|
%tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)*
|
|
store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8
|
|
%tmp27 = add nuw nsw i32 %tmp16, 1
|
|
%tmp28 = icmp eq i32 %tmp27, 1000
|
|
br i1 %tmp28, label %bb15, label %.preheader
|
|
}
|
|
|
|
; OPT-LABEL: @vector_write_read_bitcast_to_i64(
|
|
; OPT-NOT: alloca
|
|
; OPT: bb2:
|
|
; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
|
|
; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
|
|
; OPT: .preheader:
|
|
; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18
|
|
|
|
; GCN-LABEL: {{^}}vector_write_read_bitcast_to_i64:
|
|
|
|
; GCN-ALLOCA-COUNT-2: buffer_store_dword
|
|
; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32
|
|
|
|
; GCN: s_cbranch
|
|
|
|
; GCN-ALLOCA-COUNT-2: buffer_load_dword
|
|
; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32
|
|
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) {
|
|
bb:
|
|
%tmp = alloca [6 x i64], align 8, addrspace(5)
|
|
%tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)*
|
|
call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
|
br label %bb2
|
|
|
|
bb2: ; preds = %bb2, %bb
|
|
%tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
|
|
%tmp4 = zext i32 %tmp3 to i64
|
|
%tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4
|
|
%tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8
|
|
%tmp7 = trunc i32 %tmp3 to i16
|
|
%tmp8 = urem i16 %tmp7, 6
|
|
%tmp9 = zext i16 %tmp8 to i32
|
|
%tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9
|
|
store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8
|
|
%tmp11 = add nuw nsw i32 %tmp3, 1
|
|
%tmp12 = icmp eq i32 %tmp11, 1000
|
|
br i1 %tmp12, label %.preheader, label %bb2
|
|
|
|
bb13: ; preds = %.preheader
|
|
call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
|
ret void
|
|
|
|
.preheader: ; preds = %.preheader, %bb2
|
|
%tmp14 = phi i32 [ %tmp23, %.preheader ], [ 0, %bb2 ]
|
|
%tmp15 = trunc i32 %tmp14 to i16
|
|
%tmp16 = urem i16 %tmp15, 6
|
|
%tmp17 = sub nuw nsw i16 5, %tmp16
|
|
%tmp18 = zext i16 %tmp17 to i32
|
|
%tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18
|
|
%tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8
|
|
%tmp21 = zext i32 %tmp14 to i64
|
|
%tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21
|
|
store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8
|
|
%tmp23 = add nuw nsw i32 %tmp14, 1
|
|
%tmp24 = icmp eq i32 %tmp23, 1000
|
|
br i1 %tmp24, label %bb13, label %.preheader
|
|
}
|
|
|
|
; TODO: llvm.assume can be ingored
|
|
|
|
; OPT-LABEL: @vector_read_alloca_bitcast_assume(
|
|
; OPT: %tmp = alloca <4 x i32>, align 16, addrspace(5)
|
|
; OPT-NEXT: %x = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %tmp, i64 0, i64 0
|
|
; OPT-NEXT: store i32 0, i32 addrspace(5)* %x, align 16
|
|
; OPT-NEXT: %0 = load <4 x i32>, <4 x i32> addrspace(5)* %tmp, align 16
|
|
; OPT-NEXT: %1 = shufflevector <4 x i32> %0, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
|
|
; OPT-NEXT: store <4 x i32> %1, <4 x i32> addrspace(5)* %tmp, align 16
|
|
; OPT-NEXT: %2 = extractelement <4 x i32> %1, i32 %index
|
|
; OPT-NEXT: store i32 %2, i32 addrspace(1)* %out, align 4
|
|
|
|
; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
|
|
; GCN-COUNT-4: buffer_store_dword
|
|
|
|
define amdgpu_kernel void @vector_read_alloca_bitcast_assume(i32 addrspace(1)* %out, i32 %index) {
|
|
entry:
|
|
%tmp = alloca [4 x i32], addrspace(5)
|
|
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
|
%cmp = icmp ne i32 addrspace(5)* %x, null
|
|
call void @llvm.assume(i1 %cmp)
|
|
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
|
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
|
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
|
store i32 0, i32 addrspace(5)* %x
|
|
store i32 1, i32 addrspace(5)* %y
|
|
store i32 2, i32 addrspace(5)* %z
|
|
store i32 3, i32 addrspace(5)* %w
|
|
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
|
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
|
store i32 %tmp2, i32 addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_read_alloca_multiuse(
|
|
; OPT-NOT: alloca
|
|
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
|
; OPT-NEXT: %add2 = add nuw nsw i32 %0, 1
|
|
; OPT-NEXT: store i32 %add2, i32 addrspace(1)* %out, align 4
|
|
|
|
; GCN-LABEL: {{^}}vector_read_alloca_multiuse:
|
|
; GCN-ALLOCA-COUNT-4: buffer_store_dword
|
|
; GCN-ALLOCA: buffer_load_dword
|
|
|
|
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
|
|
; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
|
|
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
|
|
; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc
|
|
; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
|
|
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
|
|
|
|
; GCN-PROMOTE: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_read_alloca_multiuse(i32 addrspace(1)* %out, i32 %index) {
|
|
entry:
|
|
%tmp = alloca [4 x i32], addrspace(5)
|
|
%b = bitcast [4 x i32] addrspace(5)* %tmp to float addrspace(5)*
|
|
%x = bitcast float addrspace(5)* %b to i32 addrspace(5)*
|
|
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
|
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
|
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
|
store i32 0, i32 addrspace(5)* %x
|
|
store i32 1, i32 addrspace(5)* %y
|
|
store i32 2, i32 addrspace(5)* %z
|
|
store i32 3, i32 addrspace(5)* %w
|
|
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
|
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
|
%tmp3 = load i32, i32 addrspace(5)* %x
|
|
%tmp4 = load i32, i32 addrspace(5)* %y
|
|
%add1 = add i32 %tmp2, %tmp3
|
|
%add2 = add i32 %add1, %tmp4
|
|
store i32 %add2, i32 addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @bitcast_vector_to_vector(
|
|
; OPT-NOT: alloca
|
|
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
|
|
|
|
; GCN-LABEL: {{^}}bitcast_vector_to_vector:
|
|
; GCN: v_mov_b32_e32 v0, 1
|
|
; GCN: v_mov_b32_e32 v1, 2
|
|
; GCN: v_mov_b32_e32 v2, 3
|
|
; GCN: v_mov_b32_e32 v3, 4
|
|
|
|
; GCN: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
|
|
.entry:
|
|
%alloca = alloca <4 x float>, align 16, addrspace(5)
|
|
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
|
|
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
|
|
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
|
|
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_bitcast_from_alloca_array(
|
|
; OPT-NOT: alloca
|
|
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
|
|
|
|
; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
|
|
; GCN: v_mov_b32_e32 v0, 1
|
|
; GCN: v_mov_b32_e32 v1, 2
|
|
; GCN: v_mov_b32_e32 v2, 3
|
|
; GCN: v_mov_b32_e32 v3, 4
|
|
|
|
; GCN: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
|
|
.entry:
|
|
%alloca = alloca [4 x float], align 16, addrspace(5)
|
|
%cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
|
|
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
|
|
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
|
|
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
|
|
; OPT-NOT: alloca
|
|
; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
|
|
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
|
|
; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
|
|
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
|
|
; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
|
|
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
|
|
; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
|
|
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
|
|
|
|
; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
|
|
; GCN: v_mov_b32_e32 v0, 1
|
|
; GCN: v_mov_b32_e32 v1, 2
|
|
; GCN: v_mov_b32_e32 v2, 3
|
|
; GCN: v_mov_b32_e32 v3, 4
|
|
|
|
; GCN: ScratchSize: 0
|
|
|
|
define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) {
|
|
.entry:
|
|
%alloca = alloca [4 x float], align 16, addrspace(5)
|
|
%cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
|
|
store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
|
|
%load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
|
|
store [4 x i32] %load, [4 x i32] addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
|
|
; OPT-NOT: alloca
|
|
; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
|
|
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
|
|
; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
|
|
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
|
|
; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
|
|
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
|
|
; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
|
|
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
|
|
|
|
; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
|
|
; GCN: v_mov_b32_e32 v0, 1
|
|
; GCN: v_mov_b32_e32 v1, 2
|
|
; GCN: v_mov_b32_e32 v2, 3
|
|
; GCN: v_mov_b32_e32 v3, 4
|
|
|
|
; GCN: ScratchSize: 0
|
|
|
|
%struct.v4 = type { i32, i32, i32, i32 }
|
|
|
|
define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) {
|
|
.entry:
|
|
%alloca = alloca [4 x float], align 16, addrspace(5)
|
|
%cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
|
|
store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
|
|
%load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
|
|
store %struct.v4 %load, %struct.v4 addrspace(1)* %out
|
|
ret void
|
|
}
|
|
|
|
declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
|
|
|
|
declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
|
|
|
|
declare void @llvm.assume(i1)
|