mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[DAGCombine] (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
We already have a combine for this pattern when the input to `shl` is an `add`, so we just need to enable the same transformation when the input is an `or`. Original patch by @tstellar. Differential Revision: https://reviews.llvm.org/D19325 (llvm-svn: 313251)
This commit is contained in:
parent
8bc89997a1
commit
a9a617e651
@ -5595,16 +5595,18 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
|
||||
}
|
||||
|
||||
// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
|
||||
// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
|
||||
// Variant of version done on multiply, except mul by a power of 2 is turned
|
||||
// into a shift.
|
||||
if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
|
||||
if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
|
||||
N0.getNode()->hasOneUse() &&
|
||||
isConstantOrConstantVector(N1, /* No Opaques */ true) &&
|
||||
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
|
||||
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
|
||||
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
|
||||
AddToWorklist(Shl0.getNode());
|
||||
AddToWorklist(Shl1.getNode());
|
||||
return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
|
||||
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
|
||||
}
|
||||
|
||||
// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
|
||||
|
@ -70,13 +70,13 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa
|
||||
|
||||
; FIXME: single bit op
|
||||
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
|
||||
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||
; CI: v_or_b32_e32 [[OR:v[0-9]+]], [[MASK]], v{{[0-9]+}}
|
||||
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[OR]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SHL]]
|
||||
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
|
||||
; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
|
||||
; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
; CIVI: flat_store_dword
|
||||
|
||||
; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
|
||||
@ -88,15 +88,14 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
|
||||
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||
; CI: v_or_b32_e32 [[OR00:v[0-9]+]], [[MASK]], v{{[0-9]+}}
|
||||
; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, [[OR00]]
|
||||
; CI: v_or_b32_e32 [[OR01:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR01]]
|
||||
; CI: v_or_b32_e32 [[OR10:v[0-9]+]], [[MASK]], v{{[0-9]+}}
|
||||
; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, [[OR10]]
|
||||
; CI: v_or_b32_e32 [[OR11:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR11]]
|
||||
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
|
||||
; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
|
||||
; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
|
||||
; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
|
||||
; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
|
||||
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
|
||||
; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
|
||||
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
|
||||
|
@ -27,4 +27,27 @@ define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %ba
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
|
||||
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
|
||||
; CHECK: s_waitcnt lgkmcnt
|
||||
define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
|
||||
%index = add i32 %base_index, 1
|
||||
%byte_index = shl i32 %index, 2
|
||||
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
|
||||
store i32 %bpermute, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
|
||||
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
|
||||
; CHECK: s_waitcnt lgkmcnt
|
||||
define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
|
||||
%masked = and i32 %base_index, 62
|
||||
%index = or i32 %masked, 1
|
||||
%byte_index = shl i32 %index, 2
|
||||
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
|
||||
store i32 %bpermute, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone convergent }
|
||||
|
@ -476,4 +476,28 @@ define amdgpu_kernel void @test_mul2(i32 %p) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}shl_or_k:
|
||||
; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
|
||||
; SI: v_or_b32_e32 [[OR:v[0-9]+]], 4, [[SHL]]
|
||||
; SI: buffer_store_dword [[OR]]
|
||||
define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
|
||||
%tmp0 = or i32 %in, 1
|
||||
%tmp2 = shl i32 %tmp0, 2
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}shl_or_k_two_uses:
|
||||
; SI: v_or_b32_e32 [[OR:v[0-9]+]], 1, v{{[0-9]+}}
|
||||
; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, [[OR]]
|
||||
; SI-DAG: buffer_store_dword [[OR]]
|
||||
; SI-DAG: buffer_store_dword [[SHL]]
|
||||
define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
|
||||
%tmp0 = or i32 %in, 1
|
||||
%tmp2 = shl i32 %tmp0, 2
|
||||
store i32 %tmp2, i32 addrspace(1)* %out0
|
||||
store i32 %tmp0, i32 addrspace(1)* %out1
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
@ -537,19 +537,19 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
|
||||
ret <4 x i32> %2
|
||||
}
|
||||
|
||||
; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
|
||||
; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
|
||||
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
|
||||
; SSE-LABEL: combine_vec_shl_or0:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: por {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: pslld $2, %xmm0
|
||||
; SSE-NEXT: por {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_vec_shl_or0:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5]
|
||||
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
|
||||
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
|
||||
%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
|
||||
@ -559,14 +559,14 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
|
||||
define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
|
||||
; SSE-LABEL: combine_vec_shl_or1:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: por {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: por {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: combine_vec_shl_or1:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
|
||||
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
|
||||
|
Loading…
Reference in New Issue
Block a user