1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[AMDGPU] Regenerate mul24 test checks

To simplify diffs in a future patch.
This commit is contained in:
Simon Pilgrim 2021-07-25 15:11:42 +01:00
parent f3223b51e0
commit 60858d3f0b
3 changed files with 1497 additions and 214 deletions

View File

@ -1,17 +1,89 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC,GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}test_smul24_i32:
; GCN: s_mul_i32
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; EG: MULLO_INT
; CM: MULLO_INT
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
; SI-NEXT: s_bfe_i32 s4, s5, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: s_bfe_i32 s1, s1, 0x180000
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
entry:
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
@ -22,24 +94,75 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_smulhi24_i64:
; SIVI-NOT: bfe
; GCN-NOT: ashr
; SIVI: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]],
; GFX9: s_mul_hi_i32 [[RES1:s[0-9]+]],
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[RES1]]
; GCN: buffer_store_dword [[RESULT]]
; EG: ASHR
; EG: ASHR
; EG: MULHI_INT
; CM-NOT: ASHR
; CM: MULHI_INT24
; CM: MULHI_INT24
; CM: MULHI_INT24
; CM: MULHI_INT24
define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smulhi24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
entry:
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
@ -58,20 +181,98 @@ entry:
; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are
; multiple uses by the separate mul and mulhi.
; FUNC-LABEL: {{^}}test_smul24_i64:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: ashr
; SIVI-DAG: v_mul_hi_i32_i24_e32
; SIVI-DAG: s_mul_i32
; GFX9-DAG: s_mul_hi_i32
; GFX9-DAG: s_mul_i32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0x13
; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mul_i32 s1, s0, s1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
; VI-NEXT: s_load_dword s0, s[0:1], 0x70
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
@ -83,15 +284,86 @@ define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i3
ret void
}
; FUNC-LABEL: {{^}}test_smul24_i64_square:
; GCN: s_load_dword [[A:s[0-9]+]]
; SIVI-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]]
; GFX9: s_bfe_i32 [[B:s[0-9]+]], [[A]]
; GFX9-DAG: s_mul_hi_i32 s{{[0-9]+}}, [[B]], [[B]]
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i64_square:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s5, s4, s4
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
; VI-NEXT: s_mul_i32 s0, s0, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64_square:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64_square:
; CM: ; %bb.0:
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
@ -100,28 +372,113 @@ define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a
ret void
}
; FUNC-LABEL: {{^}}test_smul24_i33:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: and
; GCN-NOT: lshr
; SIVI-DAG: s_mul_i32
; SIVI-DAG: v_mul_hi_i32_i24_e32
; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
; GFX9-DAG: s_mul_i32
; GFX9-DAG: s_mul_hi_i32
; GFX9: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
; GFX9: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s1, s2, 8
; SI-NEXT: s_lshl_b32 s3, s0, 8
; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mul_i32 s1, s0, s2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s1, s2, 8
; VI-NEXT: s_lshl_b32 s3, s0, 8
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; VI-NEXT: s_mul_i32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%a.shl = shl i33 %a, 9
%a.24 = ashr i33 %a.shl, 9
@ -133,21 +490,85 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_smulhi24_i33:
; SI: s_load_dword s
; SI: s_load_dword s
; SI-NOT: bfe
; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; SI-NEXT: buffer_store_dword v[[HI]]
; GFX9: s_mul_hi_i32 s[[MUL_HI:[0-9]+]],
; GFX9-NEXT: s_and_b32 s[[HI:[0-9]+]], s[[MUL_HI]], 1
; GFX9-NEXT: v_mov_b32_e32 v[[RES:[0-9]+]], s[[HI]]
; GFX9-NEXT: buffer_store_dword v[[RES]]
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PS, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%tmp0 = shl i33 %a, 9
%a_24 = ashr i33 %tmp0, 9
@ -161,12 +582,126 @@ entry:
ret void
}
; GCN-LABEL: {{^}}simplify_i24_crash:
; GCN: s_mul_i32 s[[VAL:[0-9]+]]
; GCN: v_mov_b32_e32 v[[VAL_LO:[0-9]+]], s[[VAL]]
; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], s[[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 BB6_2
; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm
; SI-NEXT: BB6_2: ; %bb11
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_load_dword s4, s[0:1], 0xf
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: simplify_i24_crash:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 BB6_2
; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm
; VI-NEXT: BB6_2: ; %bb11
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_load_dword s0, s[0:1], 0x3c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
; VI-NEXT: s_mul_i32 s1, s1, s0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 BB6_2
; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB6_2: ; %bb11
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: simplify_i24_crash:
; EG: ; %bb.0: ; %bb
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @5 POP:1
; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
; EG-NEXT: POP @5 POP:1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MOV T2.W, KC0[2].Y,
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV * T0.Y, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: simplify_i24_crash:
; CM: ; %bb.0: ; %bb
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: JUMP @5 POP:1
; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: POP @5 POP:1
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, KC0[2].Y,
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
%cmp = icmp eq i32 %arg0, 0
br i1 %cmp, label %bb11, label %bb7

View File

@ -1,13 +1,56 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; FUNC-LABEL: {{^}}test_umul24_i32:
; GCN: s_mul_i32
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; SI-LABEL: test_umul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0xffffff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s4, s2
; SI-NEXT: s_and_b32 s2, s5, s2
; SI-NEXT: s_mul_i32 s4, s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s2, 0xffffff
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, s2
; VI-NEXT: s_and_b32 s1, s1, s2
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s1, s2, s0
; GFX9-NEXT: s_and_b32 s0, s3, s0
; GFX9-NEXT: s_mul_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@ -18,10 +61,48 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
; GCN: s_mul_i32 [[MUL:s[0-9]+]]
; GCN: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
; SI-LABEL: test_umul24_i16_sext:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: s_mul_i32 s2, s2, s4
; SI-NEXT: s_sext_i32_i16 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_sext:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_sext:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_mul_i32 s2, s2, s0
; GFX9-NEXT: s_sext_i32_i16 s0, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%mul = mul i16 %a, %b
%ext = sext i16 %mul to i32
@ -29,12 +110,72 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; SI-LABEL: test_umul24_i16_vgpr_sext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: v_mov_b32_e32 v3, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_vgpr_sext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@ -47,10 +188,48 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i16:
; GCN: s_mul_i32
; GCN: s_and_b32
define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
; SI-LABEL: test_umul24_i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: s_mul_i32 s2, s2, s4
; SI-NEXT: s_and_b32 s4, s2, 0xffff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_mul_i32 s2, s2, s0
; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%mul = mul i16 %a, %b
%ext = zext i16 %mul to i32
@ -58,12 +237,69 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr:
; SI: v_mul_u32_u24_e32
; SI: v_and_b32_e32
; VI: v_mul_lo_u16
; GFX9: v_mul_lo_u16
define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
; SI-LABEL: test_umul24_i16_vgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; SI-NEXT: v_mov_b32_e32 v3, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i16_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@ -76,12 +312,70 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
; SI-LABEL: test_umul24_i8_vgpr:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_mov_b32_e32 v3, v0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_mov_b32_e32 v2, v4
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: buffer_load_ubyte v0, v[3:4], s[12:15], 0 addr64
; SI-NEXT: buffer_load_ubyte v1, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i8_vgpr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i8_vgpr:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7]
; GFX9-NEXT: global_load_ubyte v3, v1, s[8:9]
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@ -95,13 +389,45 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
; SIVI-NOT: and
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; SI-LABEL: test_umulhi24_i32_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24_i32_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i32_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s1, s2, s0
; GFX9-NEXT: s_and_b32 s0, s3, s0
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%a.24 = and i32 %a, 16777215
%b.24 = and i32 %b, 16777215
@ -114,13 +440,55 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi24:
; SIVI-NOT: and
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN-NEXT: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; SI-LABEL: test_umulhi24:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s5, s6, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s7, s4
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
%a.24 = and i64 %a, 16777215
%b.24 = and i64 %b, 16777215
@ -132,14 +500,67 @@ entry:
}
; Multiply with 24-bit inputs and 64-bit output.
; FUNC-LABEL: {{^}}test_umul24_i64:
; GCN-NOT: lshr
; SIVI-DAG: s_mul_i32
; SIVI-DAG: v_mul_hi_u32_u24_e32
; GFX9-DAG: s_mul_i32
; GFX9-DAG: s_mul_hi_u32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; SI-LABEL: test_umul24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s8, 0xffffff
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_and_b32 s4, s6, s8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s5, s7, s8
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: s_mul_i32 s4, s4, s5
; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s4, 0xffffff
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_and_b32 s5, s6, s4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s4, s7, s4
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_mul_i32 s5, s5, s4
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_and_b32 s5, s6, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s7, s4
; GFX9-NEXT: s_mul_hi_u32 s6, s5, s4
; GFX9-NEXT: s_mul_i32 s5, s5, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i64 %a, 40
%a_24 = lshr i64 %tmp0, 40
@ -150,14 +571,49 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i64_square:
; GCN: s_load_dword [[A:s[0-9]+]]
; GCN: s_and_b32 [[B:s[0-9]+]], [[A]], 0xffffff
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
; SIVI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
; GFX9-DAG: s_mul_hi_u32 s{{[0-9]+}}, [[B]], [[B]]
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
; SI-LABEL: test_umul24_i64_square:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s5, s4, 0xffffff
; SI-NEXT: s_mul_i32 s5, s5, s5
; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x4c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffffff
; VI-NEXT: s_mul_i32 s1, s1, s1
; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s0, s0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64_square:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i64 %a, 40
%a.24 = lshr i64 %tmp0, 40
@ -166,14 +622,52 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi16_i32:
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_mul_i32 [[MUL24:s[0-9]+]]
; SIVI: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[MUL24]]
; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
; SI-LABEL: test_umulhi16_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0xffff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s4, s2
; SI-NEXT: s_and_b32 s2, s5, s2
; SI-NEXT: s_mul_i32 s4, s4, s2
; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi16_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, s2
; VI-NEXT: s_and_b32 s1, s1, s2
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_lshr_b32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi16_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s1, s4, s0
; GFX9-NEXT: s_and_b32 s0, s5, s0
; GFX9-NEXT: s_mul_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
entry:
%a.16 = and i32 %a, 65535
%b.16 = and i32 %b, 65535
@ -184,21 +678,63 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umul24_i33:
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: lshr
; SIVI-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
; SIVI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
; SIVI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; SIVI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
; GFX9-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
; GFX9-DAG: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
; GFX9-DAG: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_umul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s1, 0xffffff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s3, s2, s1
; SI-NEXT: s_and_b32 s1, s0, s1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; SI-NEXT: s_mul_i32 s3, s3, s1
; SI-NEXT: v_and_b32_e32 v1, 1, v0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s1, 0xffffff
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s2, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s1, s0, s1
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; VI-NEXT: s_mul_i32 s3, s3, s1
; VI-NEXT: v_and_b32_e32 v1, 1, v0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s1, s2, s0
; GFX9-NEXT: s_and_b32 s0, s3, s0
; GFX9-NEXT: s_mul_i32 s2, s1, s0
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i33 %a, 9
%a_24 = lshr i33 %tmp0, 9
@ -210,18 +746,51 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi24_i33:
; GCN: s_load_dword s
; GCN: s_load_dword s
; SIVI-NOT: and
; GCN-NOT: lshr
; SIVI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
; SIVI: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
; GFX9: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
; GFX9: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
; GCN-NEXT: buffer_store_dword v[[HI]]
define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_umulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s1, s2, s0
; GFX9-NEXT: s_and_b32 s0, s3, s0
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
entry:
%tmp0 = shl i33 %a, 9
%a_24 = lshr i33 %tmp0, 9
@ -237,15 +806,16 @@ entry:
; Make sure the any_extend that gets created is ignored, so that only the
; real bits being multiplied are used.
; GCN-LABEL: {{^}}test_umul24_anyextend_i24_src0_src1:
; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN: v_mul_u32_u24_e32 v0, v0, v1
; GCN: v_and_b32_e32 v0, 0x1fffe, v0
; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN: s_setpc_b64
define i17 @test_umul24_anyextend_i24_src0_src1(i24 %a, i24 %b) {
; GCN-LABEL: test_umul24_anyextend_i24_src0_src1:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%aa = mul i24 %a, 234
%bb = mul i24 %b, 923
@ -257,19 +827,21 @@ entry:
ret i17 %arst
}
; GCN-LABEL: {{^}}test_umul24_anyextend_i23_src0_src1:
; GCN: s_mov_b32 [[U23_MASK:s[0-9]+]], 0x7fffff
; GCN-DAG: v_and_b32_e32 v0, [[U23_MASK]], v0
; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1
; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN-DAG: v_and_b32_e32 v1, s4, v1
; GCN-DAG: v_and_b32_e32 v0, 0x7ffffe, v0
; GCN: v_mul_u32_u24_e32 v0, v0, v1
; GCN: v_and_b32_e32 v0, 0x1fffe, v0
; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN: s_setpc_b64
define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
; GCN-LABEL: test_umul24_anyextend_i23_src0_src1:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0x7fffff
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_and_b32_e32 v1, s4, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x7ffffe, v0
; GCN-NEXT: v_and_b32_e32 v1, s4, v1
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%aa = mul i23 %a, 234
%bb = mul i23 %b, 923

View File

@ -1,10 +1,37 @@
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=FUNC,R600,CM %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=FUNC,R600,EG %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
; FUNC-LABEL: {{^}}test_umul24_i32:
; CM: MULLO_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]}}.W, T{{[0-9]}}.Z
; EG: MULLO_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PS, PV.W
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
;
; EG-LABEL: test_umul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@ -16,12 +43,48 @@ entry:
}
; The i16 product must be sign-extended to i32.
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
; R600: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; CM: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PS, 0.0, literal.x
; R600: 16
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
; CM-LABEL: test_umul24_i16_sext:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 1 @6
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i16_sext:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
%mul = mul i16 %a, %b
%ext = sext i16 %mul to i32
@ -30,11 +93,48 @@ entry:
}
; The i8 product must be sign-extended to i32.
; FUNC-LABEL: {{^}}test_umul24_i8:
; R600: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
; CM: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PS, 0.0, literal.x
; Kernel under test: multiplies the two i8 arguments and sign-extends the
; 8-bit product to i32 (see the mul/sext pair after the entry label).
; NOTE(review): the use of %ext and the store are in lines elided by the diff
; hunk marker below - presumably the value is written to %out; confirm.
define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
; CM-LABEL: test_umul24_i8:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 1 @6
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i8:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
entry:
; The product is computed at i8 width, so only its low 8 bits are kept.
%mul = mul i8 %a, %b
; Sign-extend the 8-bit product to i32. Both r600 check blocks above expect a
; BFE_INT with literal 8 right after the MULLO_INT, which presumably is the
; lowering of this sext - TODO confirm.
%ext = sext i8 %mul to i32
@ -42,9 +142,31 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
; R600: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
; Kernel under test: masks both i32 arguments to their low 24 bits before the
; multiply. Both r600 check blocks below expect exactly one MULHI_UINT24 whose
; operands are KC0[2].Z and KC0[2].W.
; NOTE(review): the multiply, the extraction of the high half and the store
; are in lines elided by the diff hunk marker below; going by the function
; name the product is presumably formed at i64 width - confirm.
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umulhi24_i32_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
;
; EG-LABEL: test_umulhi24_i32_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].Z, KC0[2].W,
entry:
; 16777215 = 2^24 - 1 (0xFFFFFF): clear the top 8 bits of each operand so both
; values are known to fit in 24 unsigned bits.
%a.24 = and i32 %a, 16777215
%b.24 = and i32 %b, 16777215
@ -57,9 +179,31 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}test_umulhi24:
; R600: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
; Kernel under test: same masking pattern as the i32 variant, but the
; arguments are i64 and are masked to their low 24 bits at i64 width. Both
; r600 check blocks below expect exactly one MULHI_UINT24 whose operands are
; KC0[2].W and KC0[3].Y.
; NOTE(review): everything after the two masks (multiply, shift/truncate,
; store, ret) is elided by the diff hunk marker below - confirm against the
; full test file.
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umulhi24:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umulhi24:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].W, KC0[3].Y,
entry:
; 16777215 = 2^24 - 1 (0xFFFFFF): keep only the low 24 bits of each i64
; operand, zeroing bits 24..63.
%a.24 = and i64 %a, 16777215
%b.24 = and i64 %b, 16777215
@ -71,10 +215,42 @@ entry:
}
; Multiply with 24-bit inputs and 64-bit output.
; FUNC-LABEL: {{^}}test_umul24_i64:
; EG; MUL_UINT24
; R600: MULHI
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umul24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT * T0.Z, KC0[3].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
; CM-NEXT: MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umul24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
entry:
%tmp0 = shl i64 %a, 40
%a_24 = lshr i64 %tmp0, 40