mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] Regenerate mul24 test checks
To simplify diffs in future patch
This commit is contained in:
parent
f3223b51e0
commit
60858d3f0b
@ -1,17 +1,89 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC,SIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC,SIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC,GFX9 %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smul24_i32:
|
||||
; GCN: s_mul_i32
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
|
||||
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
|
||||
|
||||
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
|
||||
; EG: MULLO_INT
|
||||
|
||||
; CM: MULLO_INT
|
||||
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
||||
; SI-LABEL: test_smul24_i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
|
||||
; SI-NEXT: s_bfe_i32 s4, s5, 0x180000
|
||||
; SI-NEXT: s_mul_i32 s4, s2, s4
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smul24_i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
||||
; VI-NEXT: s_bfe_i32 s1, s1, 0x180000
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smul24_i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
||||
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
||||
; GFX9-NEXT: s_mul_i32 s0, s0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smul24_i32:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smul24_i32:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
|
||||
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
|
||||
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
|
||||
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
||||
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
|
||||
entry:
|
||||
%a.shl = shl i32 %a, 8
|
||||
%a.24 = ashr i32 %a.shl, 8
|
||||
@ -22,24 +94,75 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smulhi24_i64:
|
||||
; SIVI-NOT: bfe
|
||||
; GCN-NOT: ashr
|
||||
; SIVI: v_mul_hi_i32_i24_e32 [[RESULT:v[0-9]+]],
|
||||
; GFX9: s_mul_hi_i32 [[RES1:s[0-9]+]],
|
||||
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[RES1]]
|
||||
; GCN: buffer_store_dword [[RESULT]]
|
||||
|
||||
; EG: ASHR
|
||||
; EG: ASHR
|
||||
; EG: MULHI_INT
|
||||
|
||||
; CM-NOT: ASHR
|
||||
; CM: MULHI_INT24
|
||||
; CM: MULHI_INT24
|
||||
; CM: MULHI_INT24
|
||||
; CM: MULHI_INT24
|
||||
define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
||||
; SI-LABEL: test_smulhi24_i64:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smulhi24_i64:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s0, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smulhi24_i64:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
||||
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
||||
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smulhi24_i64:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smulhi24_i64:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
|
||||
entry:
|
||||
%a.shl = shl i32 %a, 8
|
||||
%a.24 = ashr i32 %a.shl, 8
|
||||
@ -58,20 +181,98 @@ entry:
|
||||
; unnecessary extension instructions because after legalization they
|
||||
; will not be removed by SimplifyDemandedBits because there are
|
||||
; multiple uses by the separate mul and mulhi.
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smul24_i64:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
|
||||
; GCN-NOT: ashr
|
||||
|
||||
; SIVI-DAG: v_mul_hi_i32_i24_e32
|
||||
; SIVI-DAG: s_mul_i32
|
||||
; GFX9-DAG: s_mul_hi_i32
|
||||
; GFX9-DAG: s_mul_i32
|
||||
|
||||
; GCN: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
|
||||
; SI-LABEL: test_smul24_i64:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0x13
|
||||
; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
||||
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: s_mul_i32 s1, s0, s1
|
||||
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smul24_i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x70
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
||||
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smul24_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
||||
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
||||
; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
|
||||
; GFX9-NEXT: s_mul_i32 s1, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smul24_i64:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
|
||||
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smul24_i64:
|
||||
; CM: ; %bb.0:
|
||||
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
|
||||
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
|
||||
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
|
||||
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
||||
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
|
||||
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
|
||||
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
|
||||
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
|
||||
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
|
||||
%shl.i = shl i32 %a, 8
|
||||
%shr.i = ashr i32 %shl.i, 8
|
||||
%conv.i = sext i32 %shr.i to i64
|
||||
@ -83,15 +284,86 @@ define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i3
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smul24_i64_square:
|
||||
; GCN: s_load_dword [[A:s[0-9]+]]
|
||||
; SIVI-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[A]], [[A]]
|
||||
; GFX9: s_bfe_i32 [[B:s[0-9]+]], [[A]]
|
||||
; GFX9-DAG: s_mul_hi_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||
; GCN: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
||||
; SI-LABEL: test_smul24_i64_square:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
|
||||
; SI-NEXT: s_mul_i32 s5, s4, s4
|
||||
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smul24_i64_square:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
||||
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smul24_i64_square:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
||||
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
|
||||
; GFX9-NEXT: s_mul_i32 s0, s0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smul24_i64_square:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
|
||||
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smul24_i64_square:
|
||||
; CM: ; %bb.0:
|
||||
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
|
||||
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
||||
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
|
||||
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
|
||||
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
|
||||
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
|
||||
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
|
||||
%shl.i = shl i32 %a, 8
|
||||
%shr.i = ashr i32 %shl.i, 8
|
||||
%conv.i = sext i32 %shr.i to i64
|
||||
@ -100,28 +372,113 @@ define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smul24_i33:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
|
||||
; GCN-NOT: and
|
||||
; GCN-NOT: lshr
|
||||
|
||||
; SIVI-DAG: s_mul_i32
|
||||
; SIVI-DAG: v_mul_hi_i32_i24_e32
|
||||
; SI: v_lshl_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
||||
; SI: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 31
|
||||
|
||||
; VI: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
||||
; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
|
||||
|
||||
; GFX9-DAG: s_mul_i32
|
||||
; GFX9-DAG: s_mul_hi_i32
|
||||
; GFX9: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
|
||||
; GFX9: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 31
|
||||
|
||||
; GCN: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
|
||||
; SI-LABEL: test_smul24_i33:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; SI-NEXT: s_lshl_b32 s3, s0, 8
|
||||
; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
|
||||
; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_mul_i32 s1, s0, s2
|
||||
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
|
||||
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smul24_i33:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; VI-NEXT: s_lshl_b32 s3, s0, 8
|
||||
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
|
||||
; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
|
||||
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smul24_i33:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
|
||||
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
|
||||
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
|
||||
; GFX9-NEXT: s_mul_i32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
|
||||
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smul24_i33:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
|
||||
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
|
||||
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smul24_i33:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
|
||||
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
|
||||
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%a.shl = shl i33 %a, 9
|
||||
%a.24 = ashr i33 %a.shl, 9
|
||||
@ -133,21 +490,85 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_smulhi24_i33:
|
||||
; SI: s_load_dword s
|
||||
; SI: s_load_dword s
|
||||
|
||||
; SI-NOT: bfe
|
||||
|
||||
; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
|
||||
; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||
; SI-NEXT: buffer_store_dword v[[HI]]
|
||||
|
||||
; GFX9: s_mul_hi_i32 s[[MUL_HI:[0-9]+]],
|
||||
; GFX9-NEXT: s_and_b32 s[[HI:[0-9]+]], s[[MUL_HI]], 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v[[RES:[0-9]+]], s[[HI]]
|
||||
; GFX9-NEXT: buffer_store_dword v[[RES]]
|
||||
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||
; SI-LABEL: test_smulhi24_i33:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
|
||||
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
|
||||
; SI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_smulhi24_i33:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_smulhi24_i33:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
|
||||
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
|
||||
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_smulhi24_i33:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
|
||||
; EG-NEXT: AND_INT T0.X, PS, 1,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test_smulhi24_i33:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
|
||||
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%tmp0 = shl i33 %a, 9
|
||||
%a_24 = ashr i33 %tmp0, 9
|
||||
@ -161,12 +582,126 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}simplify_i24_crash:
|
||||
; GCN: s_mul_i32 s[[VAL:[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 v[[VAL_LO:[0-9]+]], s[[VAL]]
|
||||
; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], s[[VAL]]
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
|
||||
define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
|
||||
; SI-LABEL: simplify_i24_crash:
|
||||
; SI: ; %bb.0: ; %bb
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; SI-NEXT: s_cbranch_scc0 BB6_2
|
||||
; SI-NEXT: ; %bb.1: ; %bb7
|
||||
; SI-NEXT: s_endpgm
|
||||
; SI-NEXT: BB6_2: ; %bb11
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xf
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
|
||||
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
|
||||
; SI-NEXT: s_mul_i32 s4, s2, s4
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: simplify_i24_crash:
|
||||
; VI: ; %bb.0: ; %bb
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-NEXT: s_cbranch_scc0 BB6_2
|
||||
; VI-NEXT: ; %bb.1: ; %bb7
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: BB6_2: ; %bb11
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x3c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_bfe_i32 s1, s2, 0x180000
|
||||
; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
|
||||
; VI-NEXT: s_mul_i32 s1, s1, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: simplify_i24_crash:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_cbranch_scc0 BB6_2
|
||||
; GFX9-NEXT: ; %bb.1: ; %bb7
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: BB6_2: ; %bb11
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x3c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
|
||||
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
|
||||
; GFX9-NEXT: s_mul_i32 s0, s0, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: simplify_i24_crash:
|
||||
; EG: ; %bb.0: ; %bb
|
||||
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: JUMP @5 POP:1
|
||||
; EG-NEXT: ALU 10, @8, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
|
||||
; EG-NEXT: POP @5 POP:1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: ALU clause starting at 6:
|
||||
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
|
||||
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
|
||||
; EG-NEXT: ALU clause starting at 8:
|
||||
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
|
||||
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: MOV T2.W, KC0[2].Y,
|
||||
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
|
||||
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
|
||||
; EG-NEXT: MOV * T0.Y, PS,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: simplify_i24_crash:
|
||||
; CM: ; %bb.0: ; %bb
|
||||
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: JUMP @5 POP:1
|
||||
; CM-NEXT: ALU 13, @8, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
||||
; CM-NEXT: POP @5 POP:1
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: ALU clause starting at 6:
|
||||
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
|
||||
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
|
||||
; CM-NEXT: ALU clause starting at 8:
|
||||
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
|
||||
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: MOV T0.Y, KC0[2].Y,
|
||||
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
|
||||
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
|
||||
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
|
||||
; CM-NEXT: MOV * T0.Y, PV.X,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
bb:
|
||||
%cmp = icmp eq i32 %arg0, 0
|
||||
br i1 %cmp, label %bb11, label %bb7
|
||||
|
@ -1,13 +1,56 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i32:
|
||||
; GCN: s_mul_i32
|
||||
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
; SI-LABEL: test_umul24_i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, 0xffffff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s4, s4, s2
|
||||
; SI-NEXT: s_and_b32 s2, s5, s2
|
||||
; SI-NEXT: s_mul_i32 s4, s4, s2
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffffff
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s0, s0, s2
|
||||
; VI-NEXT: s_and_b32 s1, s1, s2
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s3, s0
|
||||
; GFX9-NEXT: s_mul_i32 s0, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = shl i32 %a, 8
|
||||
%a_24 = lshr i32 %0, 8
|
||||
@ -18,10 +61,48 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
|
||||
; GCN: s_mul_i32 [[MUL:s[0-9]+]]
|
||||
; GCN: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]]
|
||||
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||
; SI-LABEL: test_umul24_i16_sext:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; SI-NEXT: s_mul_i32 s2, s2, s4
|
||||
; SI-NEXT: s_sext_i32_i16 s4, s2
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i16_sext:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: s_sext_i32_i16 s0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i16_sext:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: s_mul_i32 s2, s2, s0
|
||||
; GFX9-NEXT: s_sext_i32_i16 s0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%mul = mul i16 %a, %b
|
||||
%ext = sext i16 %mul to i32
|
||||
@ -29,12 +110,72 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
|
||||
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
|
||||
define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||
; SI-LABEL: test_umul24_i16_vgpr_sext:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i16_vgpr_sext:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; VI-NEXT: flat_load_ushort v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ushort v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
|
||||
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
|
||||
@ -47,10 +188,48 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i16:
|
||||
; GCN: s_mul_i32
|
||||
; GCN: s_and_b32
|
||||
define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||
; SI-LABEL: test_umul24_i16:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshr_b32 s4, s2, 16
|
||||
; SI-NEXT: s_mul_i32 s2, s2, s4
|
||||
; SI-NEXT: s_and_b32 s4, s2, 0xffff
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i16:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i16:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX9-NEXT: s_mul_i32 s2, s2, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s2, 0xffff
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%mul = mul i16 %a, %b
|
||||
%ext = zext i16 %mul to i32
|
||||
@ -58,12 +237,69 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i16_vgpr:
|
||||
; SI: v_mul_u32_u24_e32
|
||||
; SI: v_and_b32_e32
|
||||
; VI: v_mul_lo_u16
|
||||
; GFX9: v_mul_lo_u16
|
||||
define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||
; SI-LABEL: test_umul24_i16_vgpr:
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v3, 0
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_mul_u32_u24_e32 v0, v2, v0
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i16_vgpr:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_ushort v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ushort v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i16_vgpr:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
|
||||
@ -76,12 +312,70 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addr
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
|
||||
; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; GFX9: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
|
||||
define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
|
||||
; SI-LABEL: test_umul24_i8_vgpr:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: v_mov_b32_e32 v3, v0
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s14, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v4, 0
|
||||
; SI-NEXT: s_mov_b32 s15, s11
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
|
||||
; SI-NEXT: buffer_load_ubyte v0, v[3:4], s[12:15], 0 addr64
|
||||
; SI-NEXT: buffer_load_ubyte v1, v[1:2], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s10, -1
|
||||
; SI-NEXT: s_mov_b32 s8, s4
|
||||
; SI-NEXT: s_mov_b32 s9, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_mul_u32_u24_e32 v0, v0, v1
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i8_vgpr:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s9
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
|
||||
; VI-NEXT: flat_load_ubyte v2, v[2:3]
|
||||
; VI-NEXT: flat_load_ubyte v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i8_vgpr:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_ubyte v3, v1, s[8:9]
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
|
||||
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
@ -95,13 +389,45 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
|
||||
; SIVI-NOT: and
|
||||
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
||||
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
|
||||
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
||||
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
; SI-LABEL: test_umulhi24_i32_i64:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umulhi24_i32_i64:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s0, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umulhi24_i32_i64:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s3, s0
|
||||
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%a.24 = and i32 %a, 16777215
|
||||
%b.24 = and i32 %b, 16777215
|
||||
@ -114,13 +440,55 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi24:
|
||||
; SIVI-NOT: and
|
||||
; SIVI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
|
||||
; GFX9: s_mul_hi_u32 [[SRESULT:s[0-9]+]],
|
||||
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||
; GCN-NEXT: buffer_store_dword [[RESULT]]
|
||||
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
; SI-LABEL: test_umulhi24:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umulhi24:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umulhi24:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
|
||||
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_and_b32 s5, s6, s4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s4, s7, s4
|
||||
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%a.24 = and i64 %a, 16777215
|
||||
%b.24 = and i64 %b, 16777215
|
||||
@ -132,14 +500,67 @@ entry:
|
||||
}
|
||||
|
||||
; Multiply with 24-bit inputs and 64-bit output.
|
||||
; FUNC-LABEL: {{^}}test_umul24_i64:
|
||||
; GCN-NOT: lshr
|
||||
; SIVI-DAG: s_mul_i32
|
||||
; SIVI-DAG: v_mul_hi_u32_u24_e32
|
||||
; GFX9-DAG: s_mul_i32
|
||||
; GFX9-DAG: s_mul_hi_u32
|
||||
; GCN: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
; SI-LABEL: test_umul24_i64:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_load_dword s7, s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_mov_b32 s8, 0xffffff
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_and_b32 s4, s6, s8
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s5, s7, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: s_mul_i32 s4, s4, s5
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i64:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s7, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s4, 0xffffff
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_and_b32 s5, s6, s4
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s4, s7, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: s_mul_i32 s5, s5, s4
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i64:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34
|
||||
; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_and_b32 s5, s6, s4
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s4, s7, s4
|
||||
; GFX9-NEXT: s_mul_hi_u32 s6, s5, s4
|
||||
; GFX9-NEXT: s_mul_i32 s5, s5, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = shl i64 %a, 40
|
||||
%a_24 = lshr i64 %tmp0, 40
|
||||
@ -150,14 +571,49 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i64_square:
|
||||
; GCN: s_load_dword [[A:s[0-9]+]]
|
||||
; GCN: s_and_b32 [[B:s[0-9]+]], [[A]], 0xffffff
|
||||
; SIVI-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||
; SIVI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||
; GFX9-DAG: s_mul_i32 s{{[0-9]+}}, [[B]], [[B]]
|
||||
; GFX9-DAG: s_mul_hi_u32 s{{[0-9]+}}, [[B]], [[B]]
|
||||
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||
; SI-LABEL: test_umul24_i64_square:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0x13
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s5, s4, 0xffffff
|
||||
; SI-NEXT: s_mul_i32 s5, s5, s5
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i64_square:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x4c
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s1, s0, 0xffffff
|
||||
; VI-NEXT: s_mul_i32 s1, s1, s1
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i64_square:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff
|
||||
; GFX9-NEXT: s_mul_hi_u32 s1, s0, s0
|
||||
; GFX9-NEXT: s_mul_i32 s0, s0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = shl i64 %a, 40
|
||||
%a.24 = lshr i64 %tmp0, 40
|
||||
@ -166,14 +622,52 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi16_i32:
|
||||
; GCN: s_and_b32
|
||||
; GCN: s_and_b32
|
||||
; GCN: s_mul_i32 [[MUL24:s[0-9]+]]
|
||||
; SIVI: s_lshr_b32 s{{[0-9]+}}, [[MUL24]], 16
|
||||
; GFX9: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[MUL24]]
|
||||
; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[RESULT]]
|
||||
define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
; SI-LABEL: test_umulhi16_i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s4, s4, s2
|
||||
; SI-NEXT: s_and_b32 s2, s5, s2
|
||||
; SI-NEXT: s_mul_i32 s4, s4, s2
|
||||
; SI-NEXT: s_lshr_b32 s4, s4, 16
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umulhi16_i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s0, s0, s2
|
||||
; VI-NEXT: s_and_b32 s1, s1, s2
|
||||
; VI-NEXT: s_mul_i32 s0, s0, s1
|
||||
; VI-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umulhi16_i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffff
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s1, s4, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s5, s0
|
||||
; GFX9-NEXT: s_mul_i32 s0, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[2:3]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%a.16 = and i32 %a, 65535
|
||||
%b.16 = and i32 %b, 65535
|
||||
@ -184,21 +678,63 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i33:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
; GCN-NOT: lshr
|
||||
; SIVI-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
|
||||
; SIVI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
||||
; SIVI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||
; SIVI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
|
||||
; GFX9-DAG: s_mul_i32 s[[MUL_LO:[0-9]+]],
|
||||
; GFX9-DAG: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
|
||||
; GFX9-DAG: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
|
||||
; GFX9-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[MUL_LO]]
|
||||
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||
define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||
; SI-LABEL: test_umul24_i33:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s1, 0xffffff
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s3, s2, s1
|
||||
; SI-NEXT: s_and_b32 s1, s0, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
|
||||
; SI-NEXT: s_mul_i32 s3, s3, s1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umul24_i33:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s1, 0xffffff
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_and_b32 s3, s2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_and_b32 s1, s0, s1
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
|
||||
; VI-NEXT: s_mul_i32 s3, s3, s1
|
||||
; VI-NEXT: v_and_b32_e32 v1, 1, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umul24_i33:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s3, s0
|
||||
; GFX9-NEXT: s_mul_i32 s2, s1, s0
|
||||
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = shl i33 %a, 9
|
||||
%a_24 = lshr i33 %tmp0, 9
|
||||
@ -210,18 +746,51 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi24_i33:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
; SIVI-NOT: and
|
||||
; GCN-NOT: lshr
|
||||
; SIVI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
|
||||
; SIVI: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
|
||||
; GFX9: s_mul_hi_u32 s[[MUL_HI:[0-9]+]],
|
||||
; GFX9: s_and_b32 s[[AND_HI:[0-9]+]], s[[MUL_HI]], 1
|
||||
; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]], s[[AND_HI]]
|
||||
; GCN-NEXT: buffer_store_dword v[[HI]]
|
||||
define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
|
||||
; SI-LABEL: test_umulhi24_i33:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
|
||||
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
|
||||
; SI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_umulhi24_i33:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0
|
||||
; VI-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: test_umulhi24_i33:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffffff
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s3, s0
|
||||
; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = shl i33 %a, 9
|
||||
%a_24 = lshr i33 %tmp0, 9
|
||||
@ -237,15 +806,16 @@ entry:
|
||||
|
||||
; Make sure the created any_extend is ignored so that the real bits
|
||||
; being multiplied are used.
|
||||
|
||||
; GCN-LABEL: {{^}}test_umul24_anyextend_i24_src0_src1:
|
||||
; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
|
||||
; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
|
||||
; GCN: v_mul_u32_u24_e32 v0, v0, v1
|
||||
; GCN: v_and_b32_e32 v0, 0x1fffe, v0
|
||||
; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
|
||||
; GCN: s_setpc_b64
|
||||
define i17 @test_umul24_anyextend_i24_src0_src1(i24 %a, i24 %b) {
|
||||
; GCN-LABEL: test_umul24_anyextend_i24_src0_src1:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%aa = mul i24 %a, 234
|
||||
%bb = mul i24 %b, 923
|
||||
@ -257,19 +827,21 @@ entry:
|
||||
ret i17 %arst
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_umul24_anyextend_i23_src0_src1:
|
||||
; GCN: s_mov_b32 [[U23_MASK:s[0-9]+]], 0x7fffff
|
||||
; GCN-DAG: v_and_b32_e32 v0, [[U23_MASK]], v0
|
||||
; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1
|
||||
; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
|
||||
; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
|
||||
; GCN-DAG: v_and_b32_e32 v1, s4, v1
|
||||
; GCN-DAG: v_and_b32_e32 v0, 0x7ffffe, v0
|
||||
; GCN: v_mul_u32_u24_e32 v0, v0, v1
|
||||
; GCN: v_and_b32_e32 v0, 0x1fffe, v0
|
||||
; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
|
||||
; GCN: s_setpc_b64
|
||||
define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
|
||||
; GCN-LABEL: test_umul24_anyextend_i23_src0_src1:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s4, 0x7fffff
|
||||
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0xea, v0
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v1, 0x39b, v1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x7ffffe, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, s4, v1
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, v0, v1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0x1fffe, v0
|
||||
; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x63, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%aa = mul i23 %a, 234
|
||||
%bb = mul i23 %b, 923
|
||||
|
@ -1,10 +1,37 @@
|
||||
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=FUNC,R600,CM %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=FUNC,R600,EG %s
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umul24_i32:
|
||||
; CM: MULLO_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]}}.W, T{{[0-9]}}.Z
|
||||
; EG: MULLO_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PS, PV.W
|
||||
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
; CM-LABEL: test_umul24_i32:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
|
||||
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.y,
|
||||
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
|
||||
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
|
||||
;
|
||||
; EG-LABEL: test_umul24_i32:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
|
||||
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
%0 = shl i32 %a, 8
|
||||
%a_24 = lshr i32 %0, 8
|
||||
@ -16,12 +43,48 @@ entry:
|
||||
}
|
||||
|
||||
; The result must be sign-extended.
|
||||
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
|
||||
; R600: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
|
||||
; CM: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PS, 0.0, literal.x
|
||||
; R600: 16
|
||||
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||
; CM-LABEL: test_umul24_i16_sext:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
|
||||
; CM-NEXT: TEX 1 @6
|
||||
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: Fetch clause starting at 6:
|
||||
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
|
||||
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
|
||||
; CM-NEXT: ALU clause starting at 10:
|
||||
; CM-NEXT: MOV * T0.X, 0.0,
|
||||
; CM-NEXT: ALU clause starting at 11:
|
||||
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
|
||||
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; EG-LABEL: test_umul24_i16_sext:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
|
||||
; EG-NEXT: TEX 1 @6
|
||||
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
|
||||
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
|
||||
; EG-NEXT: ALU clause starting at 10:
|
||||
; EG-NEXT: MOV * T0.X, 0.0,
|
||||
; EG-NEXT: ALU clause starting at 11:
|
||||
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
|
||||
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
||||
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
|
||||
entry:
|
||||
%mul = mul i16 %a, %b
|
||||
%ext = sext i16 %mul to i32
|
||||
@ -30,11 +93,48 @@ entry:
|
||||
}
|
||||
|
||||
; The result must be sign-extended.
|
||||
; FUNC-LABEL: {{^}}test_umul24_i8:
|
||||
; R600: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
|
||||
; CM: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PS, 0.0, literal.x
|
||||
define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
|
||||
; CM-LABEL: test_umul24_i8:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
|
||||
; CM-NEXT: TEX 1 @6
|
||||
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: Fetch clause starting at 6:
|
||||
; CM-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
|
||||
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
|
||||
; CM-NEXT: ALU clause starting at 10:
|
||||
; CM-NEXT: MOV * T0.X, 0.0,
|
||||
; CM-NEXT: ALU clause starting at 11:
|
||||
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
|
||||
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
|
||||
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
||||
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; EG-LABEL: test_umul24_i8:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
|
||||
; EG-NEXT: TEX 1 @6
|
||||
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: Fetch clause starting at 6:
|
||||
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
|
||||
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
|
||||
; EG-NEXT: ALU clause starting at 10:
|
||||
; EG-NEXT: MOV * T0.X, 0.0,
|
||||
; EG-NEXT: ALU clause starting at 11:
|
||||
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
|
||||
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
||||
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
||||
entry:
|
||||
%mul = mul i8 %a, %b
|
||||
%ext = sext i8 %mul to i32
|
||||
@ -42,9 +142,31 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
|
||||
; R600: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
|
||||
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
; CM-LABEL: test_umulhi24_i32_i64:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
|
||||
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
|
||||
;
|
||||
; EG-LABEL: test_umulhi24_i32_i64:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].Z, KC0[2].W,
|
||||
entry:
|
||||
%a.24 = and i32 %a, 16777215
|
||||
%b.24 = and i32 %b, 16777215
|
||||
@ -57,9 +179,31 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_umulhi24:
|
||||
; R600: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
|
||||
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
; CM-LABEL: test_umulhi24:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
|
||||
;
|
||||
; EG-LABEL: test_umulhi24:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].W, KC0[3].Y,
|
||||
entry:
|
||||
%a.24 = and i64 %a, 16777215
|
||||
%b.24 = and i64 %b, 16777215
|
||||
@ -71,10 +215,42 @@ entry:
|
||||
}
|
||||
|
||||
; Multiply with 24-bit inputs and 64-bit output.
|
||||
; FUNC-LABEL: {{^}}test_umul24_i64:
|
||||
; EG; MUL_UINT24
|
||||
; R600: MULHI
|
||||
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
; CM-LABEL: test_umul24_i64:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: AND_INT * T0.Z, KC0[3].Y, literal.y,
|
||||
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
|
||||
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
|
||||
; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
|
||||
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
|
||||
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
|
||||
; CM-NEXT: MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
|
||||
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
|
||||
;
|
||||
; EG-LABEL: test_umul24_i64:
|
||||
; EG: ; %bb.0: ; %entry
|
||||
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
|
||||
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
|
||||
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
; EG-NEXT: MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
|
||||
entry:
|
||||
%tmp0 = shl i64 %a, 40
|
||||
%a_24 = lshr i64 %tmp0, 40
|
||||
|
Loading…
x
Reference in New Issue
Block a user