mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
f7e914e2c5
This patch adjusts the following ARM/AArch64 LLVM IR intrinsics: - neon_bfmmla - neon_bfmlalb - neon_bfmlalt so that they take and return bf16 and float types. Previously these intrinsics used <8 x i8> and <4 x i8> vectors (a rudiment from implementation lacking bf16 IR type). The neon_vbfdot[q] intrinsics are adjusted similarly. This change required some additional selection patterns for vbfdot itself and also for vector shuffles (in a previous patch) because of SelectionDAG transformations kicking in and mangling the original code. This patch makes the generated IR cleaner (less useless bitcasts are produced), but it does not affect the final assembly. Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D86146
171 lines
7.5 KiB
LLVM
171 lines
7.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple armv8.6a-arm-none-eabi -mattr=+neon,+bf16 -float-abi=hard -verify-machineinstrs < %s -o - | FileCheck %s
|
|
|
|
; Plain bf16 dot product on 64-bit vectors.
; The body is a single call to @llvm.arm.neon.bfdot.v2f32.v4bf16 with the
; accumulator %r and the two <4 x bfloat> operands passed straight through.
; The assertions below expect exactly one vdot.bf16 on D registers followed
; by the return.
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdot.bf16 d0, d1, d2
; CHECK-NEXT:    bx lr
entry:
  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3
  ret <2 x float> %vbfdot3.i
}

; Plain bf16 dot product on 128-bit vectors.
; The body is a single call to @llvm.arm.neon.bfdot.v4f32.v8bf16 with the
; accumulator %r and the two <8 x bfloat> operands passed straight through.
; The assertions below expect exactly one vdot.bf16 on Q registers followed
; by the return.
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdot.bf16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) #3
  ret <4 x float> %vbfdot3.i
}

; Indexed bf16 dot product: 64-bit accumulator, lane taken from a 64-bit vector.
; %b is reinterpreted as <2 x float> so that each 32-bit lane holds a pair of
; bf16 values, lane 0 is splatted with a zero shuffle mask, and the result is
; cast back to <4 x bfloat> before the bfdot call.
; The assertions below expect the whole sequence to fold into one indexed
; vdot.bf16 using d2[0], with no separate duplicate instruction.
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_lane_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdot.bf16 d0, d1, d2[0]
; CHECK-NEXT:    bx lr
entry:
  %.cast = bitcast <4 x bfloat> %b to <2 x float>
  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer
  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
  ret <2 x float> %vbfdot3.i
}

; bf16 dot product: 128-bit accumulator, lane taken from a 128-bit vector.
; %b is reinterpreted as <4 x float>, 32-bit lane 3 is splatted across all
; four lanes, and the result is cast back to <8 x bfloat> before the bfdot call.
; Unlike the other lane tests, the assertions below expect an explicit
; vdup.32 from d5[1] into q8 followed by the non-indexed vdot.bf16.
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_laneq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.32 q8, d5[1]
; CHECK-NEXT:    vdot.bf16 q0, q1, q8
; CHECK-NEXT:    bx lr
entry:
  %.cast = bitcast <8 x bfloat> %b to <4 x float>
  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
  ret <4 x float> %vbfdot3.i
}

; Indexed bf16 dot product: 64-bit accumulator, lane taken from a 128-bit vector.
; %b is reinterpreted as <4 x float>, and a narrowing shuffle picks 32-bit
; lane 3 into both lanes of a <2 x float>, which is cast back to <4 x bfloat>
; before the bfdot call.
; The assertions below expect one indexed vdot.bf16 that reads d3[1] directly.
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_laneq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdot.bf16 d0, d1, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %.cast = bitcast <8 x bfloat> %b to <4 x float>
  %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %.cast1 = bitcast <2 x float> %lane to <4 x bfloat>
  %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3
  ret <2 x float> %vbfdot3.i
}

; Indexed bf16 dot product: 128-bit accumulator, lane taken from a 64-bit vector.
; %b is reinterpreted as <2 x float>, and a widening shuffle with a zero mask
; splats 32-bit lane 0 into a <4 x float>, which is cast back to <8 x bfloat>
; before the bfdot call.
; The assertions below expect a register "kill" annotation widening d4 to q2,
; then one indexed vdot.bf16 that reads d4[0].
; NOTE(review): attribute group #3 has no definition in the visible source;
; confirm that this dangling reference is intentional.
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_lane_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
; CHECK-NEXT:    vdot.bf16 q0, q1, d4[0]
; CHECK-NEXT:    bx lr
entry:
  %.cast = bitcast <4 x bfloat> %b to <2 x float>
  %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer
  %.cast1 = bitcast <4 x float> %lane to <8 x bfloat>
  %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3
  ret <4 x float> %vbfdot3.i
}

; bf16 matrix multiply-accumulate.
; The body is a single call to @llvm.arm.neon.bfmmla, taking the <4 x float>
; accumulator and two <8 x bfloat> operands directly, with no bitcasts.
; The assertions below expect exactly one vmmla.bf16 on Q registers followed
; by the return.
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmmlaq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmmla.bf16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
  ret <4 x float> %vbfmmlaq_v3.i
}

; bf16 widening multiply-accumulate, "b" variant, vector by vector.
; The body is a single call to @llvm.arm.neon.bfmlalb, taking the <4 x float>
; accumulator and two <8 x bfloat> operands directly.
; The assertions below expect exactly one vfmab.bf16 on Q registers followed
; by the return.
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfmab.bf16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
  ret <4 x float> %vbfmlalbq_v3.i
}

; bf16 widening multiply-accumulate, "t" variant, vector by vector.
; The body is a single call to @llvm.arm.neon.bfmlalt, taking the <4 x float>
; accumulator and two <8 x bfloat> operands directly.
; The assertions below expect exactly one vfmat.bf16 on Q registers followed
; by the return.
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfmat.bf16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
  ret <4 x float> %vbfmlaltq_v3.i
}

; bf16 widening multiply-accumulate, "b" variant, by lane of a 64-bit vector.
; A widening shuffle with a zero mask splats bf16 lane 0 of %b into all eight
; lanes of the second multiplicand passed to @llvm.arm.neon.bfmlalb.
; The assertions below expect a register "kill" annotation widening d4 to q2,
; then one indexed vfmab.bf16 that reads d4[0].
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
; CHECK-NEXT:    vfmab.bf16 q0, q1, d4[0]
; CHECK-NEXT:    bx lr
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlalbq_v3.i
}

; bf16 widening multiply-accumulate, "b" variant, by lane of a 128-bit vector.
; The shuffle splats bf16 lane 3 of %b into all eight lanes of the second
; multiplicand passed to @llvm.arm.neon.bfmlalb.
; The assertions below expect one indexed vfmab.bf16 that reads d4[3], with
; no separate duplicate instruction.
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfmab.bf16 q0, q1, d4[3]
; CHECK-NEXT:    bx lr
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlalbq_v3.i
}

; bf16 widening multiply-accumulate, "t" variant, by lane of a 64-bit vector.
; A widening shuffle with a zero mask splats bf16 lane 0 of %b into all eight
; lanes of the second multiplicand passed to @llvm.arm.neon.bfmlalt.
; The assertions below expect a register "kill" annotation widening d4 to q2,
; then one indexed vfmat.bf16 that reads d4[0].
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d4 killed $d4 def $q2
; CHECK-NEXT:    vfmat.bf16 q0, q1, d4[0]
; CHECK-NEXT:    bx lr
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlaltq_v3.i
}

; bf16 widening multiply-accumulate, "t" variant, by lane of a 128-bit vector.
; The shuffle splats bf16 lane 3 of %b into all eight lanes of the second
; multiplicand passed to @llvm.arm.neon.bfmlalt.
; The assertions below expect one indexed vfmat.bf16 that reads d4[3], with
; no separate duplicate instruction.
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfmat.bf16 q0, q1, d4[3]
; CHECK-NEXT:    bx lr
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlaltq_v3.i
}

; bf16 widening multiply-accumulate, "t" variant, by lane 6 of a 128-bit vector.
; Same shape as test_vbfmlaltq_laneq_f32, but the shuffle splats bf16 lane 6
; of %b instead of lane 3.
; For this lane the assertions below expect an explicit vdup.16 from d5[2]
; into q8 followed by the non-indexed vfmat.bf16, not the indexed form.
define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_laneq_f32_v2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 q8, d5[2]
; CHECK-NEXT:    vfmat.bf16 q0, q1, q8
; CHECK-NEXT:    bx lr
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
  ret <4 x float> %vbfmlalt1.i
}

; Declarations of the ARM NEON bf16 intrinsics called by the tests in this file.
; Each takes a float accumulator vector plus two bfloat vectors and returns a
; float vector of the accumulator's type. The bfdot intrinsic is overloaded,
; so its name carries the .v2f32.v4bf16 / .v4f32.v8bf16 type suffixes; the
; other three are declared for the 128-bit types only.
declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)