[ARM,CDE] Implement CDE unpredicated Q-register intrinsics

Summary: This patch implements the following intrinsics: uint8x16_t __arm_vcx1q_u8(int coproc, uint32_t imm); T __arm_vcx1qa(int coproc, T acc, uint32_t imm); T __arm_vcx2q(int coproc, T n, uint32_t imm); uint8x16_t __arm_vcx2q_u8(int coproc, T n, uint32_t imm); T __arm_vcx2qa(int coproc, T acc, U n, uint32_t imm); T __arm_vcx3q(int coproc, T n, U m, uint32_t imm); uint8x16_t __arm_vcx3q_u8(int coproc, T n, U m, uint32_t imm); T __arm_vcx3qa(int coproc, T acc, U n, V m, uint32_t imm); Most of them are polymorphic. Furthermore, some intrinsics are polymorphic in 2 or 3 parameter types. Such polymorphism is not supported by the existing MVE/CDE tablegen backends, and we do not want the combinatorial explosion that 1000 different combinations of 3 vector types would cause. Because of this, some intrinsics are implemented as macros that cast the polymorphic arguments to uint8x16_t. The IR intrinsics are even more restricted in terms of types: all MVE vectors are cast to v16i8. Reviewers: simon_tatham, MarkMurrayARM, dmgreen, ostannard Reviewed By: MarkMurrayARM Subscribers: kristof.beyls, hiraditya, danielkiss, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76299
2024-11-26 12:43:36 +01:00 · 2020-03-20 14:01:56 +00:00 · 2020-03-20 14:01:56 +00:00 · 4fa24484ce
commit 4fa24484ce
parent 821822dd9a
3 changed files with 155 additions and 0 deletions
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -1317,4 +1317,20 @@ defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>;
 defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>;
 defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>;

+multiclass CDEVCXVecIntrinsics<list<LLVMType> args> { // args: operand types placed between coproc and imm
+  def "" : Intrinsic< // plain form: (coproc, args..., imm) -> v16i8
+    [llvm_v16i8_ty],
+    !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 1)>]>; // ImmArg on coproc (index 0) and on the trailing imm
+  def a : Intrinsic< // "a" form: (coproc, acc, args..., imm) -> v16i8
+    [llvm_v16i8_ty],
+    !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */],
+                args, [llvm_i32_ty /* imm */]),
+    [IntrNoMem, ImmArg<0>, ImmArg<!add(!size(args), 2)>]>; // the extra acc operand moves imm one index further
+}
+
+defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>; // vcx1q / vcx1qa: no extra vector operands
+defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>; // vcx2q / vcx2qa: one v16i8 operand
+defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>; // vcx3q / vcx3qa: two v16i8 operands
+
 } // end TargetPrefix
--- a/lib/Target/ARM/ARMInstrCDE.td
+++ b/lib/Target/ARM/ARMInstrCDE.td
@@ -581,3 +581,28 @@ let Predicates = [HasCDE, HasFPRegs] in {
            (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
                                 imm_3b:$imm))>;
 }
+
+let Predicates = [HasCDE, HasMVEInt] in { // map the v16i8 vcx*q intrinsics onto the CDE_VCX*_vec instructions
+  def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
+            (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>; // VCX1: no vector input, imm_12b immediate
+  def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
+                                       timm:$imm)),
+            (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
+  // VCX2 / VCX2A: one vector input $n, imm_7b immediate.
+  def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
+            (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
+  def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
+                                       (v16i8 MQPR:$n), timm:$imm)),
+            (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
+                                  imm_7b:$imm))>;
+  // VCX3 / VCX3A: two vector inputs $n and $m, imm_4b immediate.
+  def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
+                                      (v16i8 MQPR:$m), timm:$imm)),
+            (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
+                                 imm_4b:$imm))>;
+  def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
+                                       (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+                                       timm:$imm)),
+            (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
+                                  imm_4b:$imm))>;
+}
--- a/test/CodeGen/Thumb2/cde-vec.ll
+++ b/test/CodeGen/Thumb2/cde-vec.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg) ; (coproc, imm)
+declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg) ; (coproc, acc, imm)
+declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg) ; (coproc, n, imm)
+declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) ; (coproc, acc, n, imm)
+declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) ; (coproc, n, m, imm)
+declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg) ; (coproc, acc, n, m, imm)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() { ; vcx1q with coproc 0 and imm 1111; a single vcx1 is expected
+; CHECK-LABEL: test_vcx1q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx1 p0, q0, #1111
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) { ; vcx1qa on coproc 1 with a <16 x i8> accumulator passed straight through
+; CHECK-LABEL: test_vcx1qa_1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx1a p1, q0, #1112
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112)
+  ret <16 x i8> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) { ; <4 x i32> accumulator bitcast to and from <16 x i8>; only vcx1a is expected
+; CHECK-LABEL: test_vcx1qa_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx1a p0, q0, #1113
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <4 x i32> %acc to <16 x i8>
+  %1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113)
+  %2 = bitcast <16 x i8> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) { ; <8 x half> input bitcast to <16 x i8>; the <16 x i8> result is returned as is
+; CHECK-LABEL: test_vcx2q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx2 p1, q0, q0, #111
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x half> %n to <16 x i8>
+  %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111)
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) { ; <4 x float> in and out via <16 x i8> bitcasts; only vcx2 is expected
+; CHECK-LABEL: test_vcx2q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx2 p1, q0, q0, #112
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <4 x float> %n to <16 x i8>
+  %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112)
+  %2 = bitcast <16 x i8> %1 to <4 x float>
+  ret <4 x float> %2
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) { ; accumulator and input have different vector types; both are bitcast to <16 x i8>
+; CHECK-LABEL: test_vcx2qa:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx2a p0, q0, q1, #113
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <4 x float> %acc to <16 x i8>
+  %1 = bitcast <2 x i64> %n to <16 x i8>
+  %2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113)
+  %3 = bitcast <16 x i8> %2 to <4 x float>
+  ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) { ; two differently typed inputs bitcast to <16 x i8>; the <16 x i8> result is returned as is
+; CHECK-LABEL: test_vcx3q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx3 p0, q0, q0, q1, #11
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x i16> %n to <16 x i8>
+  %1 = bitcast <4 x i32> %m to <16 x i8>
+  %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) { ; result bitcast back to the type of %n; only vcx3 is expected
+; CHECK-LABEL: test_vcx3q:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx3 p1, q0, q0, q1, #12
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <2 x i64> %n to <16 x i8>
+  %1 = bitcast <4 x float> %m to <16 x i8>
+  %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12)
+  %3 = bitcast <16 x i8> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) { ; accumulator plus two bitcast inputs; acc, n, m are expected in q0, q1, q2
+; CHECK-LABEL: test_vcx3qa:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcx3a p1, q0, q1, q2, #13
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x i16> %n to <16 x i8>
+  %1 = bitcast <4 x float> %m to <16 x i8>
+  %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
+  ret <16 x i8> %2
+}