
AVX-512: changes in intel_ocl_bi calling conventions

- added the mask types v8i1 and v16i1 to the set of possible function parameter types
- enabled passing 512-bit vectors in the standard calling convention
- added a test for the KNL intel_ocl_bi conventions

llvm-svn: 229482
Author: Elena Demikhovsky, 2015-02-17 09:20:12 +00:00
parent 6f0aac0467
commit 30ee20b16b
3 changed files with 138 additions and 14 deletions
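
In IR terms, a minimal sketch of what the new conventions permit (the function names below are illustrative, not from the patch; only the types and the intel_ocl_bicc convention come from this commit):

; Hypothetical callee taking a 512-bit vector plus a v16i1 mask argument.
declare intel_ocl_bicc <16 x float> @f(<16 x float>, <16 x i1>)

define <16 x float> @caller(<16 x float> %a, i16 %m) {
  ; the i16 becomes a v16i1, which the new rules assign to a mask register
  %mask = bitcast i16 %m to <16 x i1>
  %r = call intel_ocl_bicc <16 x float> @f(<16 x float> %a, <16 x i1> %mask)
  ret <16 x float> %r
}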

lib/Target/X86/X86CallingConv.td

@@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfSubtarget<"hasFp256()",
CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
// The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>,
// Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
@@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
CCAssignToStack<32, 32>>,
// AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
CCAssignToStack<64, 64>>,
// __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
// passed in the parameter area.
CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
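
As a rough sketch of what the CC_X86_32_Common hunks above enable (assuming a 32-bit AVX-512 target, e.g. -mtriple=i686 -mcpu=knl): the first four 512-bit vector arguments should now arrive in zmm0-zmm3, and any further ones fall through to 64-byte-aligned stack slots.

; Sketch: %a and %b are expected in zmm0 and zmm1 under the standard
; 32-bit convention after this change; this is not a test from the patch.
define <16 x float> @add512(<16 x float> %a, <16 x float> %b) nounwind {
  %s = fadd <16 x float> %a, %b
  ret <16 x float> %s
}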
@@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
CCIfType<[v16f32, v8f64, v16i32, v8i64],
CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
// Pass masks in mask registers
CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
CCDelegateTo<CC_X86_32_C>
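
The practical effect of the new CC_Intel_OCL_BI rule, sketched as a declaration (the name is hypothetical): a v16i1 or v8i1 argument now travels in the mask register k1 rather than in a GPR or on the stack.

; Hypothetical declaration: the <8 x i1> argument should be assigned to k1.
declare intel_ocl_bicc <8 x double> @masked_op(<8 x double>, <8 x i1>)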

lib/Target/X86/X86InstrAVX512.td

@@ -1604,14 +1604,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
//
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
ValueType vvt, X86MemOperand x86memop> {
let hasSideEffects = 0 in {
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
let mayLoad = 1 in
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
[(set KRC:$dst, (vvt (load addr:$src)))]>;
let mayStore = 1 in
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
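
The multiclass change above removes the separate integer type parameter (ivt) and its bitconvert, so the km instruction now pattern-matches a direct load of the mask vector type. A sketch of IR in that shape (hypothetical function, using this commit's pre-3.7 load syntax; whether a front end emits this form depends on legalization):

; Sketch: a direct v16i1 load, which the simplified km pattern can select
; as a kmovw from memory.
define <16 x i1> @load_mask(<16 x i1>* %p) {
  %m = load <16 x i1>* %p
  ret <16 x i1> %m
}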
@@ -1631,27 +1631,25 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
}
let Predicates = [HasDQI] in
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
i8mem>,
defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
VEX, PD;
let Predicates = [HasAVX512] in
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
i16mem>,
defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
VEX, PS;
let Predicates = [HasBWI] in {
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
i32mem>, VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32mem>,
VEX, PD, VEX_W;
defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
VEX, XD;
}
let Predicates = [HasBWI] in {
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
i64mem>, VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
VEX, PS, VEX_W;
defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
VEX, XD, VEX_W;
}
@@ -1682,24 +1680,34 @@ let Predicates = [HasBWI] in {
let Predicates = [HasDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVBmk addr:$dst, VK8:$src)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(KMOVBkm addr:$src)>;
}
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
}
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
(KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
def : Pat<(i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
(KMOVDmk addr:$dst, VK32:$src)>;
def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
(KMOVDkm addr:$src)>;
}
let Predicates = [HasBWI] in {
def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
(KMOVQmk addr:$dst, VK64:$src)>;
def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
(KMOVQkm addr:$src)>;
}
let Predicates = [HasAVX512] in {
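
The integer-to-mask bitconvert cases dropped from the multiclass reappear above as standalone patterns. A sketch of the shape the v16i1 pattern selects (hypothetical function, same era of load syntax as the test below):

; Sketch: an i16 load reinterpreted as a mask; the new Pat lowers this
; to a single kmovw from memory.
define <16 x i1> @load_mask16(i16* %p) {
  %x = load i16* %p
  %m = bitcast i16 %x to <16 x i1>
  ret <16 x i1> %m
}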

test/CodeGen/X86/avx512-intel-ocl.ll

@@ -0,0 +1,105 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
declare <16 x float> @func_float16(<16 x float>, <16 x float>)
declare i32 @func_int(i32, i32)
; WIN64-LABEL: testf16_inp
; WIN64: vaddps {{.*}}, {{%zmm[0-1]}}
; WIN64: leaq {{.*}}(%rsp), %rcx
; WIN64: call
; WIN64: ret
; X32-LABEL: testf16_inp
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
; X32: movl %eax, (%esp)
; X32: call
; X32: ret
; X64-LABEL: testf16_inp
; X64: vaddps {{.*}}, {{%zmm[0-1]}}
; X64: leaq {{.*}}(%rsp), %rdi
; X64: call
; X64: ret
;test calling conventions - input parameters
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %2, %1
ret <16 x float> %3
}
;test calling conventions - preserved registers
; preserved zmm16-
; WIN64-LABEL: testf16_regs
; WIN64: call
; WIN64: vaddps %zmm16, %zmm0, %zmm0
; WIN64: ret
; preserved zmm16-
; X64-LABEL: testf16_regs
; X64: call
; X64: vaddps %zmm16, %zmm0, %zmm0
; X64: ret
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %1, %b
%4 = fadd <16 x float> %2, %3
ret <16 x float> %4
}
; test calling conventions - prolog and epilog
; WIN64-LABEL: test_prolog_epilog
; WIN64: vmovups %zmm21, {{.*(%rbp).*}} # 64-byte Spill
; WIN64: vmovups %zmm6, {{.*(%rbp).*}} # 64-byte Spill
; WIN64: call
; WIN64: vmovups {{.*(%rbp).*}}, %zmm6 # 64-byte Reload
; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
; X64-LABEL: test_prolog_epilog
; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
; X64: call
; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
ret <16 x float> %c
}
declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
; X64-LABEL: testf16_inp_mask
; X64: kmovw %edi, %k1
; X64: call
define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
%imask = bitcast i16 %mask to <16 x i1>
%1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
ret <16 x float> %1
}
; X64-LABEL: test_prolog_epilog_with_mask
; X64: kxorw %k{{.*}}, %k{{.*}}, %k1
; X64: call
define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32>%x2, <16 x i1> %mask) nounwind {
%cmp_res = icmp eq <16 x i32>%x1, %x2
%mask1 = xor <16 x i1> %cmp_res, %mask
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
ret <16 x float> %c
}