From 30ee20b16b18ba86c450bc695f89e5e948067b87 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky <elena.demikhovsky@intel.com>
Date: Tue, 17 Feb 2015 09:20:12 +0000
Subject: [PATCH] AVX-512: changes in intel_ocl_bi calling conventions

- added mask types v8i1 and v16i1 to possible function parameters
- enabled passing 512-bit vectors in standard CC
- added a test for KNL intel_ocl_bi conventions

llvm-svn: 229482
---
 lib/Target/X86/X86CallingConv.td     |  11 +++
 lib/Target/X86/X86InstrAVX512.td     |  36 +++++++++----
 test/CodeGen/X86/avx512-intel-ocl.ll | 105 +++++++++++++++++++++++++++
 3 files changed, 138 insertions(+), 14 deletions(-)
 create mode 100644 test/CodeGen/X86/avx512-intel-ocl.ll

diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 75a2ec00468..41c759a52ee 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -461,6 +461,10 @@ def CC_X86_32_Common : CallingConv<[
                 CCIfSubtarget<"hasFp256()",
                               CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
 
+  // The first 4 AVX 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64],
+      CCIfSubtarget<"hasAVX512()", CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>>,
+
   // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
 
@@ -468,6 +472,10 @@ def CC_X86_32_Common : CallingConv<[
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
            CCAssignToStack<32, 32>>,
 
+  // 512-bit AVX vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>,
+
   // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
   // passed in the parameter area.
   CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
@@ -626,6 +634,9 @@ def CC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v16f32, v8f64, v16i32, v8i64],
            CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
 
+  // Pass masks in mask registers
+  CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>,
+
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>,
   CCDelegateTo<CC_X86_32_C>
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index dec5ba01542..43c0f45ed11 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1604,14 +1604,14 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1),
 //
 multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                            string OpcodeStr, RegisterClass KRC,
-                           ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
+                           ValueType vvt, X86MemOperand x86memop> {
   let hasSideEffects = 0 in {
     def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
     let mayLoad = 1 in
     def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-               [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
+               [(set KRC:$dst, (vvt (load addr:$src)))]>;
     let mayStore = 1 in
     def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}")>;
@@ -1630,27 +1630,25 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
 }
 
 let Predicates = [HasDQI] in
-  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
-               i8mem>,
+  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>,
                avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
                VEX, PD;
 
 let Predicates = [HasAVX512] in
-  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
-               i16mem>,
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
                avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
                VEX, PS;
 
 let Predicates = [HasBWI] in {
-  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
-               i32mem>, VEX, PD, VEX_W;
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>,
+               VEX, PD, VEX_W;
   defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
                VEX, XD;
 }
 
 let Predicates = [HasBWI] in {
-  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
-               i64mem>, VEX, PS, VEX_W;
+  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+               VEX, PS, VEX_W;
   defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
                VEX, XD, VEX_W;
 }
@@ -1682,24 +1680,34 @@ let Predicates = [HasBWI] in {
 
 let Predicates = [HasDQI] in {
   def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
             (KMOVBmk addr:$dst, VK8:$src)>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (KMOVBkm addr:$src)>;
+}
+let Predicates = [HasAVX512, NoDQI] in {
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
 }
 let Predicates = [HasAVX512] in {
   def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
             (KMOVWmk addr:$dst, VK16:$src)>;
-  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
-            (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
   def : Pat<(i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
-  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
-            (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+  def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
+            (KMOVWkm addr:$src)>;
 }
 let Predicates = [HasBWI] in {
   def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
             (KMOVDmk addr:$dst, VK32:$src)>;
+  def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))),
+            (KMOVDkm addr:$src)>;
 }
 let Predicates = [HasBWI] in {
   def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
             (KMOVQmk addr:$dst, VK64:$src)>;
+  def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))),
+            (KMOVQkm addr:$src)>;
 }
 let Predicates = [HasAVX512] in {
diff --git a/test/CodeGen/X86/avx512-intel-ocl.ll b/test/CodeGen/X86/avx512-intel-ocl.ll
new file mode 100644
index 00000000000..3f2691ba299
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=knl | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck -check-prefix=X64 %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+declare i32 @func_int(i32, i32)
+
+; WIN64-LABEL: testf16_inp
+; WIN64: vaddps {{.*}}, {{%zmm[0-1]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; X32-LABEL: testf16_inp
+; X32: vaddps {{.*}}, {{%zmm[0-1]}}
+; X32: movl %eax, (%esp)
+; X32: call
+; X32: ret
+
+; X64-LABEL: testf16_inp
+; X64: vaddps {{.*}}, {{%zmm[0-1]}}
+; X64: leaq {{.*}}(%rsp), %rdi
+; X64: call
+; X64: ret
+
+; test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %2, %1
+  ret <16 x float> %3
+}
+
+; test calling conventions - preserved registers
+
+; preserved zmm16-
+; WIN64-LABEL: testf16_regs
+; WIN64: call
+; WIN64: vaddps %zmm16, %zmm0, %zmm0
+; WIN64: ret
+
+; preserved zmm16-
+; X64-LABEL: testf16_regs
+; X64: call
+; X64: vaddps %zmm16, %zmm0, %zmm0
+; X64: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+  %y = alloca <16 x float>, align 16
+  %x = fadd <16 x float> %a, %b
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+  %2 = load <16 x float>* %y, align 16
+  %3 = fadd <16 x float> %1, %b
+  %4 = fadd <16 x float> %2, %3
+  ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64-LABEL: test_prolog_epilog
+; WIN64: vmovups %zmm21, {{.*(%rbp).*}} # 64-byte Spill
+; WIN64: vmovups %zmm6, {{.*(%rbp).*}} # 64-byte Spill
+; WIN64: call
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm6 # 64-byte Reload
+; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
+
+; X64-LABEL: test_prolog_epilog
+; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
+; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
+; X64: call
+; X64: vmovups {{.*}}(%rsp), %zmm16 ## 64-byte Reload
+; X64: vmovups {{.*}}(%rsp), %zmm31 ## 64-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+  %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+  ret <16 x float> %c
+}
+
+
+declare <16 x float> @func_float16_mask(<16 x float>, <16 x i1>)
+
+; X64-LABEL: testf16_inp_mask
+; X64: kmovw %edi, %k1
+; X64: call
+define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask) {
+  %imask = bitcast i16 %mask to <16 x i1>
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
+  ret <16 x float> %1
+}
+
+; X64-LABEL: test_prolog_epilog_with_mask
+; X64: kxorw %k{{.*}}, %k{{.*}}, %k1
+; X64: call
+define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a, <16 x i32> %x1, <16 x i32> %x2, <16 x i1> %mask) nounwind {
+  %cmp_res = icmp eq <16 x i32> %x1, %x2
+  %mask1 = xor <16 x i1> %cmp_res, %mask
+  %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %mask1)
+  ret <16 x float> %c
+}
\ No newline at end of file
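-- 

A minimal caller sketch for the new convention (illustrative only, not part
of the commit; @ocl_func and @caller are hypothetical names). Per the
CC_Intel_OCL_BI rules added above, compiling the IR below with llc -mcpu=knl
passes the <16 x float> argument in zmm0 and materializes the <16 x i1> mask
into k1 via kmovw, which is what the testf16_inp_mask checks expect:

  declare <16 x float> @ocl_func(<16 x float>, <16 x i1>)

  define <16 x float> @caller(<16 x float> %v, i16 %m) nounwind {
    ; CC_Intel_OCL_BI assigns the v16i1 mask to k1 and the v16f32 to zmm0
    %mask = bitcast i16 %m to <16 x i1>
    %r = call intel_ocl_bicc <16 x float> @ocl_func(<16 x float> %v, <16 x i1> %mask)
    ret <16 x float> %r
  }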