From 2d458d2ff7fd2f7e4aaad89baa1cec25528c3b73 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 4 Jul 2016 14:19:05 +0000 Subject: [PATCH] [X86][AVX512] Autoupgrade the VPERMPD/VPERMQ intrinsics llvm-svn: 274506 --- lib/IR/AutoUpgrade.cpp | 18 ++++++++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 40 ++++++++++++++++ test/CodeGen/X86/avx512-intrinsics.ll | 40 ---------------- .../X86/avx512vl-intrinsics-upgrade.ll | 46 +++++++++++++++++++ test/CodeGen/X86/avx512vl-intrinsics.ll | 45 ------------------ 5 files changed, 104 insertions(+), 85 deletions(-) diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 65cc2d4f858..160b5cadcbf 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -226,6 +226,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.avx512.mask.pshufl.w.") || Name.startswith("x86.avx512.mask.pshufh.w.") || Name.startswith("x86.avx512.mask.vpermil.p") || + Name.startswith("x86.avx512.mask.perm.df.") || + Name.startswith("x86.avx512.mask.perm.di.") || Name.startswith("x86.avx512.mask.punpckl") || Name.startswith("x86.avx512.mask.punpckh") || Name.startswith("x86.avx512.mask.unpckl.") || @@ -1006,6 +1008,22 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, UndefV, Idxs); } else if (Name == "llvm.stackprotectorcheck") { Rep = nullptr; + } else if (Name.startswith("llvm.x86.avx512.mask.perm.df.") || + Name.startswith("llvm.x86.avx512.mask.perm.di.")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + VectorType *VecTy = cast(CI->getType()); + unsigned NumElts = VecTy->getNumElements(); + + SmallVector Idxs(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3); + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + if (CI->getNumArgOperands() == 4) + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); } else if (Name.startswith("llvm.x86.avx.vpermil.") || Name == "llvm.x86.sse2.pshuf.d" || Name.startswith("llvm.x86.avx512.mask.vpermil.p") || diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 0cc8a582805..37a5116ce45 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -61,6 +61,46 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ret <8 x double> %res4 } +declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) + %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) + %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) + %res3 = add <8 x i64> %res, %res1 + %res4 = add <8 x i64> %res3, %res2 + ret <8 x i64> %res4 +} + define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { ; CHECK-LABEL: test_store1: ; CHECK: ## BB#0: diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index d9f14b292ec..bc1f33488ca 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -6306,46 +6306,6 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> ret <8 x i64> %res4 } -declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpermpd {{.*#+}} zmm2 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 -} - -declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpermq {{.*#+}} zmm2 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,0,0,7,4,4,4] -; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 -} - declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 3eeb85db8c5..eccb578b4ca 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -230,6 +230,52 @@ define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x ret <4 x float> %res4 } +declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x01,0xd0,0x03] +; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0] +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] +; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] +; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0] +; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) + %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) + %res3 = fadd <4 x double> %res, %res1 + %res4 = fadd <4 x double> %res3, %res2 + ret <4 x double> %res4 +} + +declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x00,0xd0,0x03] +; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0] +; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] +; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] +; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] +; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0] +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) + %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) + %res3 = add <4 x i64> %res, %res1 + %res4 = add <4 x i64> %res3, %res2 + ret <4 x i64> %res4 +} + declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8) define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) { diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 88a80d6ed49..e60e774c111 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -7977,51 +7977,6 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> ret <4 x i64> %res4 } -declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8) - -define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xd0,0x03] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x01,0xc0,0x03] -; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xca] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res3, %res2 - ret <4 x double> %res4 -} - -declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] -; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xd0,0x03] -; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[3,0,0,0] -; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x00,0xc0,0x03] -; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0] -; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 -} declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {