[X86][XOP] Support for VPERMIL2PD/VPERMIL2PS 2-input shuffle instructions

This patch begins adding support for lowering to the XOP VPERMIL2PD/VPERMIL2PS shuffle instructions - adding the X86ISD::VPERMIL2 opcode and cleaning up the usage. The internal llvm intrinsics were assuming the shuffle mask operand was the same type as the float/double input operands (I guess to simplify the intrinsic definitions in X86InstrXOP.td to a single value type). These needed changing to integer types (matching the clang builtin and the AMD intrinsics definitions), an auto upgrade path is added to convert old calls. Mask decoding/target shuffle support will be added in future patches. Differential Revision: http://reviews.llvm.org/D20049 llvm-svn: 271633
2025-01-31 12:41:49 +01:00 · 2016-06-03 08:06:03 +00:00 · 2016-06-03 08:06:03 +00:00 · 0c614eb08a
commit 0c614eb08a
parent 1981be1e4f
11 changed files with 214 additions and 85 deletions
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@ -3873,23 +3873,23 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".

  def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">,
              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                                          llvm_v2f64_ty, llvm_i8_ty],
+                                          llvm_v2i64_ty, llvm_i8_ty],
                        [IntrNoMem]>;

  def int_x86_xop_vpermil2pd_256 :
              GCCBuiltin<"__builtin_ia32_vpermil2pd256">,
              Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                                          llvm_v4f64_ty, llvm_i8_ty],
+                                          llvm_v4i64_ty, llvm_i8_ty],
                        [IntrNoMem]>;

  def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">,
              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                                          llvm_v4f32_ty, llvm_i8_ty],
+                                          llvm_v4i32_ty, llvm_i8_ty],
                        [IntrNoMem]>;
  def int_x86_xop_vpermil2ps_256 :
              GCCBuiltin<"__builtin_ia32_vpermil2ps256">,
              Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                                          llvm_v8f32_ty, llvm_i8_ty],
+                                          llvm_v8i32_ty, llvm_i8_ty],
                        [IntrNoMem]>;

  def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@ -282,6 +282,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
      NewFn = F;
      return true;
    }
+    // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
+    if (Name.startswith("x86.xop.vpermil2")) {
+      auto Params = F->getFunctionType()->params();
+      auto Idx = Params[2];
+      if (Idx->getScalarType()->isFloatingPointTy()) {
+        F->setName(Name + ".old");
+        unsigned IdxSize = Idx->getPrimitiveSizeInBits();
+        unsigned EltSize = Idx->getScalarSizeInBits();
+        Intrinsic::ID Permil2ID;
+        if (EltSize == 64 && IdxSize == 128)
+          Permil2ID = Intrinsic::x86_xop_vpermil2pd;
+        else if (EltSize == 32 && IdxSize == 128)
+          Permil2ID = Intrinsic::x86_xop_vpermil2ps;
+        else if (EltSize == 64 && IdxSize == 256)
+          Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
+        else
+          Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
+        NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
+        return true;
+      }
+    }
    break;
  }
  }
@ -911,6 +932,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
    CI->eraseFromParent();
    return;

+  case Intrinsic::x86_xop_vpermil2pd:
+  case Intrinsic::x86_xop_vpermil2ps:
+  case Intrinsic::x86_xop_vpermil2pd_256:
+  case Intrinsic::x86_xop_vpermil2ps_256: {
+    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+    VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
+    VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
+    Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
+    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
+    CI->eraseFromParent();
+    return;
+  }
+
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestnzc: {
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -21947,6 +21947,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
+  case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
  case X86ISD::FMADD:              return "X86ISD::FMADD";
  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -451,6 +451,8 @@ namespace llvm {
      VPCOM, VPCOMU,
      // XOP packed permute bytes.
      VPPERM,
+      // XOP two source permutation.
+      VPERMIL2,

      // Vector multiply packed unsigned doubleword integers.
      PMULUDQ,
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@ -245,7 +245,12 @@ def X86vpcomu  : SDNode<"X86ISD::VPCOMU",
                        SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                             SDTCisSameAs<0,2>,
                                             SDTCisVT<3, i8>]>>;
-
+def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
+                        SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>,
+                                             SDTCisSameSizeAs<0,3>,
+                                             SDTCisSameNumEltsAs<0, 3>,
+                                             SDTCisVT<4, i8>]>>;
 def X86vpperm : SDNode<"X86ISD::VPPERM",
                        SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                             SDTCisSameAs<0,2>]>>;
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@ -342,27 +342,34 @@ let Predicates = [HasXOP] in {
            (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
 }

-multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
-                  Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                  ValueType vt128, ValueType vt256,
+                  ValueType id128, ValueType id256,
+                  PatFrag ld_128, PatFrag ld_256> {
  def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR128:$dst,
-           (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
  def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
-        (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4),
+        (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR128:$dst,
-           (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
+                          (id128 (bitconvert (loadv2i64 addr:$src3))),
+                          (i8 imm:$src4))))]>,
        VEX_W, MemOp4;
  def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
        (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR128:$dst,
-           (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+           (vt128 (OpNode (vt128 VR128:$src1),
+                          (vt128 (bitconvert (ld_128 addr:$src2))),
+                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
  // For disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
@ -376,21 +383,24 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR256:$dst,
-          (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
  def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
-        (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4),
+        (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR256:$dst,
-          (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
-        VEX_W, MemOp4, VEX_L;
+           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
+                          (id256 (bitconvert (loadv4i64 addr:$src3))),
+                          (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L;
  def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
        (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
        !strconcat(OpcodeStr,
        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
        [(set VR256:$dst,
-           (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
-        VEX_L;
+           (vt256 (OpNode (vt256 VR256:$src1),
+                          (vt256 (bitconvert (ld_256 addr:$src2))),
+                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
  // For disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rrY_REV : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
@ -401,10 +411,10 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
 }

 let ExeDomain = SSEPackedDouble in
-  defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
-                           int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
+  defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
+                           v2i64, v4i64, loadv2f64, loadv4f64>;

 let ExeDomain = SSEPackedSingle in
-  defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
-                           int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
+  defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
+                           v4i32, v8i32, loadv4f32, loadv8f32>;

--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@ -2234,6 +2234,10 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
  X86_INTRINSIC_DATA(xop_vpcomuq,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
  X86_INTRINSIC_DATA(xop_vpcomuw,       INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
  X86_INTRINSIC_DATA(xop_vpcomw,        INTR_TYPE_3OP, X86ISD::VPCOM, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2pd,     INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2ps,     INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
+  X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
  X86_INTRINSIC_DATA(xop_vpperm,        INTR_TYPE_3OP, X86ISD::VPPERM, 0),
  X86_INTRINSIC_DATA(xop_vprotb,        INTR_TYPE_2OP, X86ISD::VPROT, 0),
  X86_INTRINSIC_DATA(xop_vprotbi,       INTR_TYPE_2OP, X86ISD::VPROTI, 0),
--- a/test/CodeGen/X86/stack-folding-xop.ll
+++ b/test/CodeGen/X86/stack-folding-xop.ll
@ -166,69 +166,69 @@ define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
 }
 declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone

-define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2pd_rm
  ;CHECK:       vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0)
+  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
  ret <2 x double> %2
 }
-define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2pd_mr
  ;CHECK:       vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0)
+  %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
  ret <2 x double> %2
 }
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone

-define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2pd_rm
  ;CHECK:       vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0)
+  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
  ret <4 x double> %2
 }
-define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2pd_mr
  ;CHECK:       vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0)
+  %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
  ret <4 x double> %2
 }
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone

-define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2ps_rm
  ;CHECK:       vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0)
+  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
  ret <4 x float> %2
 }
-define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2ps_mr
  ;CHECK:       vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0)
+  %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
  ret <4 x float> %2
 }
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone

-define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2ps_rm
  ;CHECK:       vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0)
+  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
  ret <8 x float> %2
 }
-define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
  ;CHECK-LABEL: stack_fold_vpermil2ps_mr
  ;CHECK:       vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0)
+  %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
  ret <8 x float> %2
 }
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone

 define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
  ;CHECK-LABEL: stack_fold_vphaddbd
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@ -2,63 +2,60 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s

-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone

-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone

 declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

 define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: combine_vpermil2pd_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    vmovq %rax, %xmm2
 ; CHECK-NEXT:    vpermil2pd $0, %xmm2, %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vpermil2pd $0, %xmm2, %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %mask = bitcast <2 x i64> <i64 2, i64 0> to <2 x double>
-  %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x double> %mask, i8 0)
-  %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x double> %mask, i8 0)
+  %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
+  %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> <i64 2, i64 0>, i8 0)
  ret <2 x double> %res1
 }

 define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: combine_vpermil2pd256_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [9.881313e-324,0.000000e+00,9.881313e-324,0.000000e+00]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [2,0,2,0]
 ; CHECK-NEXT:    vpermil2pd $0, %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vpermil2pd $0, %ymm2, %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %mask = bitcast <4 x i64> <i64 2, i64 0, i64 2, i64 0> to <4 x double>
-  %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x double> %mask, i8 0)
-  %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x double> %mask, i8 0)
+  %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
+  %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
  ret <4 x double> %res1
 }

 define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: combine_vpermil2ps_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [3,2,1,0]
 ; CHECK-NEXT:    vpermil2ps $0, %xmm2, %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vpermil2ps $0, %xmm2, %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %mask = bitcast <4 x i32> <i32 3, i32 2, i32 1, i32 0> to <4 x float>
-  %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x float> %mask, i8 0)
-  %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x float> %mask, i8 0)
+  %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
+  %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
  ret <4 x float> %res1
 }

 define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
 ; CHECK-LABEL: combine_vpermil2ps256_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00,1.401298e-45,0.000000e+00,4.203895e-45,2.802597e-45]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,2,1,0,1,0,3,2]
 ; CHECK-NEXT:    vpermil2ps $0, %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vpermil2ps $0, %ymm2, %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %mask = bitcast <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2> to <8 x float>
-  %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x float> %mask, i8 0)
-  %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x float> %mask, i8 0)
+  %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
+  %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
  ret <8 x float> %res1
 }

@ -67,8 +64,7 @@ define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x flo
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2ps $2, {{.*}}(%rip), %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %mask = bitcast <4 x i32> <i32 8, i32 1, i32 2, i32 3> to <4 x float>
-  %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %mask, i8 2)
+  %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
  ret <4 x float> %res0
 }

--- a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@ -1,6 +1,82 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s

+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ;  [#uses=1]
+  ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %vec = load <2 x double>, <2 x double>* %a1
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ;  [#uses=1]
+  ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %vec = load <2 x double>, <2 x double>* %a2
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ;  [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+  ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %vec = load <4 x double>, <4 x double>* %a1
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+  ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %vec = load <4 x double>, <4 x double>* %a2
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
 define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: test_int_x86_xop_vpcomeqb:
 ; CHECK:       # BB#0:
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@ -1,81 +1,81 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s

-define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ;  [#uses=1]
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ;  [#uses=1]
  ret <2 x double> %res
 }
-define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %a1
-  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ;  [#uses=1]
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x i64> %a2, i8 1) ;  [#uses=1]
  ret <2 x double> %res
 }
-define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %vec = load <2 x double>, <2 x double>* %a2
-  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ;  [#uses=1]
+  %vec = load <2 x i64>, <2 x i64>* %a2
+  %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %vec, i8 1) ;  [#uses=1]
  ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone

-define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ;
  ret <4 x double> %res
 }
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT:    retq
  %vec = load <4 x double>, <4 x double>* %a1
-  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x i64> %a2, i8 2) ;
  ret <4 x double> %res
 }
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %vec = load <4 x double>, <4 x double>* %a2
-  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+  %vec = load <4 x i64>, <4 x i64>* %a2
+  %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %vec, i8 2) ;
  ret <4 x double> %res
 }
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone

-define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+  %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ;
  ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone

-define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+  %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ;
  ret <8 x float> %res
 }
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone

 define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov: