Add 3DNow! intrinsics.

llvm-svn: 129551
2024-11-24 11:42:57 +01:00 · 2011-04-15 00:32:41 +00:00 · 2011-04-15 00:32:41 +00:00 · 05b07faeaf
commit 05b07faeaf
parent 197d67a987
4 changed files with 451 additions and 51 deletions
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@ -17,6 +17,83 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_int : Intrinsic<[], [llvm_i8_ty]>;
 }

+//===----------------------------------------------------------------------===//
+// 3DNow!
+
+// Base 3DNow! intrinsics (int_x86_3dnow_*). Each def ties a
+// __builtin_ia32_* GCC builtin name to an intrinsic that takes one or two
+// x86_mmx operands, returns an x86_mmx value and is declared IntrNoMem.
+// Single-operand entries: pf2id, pfrcp, pfrsqrt, pi2fd; all others take two.
+let TargetPrefix = "x86" in {
+  def int_x86_3dnow_pavgusb : GCCBuiltin<"__builtin_ia32_pavgusb">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pf2id : GCCBuiltin<"__builtin_ia32_pf2id">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfacc : GCCBuiltin<"__builtin_ia32_pfacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfadd : GCCBuiltin<"__builtin_ia32_pfadd">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpeq : GCCBuiltin<"__builtin_ia32_pfcmpeq">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpge : GCCBuiltin<"__builtin_ia32_pfcmpge">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfcmpgt : GCCBuiltin<"__builtin_ia32_pfcmpgt">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmax : GCCBuiltin<"__builtin_ia32_pfmax">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmin : GCCBuiltin<"__builtin_ia32_pfmin">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfmul : GCCBuiltin<"__builtin_ia32_pfmul">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrcp : GCCBuiltin<"__builtin_ia32_pfrcp">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfrcpit1 : GCCBuiltin<"__builtin_ia32_pfrcpit1">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrcpit2 : GCCBuiltin<"__builtin_ia32_pfrcpit2">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfrsqrt : GCCBuiltin<"__builtin_ia32_pfrsqrt">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pfrsqit1 : GCCBuiltin<"__builtin_ia32_pfrsqit1">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfsub : GCCBuiltin<"__builtin_ia32_pfsub">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pfsubr : GCCBuiltin<"__builtin_ia32_pfsubr">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnow_pi2fd : GCCBuiltin<"__builtin_ia32_pi2fd">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnow_pmulhrw : GCCBuiltin<"__builtin_ia32_pmulhrw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+}
+
+//===----------------------------------------------------------------------===//
+// 3DNow! extensions
+
+// Extended 3DNow! intrinsics (int_x86_3dnowa_*). Same shape as the base set:
+// one or two x86_mmx operands, an x86_mmx result, and IntrNoMem.
+let TargetPrefix = "x86" in {
+  def int_x86_3dnowa_pf2iw : GCCBuiltin<"__builtin_ia32_pf2iw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  def int_x86_3dnowa_pfnacc : GCCBuiltin<"__builtin_ia32_pfnacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnowa_pfpnacc : GCCBuiltin<"__builtin_ia32_pfpnacc">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
+                        [IntrNoMem]>;
+  def int_x86_3dnowa_pi2fw : GCCBuiltin<"__builtin_ia32_pi2fw">,
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+  // pswapd is the only entry without a GCCBuiltin mapping.
+  // NOTE(review): presumably because a single intrinsic serves more than one
+  // builtin signature (the tests call it for both <2 x float> and <2 x i32>)
+  // -- confirm against the front end.
+  def int_x86_3dnowa_pswapd :
+              Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE1

--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@ -50,7 +50,8 @@ def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
                                      "Enable SSE 4.2 instructions",
                                      [FeatureSSE41, FeaturePOPCNT]>;
+// Feature3DNow lists FeatureMMX as an implied feature, so processors that
+// name Feature3DNow (or Feature3DNowA, which implies it) need not list
+// FeatureMMX separately.
 def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
-                                      "Enable 3DNow! instructions">;
+                                      "Enable 3DNow! instructions",
+                                      [FeatureMMX]>;
 def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
                                      "Enable 3DNow! Athlon instructions",
                                      [Feature3DNow]>;
@ -125,10 +126,10 @@ def : Proc<"sandybridge",     [FeatureSSE42, Feature64Bit,
                               FeatureAES, FeatureCLMUL]>;

 def : Proc<"k6",              [FeatureMMX]>;
-def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
-def : Proc<"k6-3",            [FeatureMMX,    Feature3DNow]>;
-def : Proc<"athlon",          [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird",    [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
+// FeatureMMX is not listed explicitly for these processors: Feature3DNow
+// implies it, and Feature3DNowA implies Feature3DNow.
+def : Proc<"k6-2",            [Feature3DNow]>;
+def : Proc<"k6-3",            [Feature3DNow]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
 def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
@ -156,8 +157,8 @@ def : Proc<"shanghai",        [Feature3DNowA, Feature64Bit, FeatureSSE4A,
                               Feature3DNowA]>;

 def : Proc<"winchip-c6",      [FeatureMMX]>;
-def : Proc<"winchip2",        [FeatureMMX, Feature3DNow]>;
-def : Proc<"c3",              [FeatureMMX, Feature3DNow]>;
+// FeatureMMX is implied by Feature3DNow, so it is not repeated here.
+def : Proc<"winchip2",        [Feature3DNow]>;
+def : Proc<"c3",              [Feature3DNow]>;
 def : Proc<"c3-2",            [FeatureSSE1]>;

 //===----------------------------------------------------------------------===//
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@ -12,50 +12,76 @@
 //
 //===----------------------------------------------------------------------===//

-// FIXME: We don't support any intrinsics for these instructions yet.
-
-class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, 
-             list<dag> pattern>
-      : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> {
+// Common base for 3DNow! instructions: forwards opcode, format, operand lists,
+// assembly string and selection pattern to I, and adds the TB mix-in plus a
+// Has3DNow requirement.
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
 }

-class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic>
-      : I<o, F, (outs VR64:$dst), ins,
-          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>,
-          TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode {
+// Two-source 3DNow! instruction producing VR64:$dst. $src1 is tied to $dst by
+// the constraint below, so the assembly string prints only $src2 and $dst.
+// The caller supplies the input operand list and the selection pattern.
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
+}
+
+// Single-source 3DNow! instruction producing VR64:$dst. Unlike I3DNow_binop
+// there is no tied-operand constraint; the assembly string refers to the input
+// as $src, so instantiations must name their input operand $src.
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
   // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
   let isAsmParserOnly = 1;
  }

-
-let Constraints = "$src1 = $dst" in {
-  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
-  // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+// Pattern-less two-source instructions: "rr" takes VR64:$src2, "rm" takes
+// i64mem:$src2. Both pass an empty selection pattern.
+// NOTE(review): none of the defm lines visible in this change instantiate
+// this multiclass any more -- confirm whether it is still needed.
 multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
-    def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>;
-    def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>;
-  }
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
  }

-defm PAVGUSB  : I3DNow_binop_rm<0xBF, "pavgusb">;
-defm PF2ID    : I3DNow_binop_rm<0x1D, "pf2id">;
-defm PFACC    : I3DNow_binop_rm<0xAE, "pfacc">;
-defm PFADD    : I3DNow_binop_rm<0x9E, "pfadd">;
-defm PFCMPEQ  : I3DNow_binop_rm<0xB0, "pfcmpeq">;
-defm PFCMPGE  : I3DNow_binop_rm<0x90, "pfcmpge">;
-defm PFCMPGT  : I3DNow_binop_rm<0xA0, "pfcmpgt">;
-defm PFMAX    : I3DNow_binop_rm<0xA4, "pfmax">;
-defm PFMIN    : I3DNow_binop_rm<0x94, "pfmin">;
-defm PFMUL    : I3DNow_binop_rm<0xB4, "pfmul">;
-defm PFRCP    : I3DNow_binop_rm<0x96, "pfrcp">;
-defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">;
-defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">;
-defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">;
-defm PFRSQRT  : I3DNow_binop_rm<0x97, "pfrsqrt">;
-defm PFSUB    : I3DNow_binop_rm<0x9A, "pfsub">;
-defm PFSUBR   : I3DNow_binop_rm<0xAA, "pfsubr">;
-defm PI2FD    : I3DNow_binop_rm<0x0D, "pi2fd">;
-defm PMULHRW  : I3DNow_binop_rm<0xB7, "pmulhrw">;
+// Two-source instructions selected from an intrinsic. The intrinsic record is
+// looked up by name: "int_x86_3dnow" # Ver # "_" # Mn, so the mnemonic must
+// match the intrinsic suffix and Ver = "a" selects the int_x86_3dnowa_* set.
+//   rr: both sources in VR64 registers.
+//   rm: second source is a load_mmx from addr:$src2, wrapped in bitconvert.
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+// Pattern-less single-source instructions: "rr" reads a VR64 register, "rm"
+// reads an i64mem operand. The input operand is named $src because the
+// I3DNow_conv assembly string refers to $src (not $src1).
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, []>;
+}
+
+// Single-source instructions selected from an intrinsic. The intrinsic record
+// is looked up by name: "int_x86_3dnow" # Ver # "_" # Mn, so the mnemonic must
+// match the intrinsic suffix and Ver = "a" selects the int_x86_3dnowa_* set.
+//   rr: source in a VR64 register.
+//   rm: source is a load_mmx from addr:$src, wrapped in bitconvert.
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
+}
+
+// Base 3DNow! instructions. Arguments are the opcode byte and the mnemonic;
+// the mnemonic is also used to find the matching int_x86_3dnow_* intrinsic.
+// *_conv_* entries take one source operand, *_binop_* entries take two.
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;


 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
@ -64,14 +90,13 @@ def PREFETCH  : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr),
                       "prefetch $addr", []>;

 // FIXME: Disassembler gets a bogus decode conflict.
-let isAsmParserOnly = 1 in {
+let isAsmParserOnly = 1 in
 def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr),
                       "prefetchw $addr", []>;
-}

 // "3DNowA" instructions
-defm PF2IW    : I3DNow_binop_rm<0x1C, "pf2iw">;
-defm PI2FW    : I3DNow_binop_rm<0x0C, "pi2fw">;
-defm PFNACC   : I3DNow_binop_rm<0x8A, "pfnacc">;
-defm PFPNACC  : I3DNow_binop_rm<0x8E, "pfpnacc">;
-defm PSWAPD   : I3DNow_binop_rm<0xBB, "pswapd">;
+// The third argument "a" makes the multiclasses look up int_x86_3dnowa_*
+// intrinsics instead of int_x86_3dnow_*.
+// NOTE(review): these still inherit Requires<[Has3DNow]> from I3DNow, so they
+// are selectable without the 3dnowa feature -- confirm whether a 3DNowA
+// predicate should be required instead.
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
--- a/test/CodeGen/X86/3dnow-intrinsics.ll
+++ b/test/CodeGen/X86/3dnow-intrinsics.ll
@ -0,0 +1,297 @@
+; RUN: llc < %s -march=x86 -mattr=+3dnow | FileCheck %s
+
+define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pavgusb:" label, so it passes even if nothing is selected.
+; CHECK: pavgusb {{.*}}%mm
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <8 x i8>
+  %1 = bitcast x86_mmx %b.coerce to <8 x i8>
+  %2 = bitcast <8 x i8> %0 to x86_mmx
+  %3 = bitcast <8 x i8> %1 to x86_mmx
+  %4 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %2, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to <8 x i8>
+  ret <8 x i8> %5
+}
+
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pf2id(<2 x float> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pf2id:" label, so it passes even if nothing is selected.
+; CHECK: pf2id {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfacc:" label, so it passes even if nothing is selected.
+; CHECK: pfacc {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfadd(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfadd:" label, so it passes even if nothing is selected.
+; CHECK: pfadd {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpeq(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfcmpeq:" label, so it passes even if nothing is selected.
+; CHECK: pfcmpeq {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpge(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfcmpge:" label, so it passes even if nothing is selected.
+; CHECK: pfcmpge {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pfcmpgt(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfcmpgt:" label, so it passes even if nothing is selected.
+; CHECK: pfcmpgt {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x i32>
+  ret <2 x i32> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmax(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfmax:" label, so it passes even if nothing is selected.
+; CHECK: pfmax {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmin(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfmin:" label, so it passes even if nothing is selected.
+; CHECK: pfmin {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfmul(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfmul:" label, so it passes even if nothing is selected.
+; CHECK: pfmul {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcp(<2 x float> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfrcp:" label (and is a prefix of pfrcpit1/pfrcpit2).
+; CHECK: pfrcp {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcpit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfrcpit1:" label, so it passes even if nothing is selected.
+; CHECK: pfrcpit1 {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrcpit2(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfrcpit2:" label, so it passes even if nothing is selected.
+; CHECK: pfrcpit2 {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrsqrt(<2 x float> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfrsqrt:" label, so it passes even if nothing is selected.
+; CHECK: pfrsqrt {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfrsqit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfrsqit1:" label, so it passes even if nothing is selected.
+; CHECK: pfrsqit1 {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfsub(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfsub:" label (and is a prefix of pfsubr).
+; CHECK: pfsub {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfsubr(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfsubr:" label, so it passes even if nothing is selected.
+; CHECK: pfsubr {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pi2fd:" label, so it passes even if nothing is selected.
+; CHECK: pi2fd {{.*}}%mm
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
+
+define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pmulhrw:" label, so it passes even if nothing is selected.
+; CHECK: pmulhrw {{.*}}%mm
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <4 x i16>
+  %1 = bitcast x86_mmx %b.coerce to <4 x i16>
+  %2 = bitcast <4 x i16> %0 to x86_mmx
+  %3 = bitcast <4 x i16> %1 to x86_mmx
+  %4 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %2, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to <4 x i16>
+  ret <4 x i16> %5
+}
+
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x i32> @test_pf2iw(<2 x float> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pf2iw:" label, so it passes even if nothing is selected.
+; CHECK: pf2iw {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfnacc:" label, so it passes even if nothing is selected.
+; CHECK: pfnacc {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pfpnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pfpnacc:" label, so it passes even if nothing is selected.
+; CHECK: pfpnacc {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = bitcast <2 x float> %b to x86_mmx
+  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %0, x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pi2fw:" label, so it passes even if nothing is selected.
+; CHECK: pi2fw {{.*}}%mm
+entry:
+  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to x86_mmx
+  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %1)
+  %3 = bitcast x86_mmx %2 to <2 x float>
+  ret <2 x float> %3
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
+
+define <2 x float> @test_pswapdsf(<2 x float> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pswapdsf:" label, so it passes even if nothing is selected.
+; CHECK: pswapd {{.*}}%mm
+entry:
+  %0 = bitcast <2 x float> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x float>
+  ret <2 x float> %2
+}
+
+define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
+; Require the instruction with an MMX register operand; the bare mnemonic also
+; matches the "test_pswapdsi:" label, so it passes even if nothing is selected.
+; CHECK: pswapd {{.*}}%mm
+entry:
+  %0 = bitcast <2 x i32> %a to x86_mmx
+  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
+  %2 = bitcast x86_mmx %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone