Lower the SSE/SSE2 MOVNT intrinsics to nontemporal stores.

X86InstrSSE.td: drop the *_Int MOVNT instruction definitions and add v2i64
alignednontemporalstore patterns for VMOVNTDQmr and MOVNTDQmr.
AutoUpgrade.cpp: rewrite calls to llvm.x86.sse.movnt.ps and
llvm.x86.sse2.movnt.{dq,pd,i} into stores tagged with !nontemporal metadata.
Tests: extend Assembler/AutoUpgradeIntrinsics.ll and add
CodeGen/X86/nontemporal.ll.

NOTE(review): leading indentation of context and removed lines was
reconstructed; apply with `git apply --ignore-whitespace` if the context
fails to match.

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index bdd27df5a3a..4b6550bddd2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1875,21 +1875,6 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
 // SSE 1 & 2 - Non-temporal stores
 //===----------------------------------------------------------------------===//
 
-def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs),
-                          (ins i128mem:$dst, VR128:$src),
-                          "movntps\t{$src, $dst|$dst, $src}",
-                          [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX;
-def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs),
-                          (ins i128mem:$dst, VR128:$src),
-                          "movntpd\t{$src, $dst|$dst, $src}",
-                          [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX;
-
-let ExeDomain = SSEPackedInt in
-  def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs),
-                            (ins f128mem:$dst, VR128:$src),
-                            "movntdq\t{$src, $dst|$dst, $src}",
-                            [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
   def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
@@ -1906,12 +1891,16 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2f64 VR128:$src),
                                                   addr:$dst)]>, VEX;
+
   let ExeDomain = SSEPackedInt in
-  def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
-                        (ins f128mem:$dst, VR128:$src),
-                        "movntdq\t{$src, $dst|$dst, $src}",
-                        [(alignednontemporalstore (v4f32 VR128:$src),
-                                                  addr:$dst)]>, VEX;
+    def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+                          (ins f128mem:$dst, VR128:$src),
+                          "movntdq\t{$src, $dst|$dst, $src}",
+                          [(alignednontemporalstore (v4f32 VR128:$src),
+                                                    addr:$dst)]>, VEX;
+
+  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
 
   def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                          (ins f256mem:$dst, VR256:$src),
@@ -1943,18 +1932,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
 def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
           (VMOVNTPSYmr addr:$dst, VR256:$src)>;
 
-def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                    "movntps\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
-def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                    "movntpd\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
-
-let ExeDomain = SSEPackedInt in
-def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                    "movntdq\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                 "movntps\t{$src, $dst|$dst, $src}",
@@ -1972,22 +1949,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                 "movntdq\t{$src, $dst|$dst, $src}",
                 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
 
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (MOVNTDQmr addr:$dst, VR128:$src)>;
+
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                  "movnti\t{$src, $dst|$dst, $src}",
                  [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
                TB, Requires<[HasSSE2]>;
-
 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                      "movnti\t{$src, $dst|$dst, $src}",
                      [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                   TB, Requires<[HasSSE2]>;
-
 }
-def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
-                     "movnti\t{$src, $dst|$dst, $src}",
-                     [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
-                  TB, Requires<[HasSSE2]>;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Misc Instructions (No AVX form)
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 4541f381ed4..79ab90c910a 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -533,6 +533,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     // Calls to these instructions are transformed into unaligned loads.
     NewFn = 0;
     return true;
+  } else if (Name.compare(5, 16, "x86.sse.movnt.ps", 16) == 0 ||
+             Name.compare(5, 17, "x86.sse2.movnt.dq", 17) == 0 ||
+             Name.compare(5, 17, "x86.sse2.movnt.pd", 17) == 0 ||
+             Name.compare(5, 16, "x86.sse2.movnt.i", 16) == 0) {
+    // Calls to these instructions are transformed into nontemporal stores.
+    NewFn = 0;
+    return true;
   } else if (Name.compare(5, 17, "x86.ssse3.pshuf.w", 17) == 0) {
     // This is an SSE/MMX instruction.
     const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext());
@@ -973,6 +980,34 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     if (!CI->use_empty())
       CI->replaceAllUsesWith(BC);
 
+    // Remove intrinsic.
+    CI->eraseFromParent();
+  } else if (F->getName() == "llvm.x86.sse.movnt.ps" ||
+             F->getName() == "llvm.x86.sse2.movnt.dq" ||
+             F->getName() == "llvm.x86.sse2.movnt.pd" ||
+             F->getName() == "llvm.x86.sse2.movnt.i") {
+    IRBuilder<> Builder(C);
+    Builder.SetInsertPoint(CI->getParent(), CI);
+
+    Module *M = F->getParent();
+    // Build the !nontemporal operand: a one-element node holding i32 1.
+    SmallVector<Value *, 1> Elts;
+    Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+    MDNode *Node = MDNode::get(C, Elts);
+
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    // Convert the type of the pointer to a pointer to the stored type.
+    Value *BC = Builder.CreateBitCast(Arg0,
+                                      PointerType::getUnqual(Arg1->getType()),
+                                      "cast");
+    StoreInst *SI = Builder.CreateStore(Arg1, BC);
+    SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+    // MOVNTPS/MOVNTPD/MOVNTDQ fault on addresses that are not 16-byte aligned,
+    // so the vector forms may claim align 16.  MOVNTI has no such requirement.
+    SI->setAlignment(Arg1->getType()->isVectorTy() ? 16 : 1);
+
     // Remove intrinsic.
     CI->eraseFromParent();
   } else {
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll
index e4e2d3a56e0..417493f7168 100644
--- a/test/Assembler/AutoUpgradeIntrinsics.ll
+++ b/test/Assembler/AutoUpgradeIntrinsics.ll
@@ -10,6 +10,7 @@
 ; RUN:   not grep {llvm\\.x86\\.sse2\\.loadu}
 ; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
 declare i32 @llvm.ctpop.i28(i28 %val)
 declare i32 @llvm.cttz.i29(i29 %val)
@@ -91,3 +92,20 @@ define void @test_loadu(i8* %a, double* %b) {
   %v2 = call <2 x double> @llvm.x86.sse2.loadu.pd(double* %b)
   ret void
 }
+
+declare void @llvm.x86.sse.movnt.ps(i8*, <4 x float>) nounwind readnone
+declare void @llvm.x86.sse2.movnt.dq(i8*, <2 x double>) nounwind readnone
+declare void @llvm.x86.sse2.movnt.pd(i8*, <2 x double>) nounwind readnone
+declare void @llvm.x86.sse2.movnt.i(i8*, i32) nounwind readnone
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D) {
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse.movnt.ps(i8* %B, <4 x float> %A)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.dq(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.pd(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.i(i8* %B, i32 %D)
+  ret void
+}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
new file mode 100644
index 00000000000..1d095359b61
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) {
+; CHECK: movntps
+  %cast = bitcast i8* %B to <4 x float>*
+  store <4 x float> %A, <4 x float>* %cast, align 16, !nontemporal !0
+; CHECK: movntdq
+  %cast1 = bitcast i8* %B to <2 x i64>*
+  store <2 x i64> %E, <2 x i64>* %cast1, align 16, !nontemporal !0
+; CHECK: movntpd
+  %cast2 = bitcast i8* %B to <2 x double>*
+  store <2 x double> %C, <2 x double>* %cast2, align 16, !nontemporal !0
+; CHECK: movnti
+  %cast3 = bitcast i8* %B to i32*
+  store i32 %D, i32* %cast3, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = metadata !{i32 1}