From b8bd41122a5f8f474af6e6acdffb0beaa5bc68db Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 4 Dec 2015 23:00:33 +0000 Subject: [PATCH] X86: Don't emit SAHF/LAHF for 64-bit targets unless explicitly supported These instructions are not supported by all CPUs in 64-bit mode. Emitting them causes Chromium to crash on start-up for users with such chips. (GCC puts these instructions behind -msahf on 64-bit for the same reason.) This patch adds FeatureLAHFSAHF, enables it by default for 32-bit targets and modern CPUs, and changes X86InstrInfo::copyPhysReg back to the lowering from before r244503 when the instructions are not available. Differential Revision: http://reviews.llvm.org/D15240 llvm-svn: 254793 --- lib/Target/X86/X86.td | 58 +++++++++----- lib/Target/X86/X86ISelLowering.cpp | 3 + lib/Target/X86/X86InstrInfo.cpp | 29 ++++++- lib/Target/X86/X86InstrInfo.td | 7 +- lib/Target/X86/X86Subtarget.cpp | 10 +++ lib/Target/X86/X86Subtarget.h | 4 + test/CodeGen/X86/cmpxchg-clobber-flags.ll | 76 +++++++++++++------ .../X86/peephole-na-phys-copy-folding.ll | 2 +- 8 files changed, 138 insertions(+), 51 deletions(-) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 7d9f396c1e9..dc5ab1bf65d 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -182,6 +182,8 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", @@ -273,7 +275,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureSSSE3, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSlowUAMem16, @@ -281,7 +284,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSSE41, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureSlowBTMem, + FeatureLAHFSAHF ]>; // Atom CPUs. @@ -299,7 +303,8 @@ class BonnellProc : ProcessorModel; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. @@ -319,7 +324,8 @@ class SilvermontProc : ProcessorModel; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. @@ -331,7 +337,8 @@ class NehalemProc : ProcessorModel; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -346,7 +353,8 @@ class WestmereProc : ProcessorModel; def : WestmereProc<"westmere">; @@ -363,7 +371,8 @@ class SandyBridgeProc : ProcessorModel; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. @@ -382,7 +391,8 @@ class IvyBridgeProc : ProcessorModel; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. @@ -408,7 +418,8 @@ class HaswellProc : ProcessorModel; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. @@ -436,7 +447,8 @@ class BroadwellProc : ProcessorModel; def : BroadwellProc<"broadwell">; @@ -465,7 +477,8 @@ class KnightsLandingProc : ProcessorModel; def : KnightsLandingProc<"knl">; @@ -500,7 +513,8 @@ class SkylakeProc : ProcessorModel; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. @@ -547,7 +561,7 @@ def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowBTMem, FeatureSlowSHLD]>; + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; // Bobcat def : Proc<"btver1", [ @@ -560,7 +574,8 @@ def : Proc<"btver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Jaguar @@ -580,7 +595,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Bulldozer @@ -598,7 +614,8 @@ def : Proc<"bdver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureXSAVE, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Piledriver def : Proc<"bdver2", [ @@ -619,7 +636,8 @@ def : Proc<"bdver2", [ FeatureBMI, FeatureTBM, FeatureFMA, - FeatureSlowSHLD + FeatureSlowSHLD, + FeatureLAHFSAHF ]>; // Steamroller @@ -643,7 +661,8 @@ def : Proc<"bdver3", [ FeatureFMA, FeatureXSAVEOPT, FeatureSlowSHLD, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; // Excavator @@ -666,7 +685,8 @@ def : Proc<"bdver4", [ FeatureTBM, FeatureFMA, FeatureXSAVEOPT, - FeatureFSGSBase + FeatureFSGSBase, + FeatureLAHFSAHF ]>; def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2cf1d4ba30e..c07bca8fe52 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13930,6 +13930,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index e9d36f8ce2f..ebe329064c5 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -4385,7 +4385,32 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, int Reg = FromEFLAGS ? DestReg : SrcReg; bool is32 = X86::GR32RegClass.contains(Reg); bool is64 = X86::GR64RegClass.contains(Reg); + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(is64 && "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } + return; + } + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is // inefficient. Instead: // - Save the overflow flag OF into AL using SETO, and restore it using a @@ -4407,10 +4432,6 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Notice that we have to adjust the stack if we don't want to clobber the // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; - int Push = is64 ? X86::PUSH64r : X86::PUSH32r; - int Pop = is64 ? X86::POP64r : X86::POP32r; - int AX = is64 ? X86::RAX : X86::EAX; bool AXDead = (Reg == AX); // FIXME: The above could figure out that AX is dead in more cases with: diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 1e66739026e..1c21a098bc6 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -799,6 +799,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -1502,10 +1503,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 44a46b7e07a..f90a0b0d04f 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -189,6 +189,15 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); @@ -264,6 +273,7 @@ void X86Subtarget::initializeEnvironment() { HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 353b4f7f5eb..83bc640976a 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -152,6 +152,9 @@ protected: /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -374,6 +377,7 @@ public: bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll index 791edba89c4..c294dee4013 100644 --- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,7 +1,11 @@ ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -check-prefix=i386 ; RUN: llc -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=i386f + ; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s -check-prefix=x8664 ; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664 +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s -check-prefix=x8664-sahf +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7 %s -o - | FileCheck %s -check-prefix=x8664-sahf ; FIXME: X86InstrInfo::copyPhysReg had code which figured out whether AX was ; live or not to avoid save / restore when it's not needed. See FIXME in @@ -56,22 +60,32 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { ; x8664-LABEL: test_intervening_call: ; x8664: cmpxchgq -; x8664: pushq %rax -; x8664-NEXT: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] -; x8664-NEXT: popq %rax +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] ; x8664-NEXT: movq %rax, %rdi ; x8664-NEXT: callq bar -; ** FIXME Next line isn't actually necessary. ** -; x8664-NEXT: pushq %rax -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf -; ** FIXME Next line isn't actually necessary. ** -; x8664-NEXT: popq %rax +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq ; x8664-NEXT: jne +; x8664-sahf-LABEL: test_intervening_call: +; x8664-sahf: cmpxchgq +; x8664-sahf: pushq %rax +; x8664-sahf-NEXT: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: movq %rax, %rdi +; x8664-sahf-NEXT: callq bar +; ** FIXME Next line isn't actually necessary. ** +; x8664-sahf-NEXT: pushq %rax +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf +; ** FIXME Next line isn't actually necessary. ** +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: jne + %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst %v = extractvalue { i64, i1 } %cx, 0 %p = extractvalue { i64, i1 } %cx, 1 @@ -99,6 +113,10 @@ define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) { ; x8664: cmpxchg ; x8664-NEXT: jne +; x8664-sahf-LABEL: test_control_flow: +; x8664-sahf: cmpxchg +; x8664-sahf-NEXT: jne + entry: %cmp = icmp sgt i32 %i, %j br i1 %cmp, label %loop_start, label %cond.end @@ -165,20 +183,28 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { ; i386f-NEXT: popl %eax ; x8664-LABEL: test_feed_cmov: -; x8664: cmpxchgl -; ** FIXME Next line isn't actually necessary. ** -; x8664: pushq %rax -; x8664: seto %al -; x8664-NEXT: lahf -; x8664-NEXT: movq %rax, [[FLAGS:%.*]] -; ** FIXME Next line isn't actually necessary. ** -; x8664-NEXT: popq %rax +; x8664: cmpxchg +; x8664: pushfq +; x8664-NEXT: popq [[FLAGS:%.*]] ; x8664-NEXT: callq foo -; x8664-NEXT: pushq %rax -; x8664-NEXT: movq [[FLAGS]], %rax -; x8664-NEXT: addb $127, %al -; x8664-NEXT: sahf -; x8664-NEXT: popq %rax +; x8664-NEXT: pushq [[FLAGS]] +; x8664-NEXT: popfq + +; x8664-sahf-LABEL: test_feed_cmov: +; x8664-sahf: cmpxchgl +; ** FIXME Next line isn't actually necessary. ** +; x8664-sahf: pushq %rax +; x8664-sahf: seto %al +; x8664-sahf-NEXT: lahf +; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]] +; ** FIXME Next line isn't actually necessary. ** +; x8664-sahf-NEXT: popq %rax +; x8664-sahf-NEXT: callq foo +; x8664-sahf-NEXT: pushq %rax +; x8664-sahf-NEXT: movq [[FLAGS]], %rax +; x8664-sahf-NEXT: addb $127, %al +; x8664-sahf-NEXT: sahf +; x8664-sahf-NEXT: popq %rax %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1 diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index 891a925611c..a8df33454e9 100644 --- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s -; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s ; FIXME Add -verify-machineinstrs back when PR24535 is fixed.