Added additional atomic intrinsics: and, or, xor, min, and max.

llvm-svn: 50663
2024-11-24 03:33:20 +01:00 · 2008-05-05 19:05:59 +00:00 · 2008-05-05 19:05:59 +00:00 · 84a269e023
commit 84a269e023
parent 4a674dc536
15 changed files with 598 additions and 65 deletions
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@ -236,6 +236,11 @@ public:
  ///
  succ_iterator removeSuccessor(succ_iterator I);
  
+  /// transferSuccessors - Transfers all the successors from fromMBB to this
+  /// machine basic block (i.e., copies all the successors of fromMBB and
+  /// removes all the successors from fromMBB).
+  void transferSuccessors(MachineBasicBlock *fromMBB);
+  
  /// isSuccessor - Return true if the specified MBB is a successor of this
  /// block.
  bool isSuccessor(MachineBasicBlock *MBB) const;
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@ -594,6 +594,24 @@ namespace ISD {
    // the return is always the original value in *ptr
    ATOMIC_SWAP,

+    // Val, OUTCHAIN = ATOMIC_LSS(INCHAIN, ptr, amt)
+    // this corresponds to the atomic.lss intrinsic.
+    // *ptr - amt is stored to *ptr atomically.
+    // the return is always the original value in *ptr
+    ATOMIC_LSS,
+    
+    // Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt)
+    // this corresponds to the atomic.load.[OpName] intrinsic.
+    // op(*ptr, amt) is stored to *ptr atomically.
+    // the return is always the original value in *ptr
+    ATOMIC_LOAD_AND,
+    ATOMIC_LOAD_OR,
+    ATOMIC_LOAD_XOR,
+    ATOMIC_LOAD_MIN,
+    ATOMIC_LOAD_MAX,
+    ATOMIC_LOAD_UMIN,
+    ATOMIC_LOAD_UMAX,
+    
    // BUILTIN_OP_END - This must be the last enum value in this list.
    BUILTIN_OP_END
  };
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@ -282,6 +282,46 @@ def int_atomic_swap : Intrinsic<[llvm_anyint_ty,
                                  LLVMMatchType<0>],
                                [IntrWriteArgMem]>,
                       GCCBuiltin<"__sync_lock_test_and_set">;
+def int_atomic_lss   : Intrinsic<[llvm_anyint_ty,
+                                  LLVMPointerType<LLVMMatchType<0>>,
+                                  LLVMMatchType<0>],
+                                 [IntrWriteArgMem]>,
+                       GCCBuiltin<"__sync_fetch_and_sub">;
+def int_atomic_load_and : Intrinsic<[llvm_anyint_ty,
+                                  LLVMPointerType<LLVMMatchType<0>>,
+                                  LLVMMatchType<0>],
+                                 [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_and">;
+def int_atomic_load_or   : Intrinsic<[llvm_anyint_ty,
+                                  LLVMPointerType<LLVMMatchType<0>>,
+                                  LLVMMatchType<0>],
+                                 [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_or">;
+def int_atomic_load_xor  : Intrinsic<[llvm_anyint_ty,
+                                  LLVMPointerType<LLVMMatchType<0>>,
+                                  LLVMMatchType<0>],
+                                 [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_xor">;
+def int_atomic_load_min  : Intrinsic<[llvm_anyint_ty,
+                                   LLVMPointerType<LLVMMatchType<0>>,
+                                   LLVMMatchType<0>],
+                                  [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_min">;
+def int_atomic_load_max  : Intrinsic<[llvm_anyint_ty,
+                                   LLVMPointerType<LLVMMatchType<0>>,
+                                   LLVMMatchType<0>],
+                                  [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_max">;
+def int_atomic_load_umin : Intrinsic<[llvm_anyint_ty,
+                                   LLVMPointerType<LLVMMatchType<0>>,
+                                   LLVMMatchType<0>],
+                                  [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_umin">;
+def int_atomic_load_umax : Intrinsic<[llvm_anyint_ty,
+                                   LLVMPointerType<LLVMMatchType<0>>,
+                                   LLVMMatchType<0>],
+                                  [IntrWriteArgMem]>,
+                           GCCBuiltin<"__sync_fetch_and_umax">;
                                  
 //===-------------------------- Other Intrinsics --------------------------===//
 //
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -1340,12 +1340,15 @@ private:
  /// by the system, this holds the same type (e.g. i32 -> i32).
  MVT::ValueType TransformToType[MVT::LAST_VALUETYPE];

+  // Defines the capacity of the TargetLowering::OpActions table
+  static const int OpActionsCapacity = 173;
+
  /// OpActions - For each operation and each value type, keep a LegalizeAction
  /// that indicates how instruction selection should deal with the operation.
  /// Most operations are Legal (aka, supported natively by the target), but
  /// operations that are not should be described.  Note that operations on
  /// non-legal value types are not described here.
-  uint64_t OpActions[156];
+  uint64_t OpActions[OpActionsCapacity];
  
  /// LoadXActions - For each load of load extension type and each value type,
  /// keep a LegalizeAction that indicates how instruction selection should deal
@ -1378,7 +1381,7 @@ private:
  /// TargetDAGCombineArray - Targets can specify ISD nodes that they would
  /// like PerformDAGCombine callbacks for by calling setTargetDAGCombine(),
  /// which sets a bit in this array.
-  unsigned char TargetDAGCombineArray[160/(sizeof(unsigned char)*8)];
+  unsigned char TargetDAGCombineArray[168/(sizeof(unsigned char)*8)];
  
  /// PromoteToType - For operations that must be promoted to a specific type,
  /// this holds the destination type.  This map should be sparse, so don't hold
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@ -252,6 +252,19 @@ void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
  Predecessors.erase(I);
 }

+/// transferSuccessors - Add every successor of fromMBB as a successor of this
+/// block, then remove all successors from fromMBB.  Does nothing when fromMBB
+/// is this block.
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB)
+{
+  // Transferring a block's successors to itself is a no-op.
+  if (this == fromMBB)
+    return;
+  
+  // First pass: copy the successors; fromMBB's list is only read here.
+  for(MachineBasicBlock::succ_iterator iter = fromMBB->succ_begin(), 
+      end = fromMBB->succ_end(); iter != end; ++iter) {
+      addSuccessor(*iter);
+  }
+  // Second pass: remove from the front until fromMBB has no successors left.
+  while(!fromMBB->succ_empty())
+    fromMBB->removeSuccessor(fromMBB->succ_begin());
+}
+
 bool MachineBasicBlock::isSuccessor(MachineBasicBlock *MBB) const {
  std::vector<MachineBasicBlock *>::const_iterator I =
    std::find(Successors.begin(), Successors.end(), MBB);
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -1235,18 +1235,13 @@ SDOperand SelectionDAGLegalize::LegalizeOp(SDOperand Op) {
    break;
  }

-  case ISD::ATOMIC_LCS:
-  case ISD::ATOMIC_LAS:
-  case ISD::ATOMIC_SWAP: {
-    assert(((Node->getNumOperands() == 4 && Node->getOpcode() == ISD::ATOMIC_LCS) ||
-            (Node->getNumOperands() == 3 && Node->getOpcode() == ISD::ATOMIC_LAS) ||
-            (Node->getNumOperands() == 3 && Node->getOpcode() == ISD::ATOMIC_SWAP)) &&
-           "Invalid Atomic node!");
-    int num = Node->getOpcode() == ISD::ATOMIC_LCS ? 4 : 3;
+  case ISD::ATOMIC_LCS: {
+    unsigned int num_operands = 4;
+    assert(Node->getNumOperands() == num_operands && "Invalid Atomic node!");
    SDOperand Ops[4];
-    for (int x = 0; x < num; ++x)
+    for (unsigned int x = 0; x < num_operands; ++x)
      Ops[x] = LegalizeOp(Node->getOperand(x));
-    Result = DAG.UpdateNodeOperands(Result, &Ops[0], num);
+    Result = DAG.UpdateNodeOperands(Result, &Ops[0], num_operands);
    
    switch (TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0))) {
      default: assert(0 && "This action is not supported yet!");
@ -1260,7 +1255,38 @@ SDOperand SelectionDAGLegalize::LegalizeOp(SDOperand Op) {
    AddLegalizedOperand(SDOperand(Node, 1), Result.getValue(1));
    return Result.getValue(Op.ResNo);
  }      
+  case ISD::ATOMIC_LAS:
+  case ISD::ATOMIC_LSS:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_SWAP: {
+    unsigned int num_operands = 3;
+    assert(Node->getNumOperands() == num_operands && "Invalid Atomic node!");
+    SDOperand Ops[3];
+    for (unsigned int x = 0; x < num_operands; ++x)
+      Ops[x] = LegalizeOp(Node->getOperand(x));
+    Result = DAG.UpdateNodeOperands(Result, &Ops[0], num_operands);
    
+    switch (TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0))) {
+    default: assert(0 && "This action is not supported yet!");
+    case TargetLowering::Custom:
+      Result = TLI.LowerOperation(Result, DAG);
+      break;
+    case TargetLowering::Expand:
+      Result = SDOperand(TLI.ExpandOperationResult(Op.Val, DAG),0);
+      break;
+    case TargetLowering::Legal:
+      break;
+    }
+    AddLegalizedOperand(SDOperand(Node, 0), Result.getValue(0));
+    AddLegalizedOperand(SDOperand(Node, 1), Result.getValue(1));
+    return Result.getValue(Op.ResNo);
+  }      
  case ISD::Constant: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Node);
    unsigned opAction =
@ -4242,6 +4268,14 @@ SDOperand SelectionDAGLegalize::PromoteOp(SDOperand Op) {
    break;
  }
  case ISD::ATOMIC_LAS:
+  case ISD::ATOMIC_LSS:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_SWAP: {
    Tmp2 = PromoteOp(Node->getOperand(2));
    Result = DAG.getAtomic(Node->getOpcode(), Node->getOperand(0), 
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -2855,7 +2855,11 @@ SDOperand SelectionDAG::getAtomic(unsigned Opcode, SDOperand Chain,
 SDOperand SelectionDAG::getAtomic(unsigned Opcode, SDOperand Chain, 
                                  SDOperand Ptr, SDOperand Val, 
                                  MVT::ValueType VT) {
-  assert((Opcode == ISD::ATOMIC_LAS || Opcode == ISD::ATOMIC_SWAP)
+  assert((   Opcode == ISD::ATOMIC_LAS || Opcode == ISD::ATOMIC_LSS
+          || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_LOAD_AND
+          || Opcode == ISD::ATOMIC_LOAD_OR || Opcode == ISD::ATOMIC_LOAD_XOR
+          || Opcode == ISD::ATOMIC_LOAD_MIN || Opcode == ISD::ATOMIC_LOAD_MAX
+          || Opcode == ISD::ATOMIC_LOAD_UMIN || Opcode == ISD::ATOMIC_LOAD_UMAX) 
         && "Invalid Atomic Op");
  SDVTList VTs = getVTList(Val.getValueType(), MVT::Other);
  FoldingSetNodeID ID;
@ -4269,6 +4273,14 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
  case ISD::MEMBARRIER:    return "MemBarrier";
  case ISD::ATOMIC_LCS:    return "AtomicLCS";
  case ISD::ATOMIC_LAS:    return "AtomicLAS";
+  case ISD::ATOMIC_LSS:    return "AtomicLSS";
+  case ISD::ATOMIC_LOAD_AND:  return "AtomicLoadAnd";
+  case ISD::ATOMIC_LOAD_OR:   return "AtomicLoadOr";
+  case ISD::ATOMIC_LOAD_XOR:  return "AtomicLoadXor";
+  case ISD::ATOMIC_LOAD_MIN:  return "AtomicLoadMin";
+  case ISD::ATOMIC_LOAD_MAX:  return "AtomicLoadMax";
+  case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
+  case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
  case ISD::ATOMIC_SWAP:   return "AtomicSWAP";
  case ISD::PCMARKER:      return "PCMarker";
  case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@ -732,6 +732,10 @@ public:
    assert(0 && "UserOp2 should not exist at instruction selection time!");
    abort();
  }
+  
+private:
+  inline const char *implVisitBinaryAtomic(CallInst& I, ISD::NodeType Op);
+
 };
 } // end namespace llvm

@ -2769,6 +2773,22 @@ static void addCatchInfo(CallInst &I, MachineModuleInfo *MMI,
  }
 }

+
+/// implVisitBinaryAtomic - Utility function used by visitIntrinsicCall to
+/// lower the atomic intrinsics that take a pointer and one value.
+///   I  is the call instruction.
+///   Op is the associated NodeType for I.
+/// Builds the atomic node from the current root, call operand 1 (passed as the
+/// pointer) and call operand 2 (passed as the value), records the node as the
+/// value of I, and makes the node's second result the new DAG root.
+/// Always returns 0.
+const char *
+SelectionDAGLowering::implVisitBinaryAtomic(CallInst& I, ISD::NodeType Op) {
+  SDOperand Root = getRoot();   
+  SDOperand O2 = getValue(I.getOperand(2));
+  // The node's value type is taken from the value operand.
+  SDOperand L = DAG.getAtomic(Op, Root, 
+                              getValue(I.getOperand(1)), 
+                              O2, O2.getValueType());
+  setValue(&I, L);
+  // Result #1 of the atomic node becomes the new root.
+  DAG.setRoot(L.getValue(1));
+  return 0;
+}
+
 /// visitIntrinsicCall - Lower the call to the specified intrinsic function.  If
 /// we want to emit this as a call to a named external function, return the name
 /// otherwise lower it and return null.
@ -3205,27 +3225,26 @@ SelectionDAGLowering::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) {
    DAG.setRoot(L.getValue(1));
    return 0;
  }
-  case Intrinsic::atomic_las: {
-    SDOperand Root = getRoot();   
-    SDOperand O2 = getValue(I.getOperand(2));
-    SDOperand L = DAG.getAtomic(ISD::ATOMIC_LAS, Root, 
-                                getValue(I.getOperand(1)), 
-                                O2, O2.getValueType());
-    setValue(&I, L);
-    DAG.setRoot(L.getValue(1));
-    return 0;
-  }
-  case Intrinsic::atomic_swap: {
-    SDOperand Root = getRoot();   
-    SDOperand O2 = getValue(I.getOperand(2));
-    SDOperand L = DAG.getAtomic(ISD::ATOMIC_SWAP, Root, 
-                                getValue(I.getOperand(1)), 
-                                O2, O2.getValueType());
-    setValue(&I, L);
-    DAG.setRoot(L.getValue(1));
-    return 0;
-  }
-
+  case Intrinsic::atomic_las:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LAS);
+  case Intrinsic::atomic_lss:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LSS);
+  case Intrinsic::atomic_load_and:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_AND);
+  case Intrinsic::atomic_load_or:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_OR);
+  case Intrinsic::atomic_load_xor:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_XOR);
+  case Intrinsic::atomic_load_min:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MIN);
+  case Intrinsic::atomic_load_max:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MAX);
+  case Intrinsic::atomic_load_umin:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMIN);
+  case Intrinsic::atomic_load_umax:
+      return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMAX);                                              
+  case Intrinsic::atomic_swap:
+    return implVisitBinaryAtomic(I, ISD::ATOMIC_SWAP);
  }
 }

@ -4519,8 +4538,6 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
 }

-
-
 bool SelectionDAGISel::runOnFunction(Function &Fn) {
  // Get alias analysis for load/store combining.
  AA = &getAnalysis<AliasAnalysis>();
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -165,7 +165,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {

 TargetLowering::TargetLowering(TargetMachine &tm)
  : TM(tm), TD(TM.getTargetData()) {
-  assert(ISD::BUILTIN_OP_END <= 156 &&
+  assert(ISD::BUILTIN_OP_END <= OpActionsCapacity &&
         "Fixed size array in TargetLowering is not large enough!");
  // All operations default to being supported.
  memset(OpActions, 0, sizeof(OpActions));
--- a/lib/Target/TargetSelectionDAG.td
+++ b/lib/Target/TargetSelectionDAG.td
@ -358,6 +358,22 @@ def atomic_las  : SDNode<"ISD::ATOMIC_LAS" , STDAtomic2,
                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def atomic_swap : SDNode<"ISD::ATOMIC_SWAP", STDAtomic2,
                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_lss  : SDNode<"ISD::ATOMIC_LSS" , STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_and : SDNode<"ISD::ATOMIC_LOAD_AND" , STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_or  : SDNode<"ISD::ATOMIC_LOAD_OR" , STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_xor : SDNode<"ISD::ATOMIC_LOAD_XOR" , STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_min : SDNode<"ISD::ATOMIC_LOAD_MIN", STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_max : SDNode<"ISD::ATOMIC_LOAD_MAX", STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_umin : SDNode<"ISD::ATOMIC_LOAD_UMIN", STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def atomic_load_umax : SDNode<"ISD::ATOMIC_LOAD_UMAX", STDAtomic2,
+                         [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;

 // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
 // and truncst (see below).
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -292,10 +292,12 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);

+  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_LCS     , MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LCS     , MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LCS     , MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LCS     , MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_LSS     , MVT::i32, Expand);

  // Use the default ISD::LOCATION, ISD::DECLARE expansion.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
@ -5511,6 +5513,15 @@ SDNode* X86TargetLowering::ExpandATOMIC_LCS(SDNode* Op, SelectionDAG &DAG) {
  return DAG.getNode(ISD::MERGE_VALUES, Tys, ResultVal, cpOutH.getValue(1)).Val;
 }

+/// ExpandATOMIC_LSS - Expand an i32 ATOMIC_LSS node into an ATOMIC_LAS node
+/// that uses the same chain (operand 0) and pointer (operand 1) but the
+/// negated amount (0 - operand 2).  Only i32 is handled; other types assert.
+SDNode* X86TargetLowering::ExpandATOMIC_LSS(SDNode* Op, SelectionDAG &DAG) {
+  MVT::ValueType T = cast<AtomicSDNode>(Op)->getVT();
+  assert (T == MVT::i32 && "Only know how to expand i32 LSS");
+  // Negate the amount so the subtraction can be expressed as an addition.
+  SDOperand negOp = DAG.getNode(ISD::SUB, T,
+                                DAG.getConstant(0, T), Op->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LAS, Op->getOperand(0),
+                       Op->getOperand(1), negOp, T).Val;
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
@ -5568,6 +5579,7 @@ SDNode *X86TargetLowering::ExpandOperationResult(SDNode *N, SelectionDAG &DAG) {
  case ISD::FP_TO_SINT:         return ExpandFP_TO_SINT(N, DAG);
  case ISD::READCYCLECOUNTER:   return ExpandREADCYCLECOUNTER(N, DAG);
  case ISD::ATOMIC_LCS:         return ExpandATOMIC_LCS(N, DAG);
+  case ISD::ATOMIC_LSS:         return ExpandATOMIC_LSS(N,DAG);
  }
 }

@ -5732,6 +5744,187 @@ X86TargetLowering::isVectorClearMaskLegal(const std::vector<SDOperand> &BVOps,
 //                           X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//

+/// EmitAtomicBitwiseWithCustomInserter - Private utility function that expands
+/// a 32-bit atomic bitwise pseudo instruction into a load / op /
+/// compare-and-exchange retry loop.
+///   bInstr  is the pseudo instruction to expand; it is deleted before return.
+///   MBB     is the machine basic block being expanded.
+///   regOpc  is the opcode used when the value operand is a register.
+///   immOpc  is the opcode used when the value operand is an immediate.
+/// Returns the newly created block that follows the loop.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
+                                                       MachineBasicBlock *MBB,
+                                                       unsigned regOpc,
+                                                       unsigned immOpc) {
+  // For the atomic bitwise operator, we generate
+  //   thisMBB:
+  //   newMBB:
+  //     ld  EAX = [bitinstr.addr]
+  //     mov t1 = EAX
+  //     op  t2 = t1, [bitinstr.val]
+  //     lcs [bitinstr.addr], t2  [EAX is implicit]
+  //     mov dest = EAX
+  //     jne newMBB
+  //     fallthrough -->nextMBB
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator MBBIter = MBB;
+  ++MBBIter;
+  
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = new MachineBasicBlock(LLVM_BB);
+  // Both new blocks are inserted right after thisMBB, newMBB first.
+  F->getBasicBlockList().insert(MBBIter, newMBB);
+  F->getBasicBlockList().insert(MBBIter, nextMBB);
+  
+  // Move all successors of thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+    
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+  
+  // newMBB jumps to itself and falls through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+  
+  // Insert instructions into newMBB based on incoming instruction
+  assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
+  MachineOperand& destOper = bInstr->getOperand(0);
+  MachineOperand* argOpers[6];
+  // Operand 0 is the destination; the remaining operands are the arguments.
+  int numArgs = bInstr->getNumOperands() - 1;
+  for (int i=0; i < numArgs; ++i)
+    argOpers[i] = &bInstr->getOperand(i+1);
+
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+  int valArgIndx = 4;
+  
+  // Load the current memory value into EAX.
+  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), X86::EAX);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  
+  // Copy EAX into t1 so the bitwise op does not work on EAX itself.
+  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t1);
+  MIB.addReg(X86::EAX);
+  
+  // t2 = t1 op val, using the register or immediate form of the opcode.
+  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
+         && "invalid operand");
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, TII->get(regOpc), t2);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpc), t2);
+  MIB.addReg(t1);
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+  
+  // Compare-and-exchange t2 into the memory location.
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MIB.addReg(t2);
+  
+  // Copy EAX into the destination register of the pseudo instruction.
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB.addReg(X86::EAX);
+  
+  // insert branch back to the top of the loop
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  delete bInstr;   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
+/// EmitAtomicMinMaxWithCustomInserter - Private utility function that expands
+/// a 32-bit atomic min/max pseudo instruction into a load / compare / cmov /
+/// compare-and-exchange retry loop.
+///   mInstr   is the pseudo instruction to expand; it is deleted before return.
+///   MBB      is the machine basic block being expanded.
+///   cmovOpc  is the conditional-move opcode that selects the min or max value.
+/// Returns the newly created block that follows the loop.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+                                                      MachineBasicBlock *MBB,
+                                                      unsigned cmovOpc) {
+  // For the atomic min/max operator, we generate
+  //   thisMBB:
+  //   newMBB:
+  //     ld EAX = [min/max.addr]
+  //     mov t1 = EAX
+  //     mov t2 = [min/max.val]
+  //     cmp  t1, t2
+  //     cmov[cond] t3 = t2, t1
+  //     lcs [min/max.addr], t3  [EAX is implicit]
+  //     mov dest = EAX
+  //     jne  newMBB
+  //     fallthrough -->nextMBB
+  //
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator MBBIter = MBB;
+  ++MBBIter;
+  
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = new MachineBasicBlock(LLVM_BB);
+  // Both new blocks are inserted right after thisMBB, newMBB first.
+  F->getBasicBlockList().insert(MBBIter, newMBB);
+  F->getBasicBlockList().insert(MBBIter, nextMBB);
+  
+  // Move all successors of thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+  
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+  
+  // newMBB jumps to newMBB and falls through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+  
+  // Insert instructions into newMBB based on incoming instruction
+  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
+  MachineOperand& destOper = mInstr->getOperand(0);
+  MachineOperand* argOpers[6];
+  // Operand 0 is the destination; the remaining operands are the arguments.
+  int numArgs = mInstr->getNumOperands() - 1;
+  for (int i=0; i < numArgs; ++i)
+    argOpers[i] = &mInstr->getOperand(i+1);
+  
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+  int valArgIndx = 4;
+  
+  // Load the current memory value into EAX.
+  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), X86::EAX);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  
+  // Copy EAX into t1 so the comparison does not work on EAX itself.
+  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t1);
+  MIB.addReg(X86::EAX);
+  
+  // We only support register and immediate values
+  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
+         && "invalid operand");
+  
+  // Materialize the value operand in t2.  A register value is copied with
+  // MOV32rr; an immediate value needs MOV32ri, because MOV32rr expects a
+  // register source operand.
+  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);  
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
+  else 
+    MIB = BuildMI(newMBB, TII->get(X86::MOV32ri), t2);
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+
+  // Compare the loaded value with the value operand.
+  MIB = BuildMI(newMBB, TII->get(X86::CMP32rr));
+  MIB.addReg(t1);
+  MIB.addReg(t2);
+
+  // Generate cmov: t3 is t2, conditionally replaced by t1.
+  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MIB = BuildMI(newMBB, TII->get(cmovOpc),t3);
+  MIB.addReg(t2);
+  MIB.addReg(t1);
+
+  // Cmp and exchange if none has modified the memory location
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MIB.addReg(t3);
+  
+  // Copy EAX into the destination register of the pseudo instruction.
+  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB.addReg(X86::EAX);
+  
+  // insert branch back to the top of the loop
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  delete mInstr;   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
+
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) {
@ -5766,15 +5959,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    MachineFunction *F = BB->getParent();
    F->getBasicBlockList().insert(It, copy0MBB);
    F->getBasicBlockList().insert(It, sinkMBB);
-    // Update machine-CFG edges by first adding all successors of the current
+    // Update machine-CFG edges by transferring all successors of the current
    // block to the new block which will contain the Phi node for the select.
-    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
-        e = BB->succ_end(); i != e; ++i)
-      sinkMBB->addSuccessor(*i);
-    // Next, remove all successors of the current block, and add the true
-    // and fallthrough blocks as its successors.
-    while(!BB->succ_empty())
-      BB->removeSuccessor(BB->succ_begin());
+    sinkMBB->transferSuccessors(BB);
+
+    // Add the true and fallthrough blocks as its successors.
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

@ -5874,6 +6063,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    delete MI;   // The pseudo instruction is gone now.
    return BB;
  }
+  case X86::ATOMAND32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+                                                       X86::AND32ri);
+  case X86::ATOMOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 
+                                                       X86::OR32ri);
+  case X86::ATOMXOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+                                                       X86::XOR32ri);
+  case X86::ATOMMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+  case X86::ATOMMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+  case X86::ATOMUMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+  case X86::ATOMUMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
  }
 }

--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -346,6 +346,7 @@ namespace llvm {
    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
                                                        MachineBasicBlock *MBB);

+ 
    /// getTargetNodeName - This method returns the name of a target specific
    /// DAG node.
    virtual const char *getTargetNodeName(unsigned Opcode) const;
@ -524,6 +525,7 @@ namespace llvm {
    SDNode *ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG);
    SDNode *ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG);
    SDNode *ExpandATOMIC_LCS(SDNode *N, SelectionDAG &DAG);
+    SDNode *ExpandATOMIC_LSS(SDNode *N, SelectionDAG &DAG);
    
    SDOperand EmitTargetCodeForMemset(SelectionDAG &DAG,
                                      SDOperand Chain,
@ -537,6 +539,23 @@ namespace llvm {
                                      bool AlwaysInline,
                                      const Value *DstSV, uint64_t DstSVOff,
                                      const Value *SrcSV, uint64_t SrcSVOff);
+    
+    /// Utility function to emit atomic bitwise operations (and, or, xor).
+    /// It takes the bitwise instruction to expand, the associated machine
+    /// basic block, and the associated X86 opcodes for reg/reg and reg/imm.
+    MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+                                                    MachineInstr *BInstr,
+                                                    MachineBasicBlock *BB,
+                                                    unsigned regOpc,
+                                                    unsigned immOpc);
+    
+    /// Utility function to emit atomic min and max.  It takes the min/max
+    /// instruction to expand, the associated basic block, and the associated
+    /// cmov opcode for moving the min or max value.
+    MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+                                                          MachineBasicBlock *BB,
+                                                          unsigned cmovOpc);
+    
  };
 }

--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@ -2598,6 +2598,63 @@ def LXADD8  : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
                TB, LOCK;
 }

+// Atomic load-and-op pseudo instructions: and, or, xor, min, max, umin, umax
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMAND32 : I<0xC1, MRMSrcMem,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMAND32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_and addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMOR32 : I<0xC1, MRMSrcMem, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMOR32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_or addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMXOR32 : I<0xC1, MRMSrcMem,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMXOR32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_xor addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMMIN32: I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+               "#ATOMMIN32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_min addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMMAX32: I<0xC1, MRMSrcMem, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMMAX32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_max addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMUMIN32: I<0xC1, MRMSrcMem,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMIN32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_umin addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomDAGSchedInserter = 1 in {
+def ATOMUMAX32: I<0xC1, MRMSrcMem,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMAX32 PSUEDO!", 
+               [(set GR32:$dst, (atomic_load_umax addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+}
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //===----------------------------------------------------------------------===//
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@ -0,0 +1,93 @@
+; RUN: llvm-as < %s | llc -march=x86 -o %t1 -f
+; RUN: grep "lock xaddl" %t1 | count 4 
+; RUN: grep "lock cmpxchgl"  %t1 | count 13 
+; RUN: grep "xchgl" %t1 | count 14
+; RUN: grep "cmova" %t1 | count 2
+; RUN: grep "cmovb" %t1 | count 2
+; RUN: grep "cmovg" %t1 | count 2
+; RUN: grep "cmovl" %t1 | count 2
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" ; e = little-endian, p:32:32:32 = 32-bit pointers with 32-bit alignment
+
+define void @main(i32 %argc, i8** %argv) {	; calls every atomic intrinsic declared in this file; each result is only stored to %old
+entry:
+	%argc.addr = alloca i32		; <i32*> [#uses=1]
+	%argv.addr = alloca i8**		; <i8***> [#uses=1]
+	%val1 = alloca i32		; <i32*> [#uses=2]
+	%val2 = alloca i32		; <i32*> [#uses=15]
+	%andt = alloca i32		; <i32*> [#uses=2]
+	%ort = alloca i32		; <i32*> [#uses=2]
+	%xort = alloca i32		; <i32*> [#uses=2]
+	%old = alloca i32		; <i32*> [#uses=18]
+	%temp = alloca i32		; <i32*> [#uses=2]
+	store i32 %argc, i32* %argc.addr		; arguments are spilled here and their slots are not read again
+	store i8** %argv, i8*** %argv.addr
+	store i32 0, i32* %val1		; initial values of the memory cells the atomics operate on
+	store i32 31, i32* %val2
+	store i32 3855, i32* %andt
+	store i32 3855, i32* %ort
+	store i32 3855, i32* %xort
+	store i32 4, i32* %temp
+	%tmp = load i32* %temp		; <i32> [#uses=1]
+	call i32 @llvm.atomic.las.i32( i32* %val1, i32 %tmp )		; <i32>:0 [#uses=1]
+	store i32 %0, i32* %old
+	call i32 @llvm.atomic.lss.i32( i32* %val2, i32 30 )		; <i32>:1 [#uses=1]
+	store i32 %1, i32* %old
+	call i32 @llvm.atomic.las.i32( i32* %val2, i32 1 )		; <i32>:2 [#uses=1]
+	store i32 %2, i32* %old
+	call i32 @llvm.atomic.lss.i32( i32* %val2, i32 1 )		; <i32>:3 [#uses=1]
+	store i32 %3, i32* %old
+	call i32 @llvm.atomic.load.and.i32( i32* %andt, i32 4080 )		; <i32>:4 [#uses=1]
+	store i32 %4, i32* %old
+	call i32 @llvm.atomic.load.or.i32( i32* %ort, i32 4080 )		; <i32>:5 [#uses=1]
+	store i32 %5, i32* %old
+	call i32 @llvm.atomic.load.xor.i32( i32* %xort, i32 4080 )		; <i32>:6 [#uses=1]
+	store i32 %6, i32* %old
+	call i32 @llvm.atomic.load.min.i32( i32* %val2, i32 16 )		; <i32>:7 [#uses=1]
+	store i32 %7, i32* %old
+	%neg = sub i32 0, 1		; <i32> [#uses=1]; 0 - 1 = -1, operand of the next load.min call
+	call i32 @llvm.atomic.load.min.i32( i32* %val2, i32 %neg )		; <i32>:8 [#uses=1]
+	store i32 %8, i32* %old
+	call i32 @llvm.atomic.load.max.i32( i32* %val2, i32 1 )		; <i32>:9 [#uses=1]
+	store i32 %9, i32* %old
+	call i32 @llvm.atomic.load.max.i32( i32* %val2, i32 0 )		; <i32>:10 [#uses=1]
+	store i32 %10, i32* %old
+	call i32 @llvm.atomic.load.umax.i32( i32* %val2, i32 65535 )		; <i32>:11 [#uses=1]
+	store i32 %11, i32* %old
+	call i32 @llvm.atomic.load.umax.i32( i32* %val2, i32 10 )		; <i32>:12 [#uses=1]
+	store i32 %12, i32* %old
+	call i32 @llvm.atomic.load.umin.i32( i32* %val2, i32 1 )		; <i32>:13 [#uses=1]
+	store i32 %13, i32* %old
+	call i32 @llvm.atomic.load.umin.i32( i32* %val2, i32 10 )		; <i32>:14 [#uses=1]
+	store i32 %14, i32* %old
+	call i32 @llvm.atomic.swap.i32( i32* %val2, i32 1976 )		; <i32>:15 [#uses=1]
+	store i32 %15, i32* %old
+	%neg1 = sub i32 0, 10		; <i32> [#uses=1]; 0 - 10 = -10, second argument of the next atomic.lcs call
+	call i32 @llvm.atomic.lcs.i32( i32* %val2, i32 %neg1, i32 1 )		; <i32>:16 [#uses=1]
+	store i32 %16, i32* %old
+	call i32 @llvm.atomic.lcs.i32( i32* %val2, i32 1976, i32 1 )		; <i32>:17 [#uses=1]
+	store i32 %17, i32* %old
+	ret void
+}
+
+declare i32 @llvm.atomic.las.i32(i32*, i32) nounwind ; atomic intrinsics called by @main; each takes an i32* plus i32 operand(s) and returns i32
+
+declare i32 @llvm.atomic.lss.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.and.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.or.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.xor.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.min.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.max.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umax.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umin.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.swap.i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.lcs.i32(i32*, i32, i32) nounwind ; the only declaration with three parameters