Allow FP types for atomicrmw xchg

llvm-svn: 351427
2025-01-31 12:41:49 +01:00 · 2019-01-17 10:49:01 +00:00 · 2019-01-17 10:49:01 +00:00 · afd1f8cb4f
commit afd1f8cb4f
parent 04b4351561
24 changed files with 375 additions and 16 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -8584,13 +8584,14 @@ operation. The operation must be one of the following keywords:
 -  umax
 -  umin

-The type of '<value>' must be an integer type whose bit width is a power
-of two greater than or equal to eight and less than or equal to a
-target-specific size limit. The type of the '``<pointer>``' operand must
-be a pointer to that type. If the ``atomicrmw`` is marked as
-``volatile``, then the optimizer is not allowed to modify the number or
-order of execution of this ``atomicrmw`` with other :ref:`volatile
-operations <volatile>`.
+For most of these operations, the type of '<value>' must be an integer
+type whose bit width is a power of two greater than or equal to eight
+and less than or equal to a target-specific size limit. For xchg, this
+may also be a floating point type with the same size constraints as
+integers. The type of the '``<pointer>``' operand must be a pointer to
+that type. If the ``atomicrmw`` is marked as ``volatile``, then the
+optimizer is not allowed to modify the number or order of execution of
+this ``atomicrmw`` with other :ref:`volatile operations <volatile>`.

 A ``atomicrmw`` instruction can also take an optional
 ":ref:`syncscope <syncscope>`" argument.
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@ -6850,12 +6850,20 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
  if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
    return Error(ValLoc, "atomicrmw value and pointer type do not match");

-  if (!Val->getType()->isIntegerTy()) {
+  if (Operation != AtomicRMWInst::Xchg && !Val->getType()->isIntegerTy()) {
    return Error(ValLoc, "atomicrmw " +
                 AtomicRMWInst::getOperationName(Operation) +
                 " operand must be an integer");
  }

+  if (Operation == AtomicRMWInst::Xchg &&
+      !Val->getType()->isIntegerTy() &&
+      !Val->getType()->isFloatingPointTy()) {
+    return Error(ValLoc, "atomicrmw " +
+                 AtomicRMWInst::getOperationName(Operation) +
+                 " operand must be an integer or floating point type");
+  }
+
  unsigned Size = Val->getType()->getPrimitiveSizeInBits();
  if (Size < 8 || (Size & (Size - 1)))
    return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@ -496,11 +496,26 @@ static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
                                 Value *Loaded, Value *NewVal,
                                 AtomicOrdering MemOpOrder,
                                 Value *&Success, Value *&NewLoaded) {
+  Type *OrigTy = NewVal->getType();
+
+  // This code can go away when cmpxchg supports FP types.
+  bool NeedBitcast = OrigTy->isFloatingPointTy();
+  if (NeedBitcast) {
+    IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
+    unsigned AS = Addr->getType()->getPointerAddressSpace();
+    Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
+    NewVal = Builder.CreateBitCast(NewVal, IntTy);
+    Loaded = Builder.CreateBitCast(Loaded, IntTy);
+  }
+
  Value* Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+  if (NeedBitcast)
+    NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
 }

 /// Emit IR to implement the given atomicrmw operation on values in registers,
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -4532,6 +4532,24 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
    Results.push_back(CvtVec);
    break;
  }
+  case ISD::ATOMIC_SWAP: {
+    AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+    SDLoc SL(Node);
+    SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
+    assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+           "unexpected promotion type");
+    assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+           "unexpected atomic_swap with illegal type");
+
+    SDValue NewAtomic
+      = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
+                      DAG.getVTList(NVT, MVT::Other),
+                      { AM->getChain(), AM->getBasePtr(), CastVal },
+                      AM->getMemOperand());
+    Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+    Results.push_back(NewAtomic.getValue(1));
+    break;
+  }
  }

  // Replace the original node with the legalized result.
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@ -104,6 +104,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
    case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
    case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
    case ISD::LOAD:        R = SoftenFloatRes_LOAD(N, ResNo); break;
+    case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
    case ISD::SELECT:      R = SoftenFloatRes_SELECT(N, ResNo); break;
    case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N, ResNo); break;
    case ISD::SINT_TO_FP:
@ -1932,7 +1933,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
    case ISD::UNDEF:      R = PromoteFloatRes_UNDEF(N); break;
-
+    case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
  }

  if (R.getNode())
@ -2166,3 +2167,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
                                               N->getValueType(0)));
 }

+SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+
+  AtomicSDNode *AM = cast<AtomicSDNode>(N);
+  SDLoc SL(N);
+
+  SDValue CastVal = BitConvertToInteger(AM->getVal());
+  EVT CastVT = CastVal.getValueType();
+
+  SDValue NewAtomic
+    = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, CastVT,
+                    DAG.getVTList(CastVT, MVT::Other),
+                    { AM->getChain(), AM->getBasePtr(), CastVal },
+                    AM->getMemOperand());
+
+  SDValue ResultCast = DAG.getNode(GetPromotionOpcode(VT, NFPVT), SL, NFPVT,
+                                   NewAtomic);
+  // Legalize the chain result by replacing uses of the old value chain with the
+  // new one
+  ReplaceValueWith(SDValue(N, 1), NewAtomic.getValue(1));
+
+  return ResultCast;
+
+}
+
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@ -640,6 +640,7 @@ private:
  SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
  SDValue PromoteFloatRes_UnaryOp(SDNode *N);
  SDValue PromoteFloatRes_UNDEF(SDNode *N);
+  SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
  SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);

  bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@ -583,6 +583,14 @@ void TargetLoweringBase::initActions() {
  std::fill(std::begin(TargetDAGCombineArray),
            std::end(TargetDAGCombineArray), 0);

+  for (MVT VT : MVT::fp_valuetypes()) {
+    MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+    if (IntVT.isValid()) {
+      setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
+      AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
+    }
+  }
+
  // Set default actions for various operations.
  for (MVT VT : MVT::all_valuetypes()) {
    // Default all indexed load / store to expand.
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@ -3431,10 +3431,17 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
  PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
  Assert(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
  Type *ElTy = PTy->getElementType();
-  Assert(ElTy->isIntegerTy(), "atomicrmw " +
-         AtomicRMWInst::getOperationName(Op) +
-         " operand must have integer type!",
-         &RMWI, ElTy);
+  if (Op == AtomicRMWInst::Xchg) {
+    Assert(ElTy->isIntegerTy() || ElTy->isFloatingPointTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer or floating point type!",
+           &RMWI, ElTy);
+  } else {
+    Assert(ElTy->isIntegerTy(), "atomicrmw " +
+           AtomicRMWInst::getOperationName(Op) +
+           " operand must have integer type!",
+           &RMWI, ElTy);
+  }
  checkAtomicMemAccessSize(ElTy, &RMWI);
  Assert(ElTy == RMWI.getOperand(1)->getType(),
         "Argument value type does not match pointer operand type!", &RMWI,
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -11655,9 +11655,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

-  return Builder.CreateTruncOrBitCast(
-      Builder.CreateCall(Ldxr, Addr),
-      cast<PointerType>(Addr->getType())->getElementType());
+  Type *EltTy = cast<PointerType>(Addr->getType())->getElementType();
+
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(EltTy));
+  Value *Trunc = Builder.CreateTrunc(Builder.CreateCall(Ldxr, Addr), IntEltTy);
+
+  return Builder.CreateBitCast(Trunc, EltTy);
 }

 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
@ -11692,6 +11696,10 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
  Type *Tys[] = { Addr->getType() };
  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+  Val = Builder.CreateBitCast(Val, IntValTy);
+
  return Builder.CreateCall(Stxr,
                            {Builder.CreateZExtOrBitCast(
                                 Val, Stxr->getFunctionType()->getParamType(0)),
--- a/test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
+++ b/test/Assembler/invalid-atomicrmw-xchg-must-be-integer-or-fp-type.ll
@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer or floating point type
+define void @f(i32** %ptr) {
+  atomicrmw xchg i32** %ptr, i32* null seq_cst
+  ret void
+}
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@ -761,6 +761,12 @@ define void @atomics(i32* %word) {
  ret void
 }

+define void @fp_atomics(float* %word) {
+; CHECK: %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.000000e+00 monotonic
+  %atomicrmw.xchg = atomicrmw xchg float* %word, float 1.0 monotonic
+  ret void
+}
+
 ;; Fast Math Flags
 define void @fastmathflags_unop(float %op1) {
  %f.nnan = fneg nnan float %op1
--- a/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@ -703,6 +703,16 @@ entry:
  ret void
 }

+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; CIVI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float* %out, float %in) {
+entry:
+  %gep = getelementptr float, float* %out, i32 4
+  %val = atomicrmw volatile xchg float* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; CIVI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GFX9: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
--- a/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@ -650,6 +650,15 @@ entry:
  ret void
 }

+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double* %out, double %in) {
+entry:
+  %gep = getelementptr double, double* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@ -839,6 +839,17 @@ entry:
  ret void
 }

+; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
+; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+
+; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
+entry:
+  %gep = getelementptr float, float addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile xchg float addrspace(1)* %gep, float %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; SIVI: buffer_store_dword [[RET]]
--- a/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@ -783,6 +783,17 @@ entry:
  ret void
 }

+; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
+; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+
+; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
+entry:
+  %gep = getelementptr double, double addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile xchg double addrspace(1)* %gep, double %in seq_cst
+  ret void
+}
+
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; CIVI: buffer_store_dwordx2 [[RET]]
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@ -36,6 +36,20 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out
  ret void
 }

+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_f32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f32_offset(float addrspace(1)* %out, float addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr float, float addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg float addrspace(3)* %gep, float 4.0 seq_cst
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
 ; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
 ; EG: LDS_ADD_RET *
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@ -27,6 +27,19 @@ define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out
  ret void
 }

+; GCN-LABEL: {{^}}lds_atomic_xchg_ret_f64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_xchg_ret_f64_offset(double addrspace(1)* %out, double addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr double, double addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg double addrspace(3)* %gep, double 4.0 seq_cst
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
 ; SICIVI: s_mov_b32 m0
 ; GFX9-NOT: m0
--- a/test/CodeGen/X86/atomic128.ll
+++ b/test/CodeGen/X86/atomic128.ll
@ -360,3 +360,27 @@ define void @atomic_store_relaxed(i128* %p, i128 %in) {
   store atomic i128 %in, i128* %p unordered, align 16
   ret void
 }
+
+
+@fsc128 = external global fp128
+
+define void @atomic_fetch_swapf128(fp128 %x) nounwind {
+; CHECK-LABEL: atomic_fetch_swapf128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rsi, %rcx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movq _fsc128@{{.*}}(%rip), %rsi
+; CHECK-NEXT:    movq (%rsi), %rax
+; CHECK-NEXT:    movq 8(%rsi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB14_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lock cmpxchg16b (%rsi)
+; CHECK-NEXT:    jne LBB14_1
+; CHECK-NEXT:  ## %bb.2: ## %atomicrmw.end
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  %t1 = atomicrmw xchg fp128* @fsc128, fp128 %x acquire
+  ret void
+}
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@ -2,6 +2,7 @@
 ; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32

@sc16 = external global i16
+@fsc16 = external global half

 define void @atomic_fetch_add16() nounwind {
 ; X64-LABEL:   atomic_fetch_add16
@ -273,3 +274,14 @@ define void @atomic_fetch_swap16(i16 %x) nounwind {
 ; X64:       ret
 ; X32:       ret
 }
+
+define void @atomic_fetch_swapf16(half %x) nounwind {
+  %t1 = atomicrmw xchg half* @fsc16, half %x acquire
+; X64-NOT:   lock
+; X64:       xchgw
+; X32-NOT:   lock
+; X32:       xchgw
+  ret void
+; X64:       ret
+; X32:       ret
+}
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@ -4,6 +4,7 @@
 ; RUN: llc < %s -O0 -mtriple=i686-unknown-unknown -mcpu=corei7 -mattr=-cmov,-sse -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOCMOV

@sc32 = external global i32
+@fsc32 = external global float

 define void @atomic_fetch_add32() nounwind {
 ; X64-LABEL: atomic_fetch_add32:
@ -708,3 +709,35 @@ define void @atomic_fetch_swap32(i32 %x) nounwind {
  %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
  ret void
 }
+
+define void @atomic_fetch_swapf32(float %x) nounwind {
+; X64-LABEL: atomic_fetch_swapf32:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    xchgl %eax, {{.*}}(%rip)
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    retq
+;
+; X86-CMOV-LABEL: atomic_fetch_swapf32:
+; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    pushl %eax
+; X86-CMOV-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-CMOV-NEXT:    movd %xmm0, %eax
+; X86-CMOV-NEXT:    xchgl %eax, fsc32
+; X86-CMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    popl %eax
+; X86-CMOV-NEXT:    retl
+;
+; X86-NOCMOV-LABEL: atomic_fetch_swapf32:
+; X86-NOCMOV:       # %bb.0:
+; X86-NOCMOV-NEXT:    subl $8, %esp
+; X86-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOCMOV-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    xchgl %eax, fsc32
+; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    addl $8, %esp
+; X86-NOCMOV-NEXT:    retl
+  %t1 = atomicrmw xchg float* @fsc32, float %x acquire
+  ret void
+}
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@ -1,6 +1,7 @@
 ; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64

@sc64 = external global i64
+@fsc64 = external global double

 define void @atomic_fetch_add64() nounwind {
 ; X64-LABEL:   atomic_fetch_add64:
@ -233,3 +234,16 @@ define void @atomic_fetch_swap64(i64 %x) nounwind {
 ; X64:       ret
 ; X32:       ret
 }
+
+define void @atomic_fetch_swapf64(double %x) nounwind {
+; X64-LABEL:   atomic_fetch_swapf64:
+; X32-LABEL:   atomic_fetch_swapf64:
+  %t1 = atomicrmw xchg double* @fsc64, double %x acquire
+; X64-NOT:   lock
+; X64:       xchgq
+; X32:       lock
+; X32:       xchg8b
+  ret void
+; X64:       ret
+; X32:       ret
+}
--- a/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s
+
+define void @atomic_swap_f16(half* %ptr, half %val) nounwind {
+; CHECK-LABEL: @atomic_swap_f16(
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f16(half* [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[VAL:%.*]] to i16
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f16(i64 [[TMP5]], half* [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    ret void
+;
+  %t1 = atomicrmw xchg half* %ptr, half %val acquire
+  ret void
+}
+
+define void @atomic_swap_f32(float* %ptr, float %val) nounwind {
+; CHECK-LABEL: @atomic_swap_f32(
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f32(float* [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0f32(i64 [[TMP5]], float* [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    ret void
+;
+  %t1 = atomicrmw xchg float* %ptr, float %val acquire
+  ret void
+}
+
+define void @atomic_swap_f64(double* %ptr, double %val) nounwind {
+; CHECK-LABEL: @atomic_swap_f64(
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.aarch64.ldaxr.p0f64(double* [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[VAL:%.*]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.aarch64.stxr.p0f64(i64 [[TMP3]], double* [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    ret void
+;
+  %t1 = atomicrmw xchg double* %ptr, double %val acquire
+  ret void
+}
--- a/test/Transforms/AtomicExpand/AArch64/lit.local.cfg
+++ b/test/Transforms/AtomicExpand/AArch64/lit.local.cfg
@ -0,0 +1,3 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
+
--- a/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
+++ b/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=i686-linux-gnu -atomic-expand %s | FileCheck %s
+
+define double @atomic_xchg_f64(double* %ptr) nounwind {
+; CHECK-LABEL: @atomic_xchg_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[PTR]] to i64*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    ret double [[TMP5]]
+;
+  %result = atomicrmw xchg double* %ptr, double 4.0 seq_cst
+  ret double %result
+}
+
+define double @atomic_xchg_f64_as1(double addrspace(1)* %ptr) nounwind {
+; CHECK-LABEL: @atomic_xchg_f64_as1(
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; CHECK:       atomicrmw.start:
+; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP3]], i64 4616189618054758400 seq_cst seq_cst
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    ret double [[TMP5]]
+;
+  %result = atomicrmw xchg double addrspace(1)* %ptr, double 4.0 seq_cst
+  ret double %result
+}