Use an offset from TOS for idempotent rmw locked op lowering
This was the portion split off D58632 so that it could follow the
redzone API cleanup. Note that I changed the preferred offset from -8
to -64. The difference should be very minor, but I thought it might
help address one concern which had been previously raised.

Differential Revision: https://reviews.llvm.org/D61862

llvm-svn: 360719
parent 4121656a8f
commit 7ef5656249
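For orientation, here is a minimal user-level sketch of the sequence this
change makes the backend emit, written as GCC/Clang inline asm rather than
the actual SelectionDAG lowering shown in the diff below. The helper name
idempotent_rmw_fence is hypothetical, and the -64 displacement assumes the
x86-64 SysV 128-byte red zone, so that bytes below %rsp are safe to touch.

    // A locked no-op OR drains the store buffer and orders memory like
    // mfence, but is cheaper on most cores. The -64 displacement keeps
    // the dummy access in a cache line distinct from the hot top of
    // stack, avoiding false sharing with real stack traffic.
    static inline void idempotent_rmw_fence() {
      asm volatile("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
    }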
@@ -26292,21 +26292,31 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
   //    here since it doesn't require an extra register.
   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
   //    is small enough it might just be measurement noise.)
-  // 4) For the moment, we are using top of stack.  This creates false sharing
-  //    with actual stack access/call sequences, and it would be better to use a
-  //    location within the redzone.  For the moment, this is still better than an
-  //    mfence though.  TODO: Revise the offset used when we can assume a redzone.
+  // 4) When choosing offsets, there are several contributing factors:
+  //    a) If there's no redzone, we default to TOS.  (We could allocate a cache
+  //       line aligned stack object to improve this case.)
+  //    b) To minimize our chances of introducing a false dependence, we prefer
+  //       to offset the stack usage from TOS slightly.
+  //    c) To minimize concerns about cross thread stack usage - in particular,
+  //       the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
+  //       captures state in the TOS frame and accesses it from many threads -
+  //       we want to use an offset such that the offset is in a distinct cache
+  //       line from the TOS frame.
+  //
+  // For a general discussion of the tradeoffs and benchmark results, see:
+  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+
+  auto &MF = DAG.getMachineFunction();
+  auto &TFL = *Subtarget.getFrameLowering();
+  const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
 
   if (Subtarget.is64Bit()) {
     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
     SDValue Ops[] = {
       DAG.getRegister(X86::RSP, MVT::i64),           // Base
       DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
       DAG.getRegister(0, MVT::i64),                  // Index
-      DAG.getTargetConstant(0, DL, MVT::i32),        // Disp
+      DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
       DAG.getRegister(0, MVT::i16),                  // Segment.
       Zero,
       Chain};
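As a rough illustration of the offset policy the hunk above implements:
lockedOpStackDisp below is a hypothetical standalone function, not LLVM
API; it only mirrors the TFL.has128ByteRedZone(MF) ? -64 : 0 selection
and spells out why each value is safe.

    // Pick the displacement for the dummy locked operation.
    static int lockedOpStackDisp(bool Has128ByteRedZone) {
      // The x86-64 SysV ABI reserves the 128 bytes below %rsp (the red
      // zone): signal frames are pushed beyond it, so a leaf access
      // there is safe without adjusting %rsp.
      if (Has128ByteRedZone)
        return -64; // Inside the red zone, and in a different cache
                    // line than the TOS frame (factor c above).
      return 0;     // No red zone: (%rsp) itself is the only safe slot.
    }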
@@ -26320,7 +26330,7 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
       DAG.getRegister(X86::ESP, MVT::i32),           // Base
       DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
       DAG.getRegister(0, MVT::i32),                  // Index
-      DAG.getTargetConstant(0, DL, MVT::i32),        // Disp
+      DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
       DAG.getRegister(0, MVT::i16),                  // Segment.
       Zero,
       Chain
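Note that the 32-bit path above reuses the same SPOffset, but the red zone
is an x86-64 SysV feature, so has128ByteRedZone should only return true on
64-bit targets; on 32-bit x86 the displacement stays 0 and the locked op
keeps targeting (%esp) directly.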
@@ -205,7 +205,7 @@ define void @or32_nouse_acq_rel(i32* %p) {
 define void @or32_nouse_seq_cst(i32* %p) {
 ; X64-LABEL: or32_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    lock orl $0, (%rsp)
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: or32_nouse_seq_cst:
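In the updated CHECK lines, -{{[0-9]+}}(%rsp) is a FileCheck regex rather
than literal assembly: it matches any negative decimal displacement off
%rsp. That keeps these tests agnostic to the exact offset, so the preferred
value can be retuned without churning every test; the
speculative-load-hardening test at the end pins the concrete -64 instead.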
@@ -220,7 +220,7 @@ define void @or32_nouse_seq_cst(i32* %p) {
 define void @or64_nouse_seq_cst(i64* %p) {
 ; X64-LABEL: or64_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    lock orl $0, (%rsp)
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: or64_nouse_seq_cst:
@@ -294,7 +294,7 @@ define void @or128_nouse_seq_cst(i128* %p) {
 define void @or16_nouse_seq_cst(i16* %p) {
 ; X64-LABEL: or16_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    lock orl $0, (%rsp)
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: or16_nouse_seq_cst:
@@ -308,7 +308,7 @@ define void @or16_nouse_seq_cst(i16* %p) {
 define void @or8_nouse_seq_cst(i8* %p) {
 ; X64-LABEL: or8_nouse_seq_cst:
 ; X64:       # %bb.0:
-; X64-NEXT:    lock orl $0, (%rsp)
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: or8_nouse_seq_cst:
@@ -1151,14 +1151,14 @@ define void @idempotent_atomic(i32* %x) speculative_load_hardening {
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    lock orl $0, (%rsp)
+; X64-NEXT:    lock orl $0, -64(%rsp)
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    retq
 ;
 ; X64-LFENCE-LABEL: idempotent_atomic:
 ; X64-LFENCE:       # %bb.0:
-; X64-LFENCE-NEXT:    lock orl $0, (%rsp)
+; X64-LFENCE-NEXT:    lock orl $0, -64(%rsp)
 ; X64-LFENCE-NEXT:    retq
 %tmp = atomicrmw or i32* %x, i32 0 seq_cst
 ret void
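The input here is %tmp = atomicrmw or i32* %x, i32 0 seq_cst with an unused
result: OR-ing zero never changes the value at %x, so the only observable
effect of the operation is its seq_cst ordering, and any locked instruction
provides that. This is why the lowering can ignore %x entirely and emit a
locked no-op OR against the local stack, now at -64(%rsp) when the red zone
is available.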