; RUN: llc < %s -mtriple=powerpc-apple-darwin -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction).
; This is already checked for in Atomics-64.ll
; RUN: llc < %s -mtriple=powerpc64-apple-darwin | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64

; FIXME: we don't currently check for the operations themselves with CHECK-NEXT,
; because they are implemented in a very messy way with lwarx/stwcx.
; It should be fixed soon in another patch.

; We first check loads, for all sizes from i8 to i64.
; We also vary orderings to check for barriers.
; Unordered i8 load needs no barrier: a plain lbz is sufficient.
define i8 @load_i8_unordered(i8* %mem) {
; CHECK-LABEL: load_i8_unordered
; CHECK: lbz
; CHECK-NOT: sync
  %val = load atomic i8, i8* %mem unordered, align 1
  ret i8 %val
}
; Monotonic i16 load needs no barrier: a plain lhz is sufficient.
define i16 @load_i16_monotonic(i16* %mem) {
; CHECK-LABEL: load_i16_monotonic
; CHECK: lhz
; CHECK-NOT: sync
  %val = load atomic i16, i16* %mem monotonic, align 2
  ret i16 %val
}
; Acquire i32 load: plain lwz followed by an acquire barrier
; (lwsync on PPC32; cmp/bne-/isync sequence on PPC64).
define i32 @load_i32_acquire(i32* %mem) {
; CHECK-LABEL: load_i32_acquire
; CHECK: lwz [[VAL:r[0-9]+]]
  %val = load atomic i32, i32* %mem acquire, align 4
; CHECK-PPC32: lwsync
; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
; CHECK-PPC64: bne- [[CR]], .+4
; CHECK-PPC64: isync
  ret i32 %val
}
; Seq_cst i64 load: PPC32 has no native 64-bit atomics and must call a
; __sync_* library routine; PPC64 uses sync + ld + acquire sequence.
define i64 @load_i64_seq_cst(i64* %mem) {
; CHECK-LABEL: load_i64_seq_cst
; CHECK: sync
; PPC32: __sync_
; PPC64-NOT: __sync_
; PPC64: ld [[VAL:r[0-9]+]]
  %val = load atomic i64, i64* %mem seq_cst, align 8
; CHECK-PPC32: lwsync
; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
; CHECK-PPC64: bne- [[CR]], .+4
; CHECK-PPC64: isync
  ret i64 %val
}

; Stores
; Unordered i8 store needs no barrier: a plain stb is sufficient.
define void @store_i8_unordered(i8* %mem) {
; CHECK-LABEL: store_i8_unordered
; CHECK-NOT: sync
; CHECK: stb
  store atomic i8 42, i8* %mem unordered, align 1
  ret void
}
; Monotonic i16 store needs no barrier: a plain sth is sufficient.
define void @store_i16_monotonic(i16* %mem) {
; CHECK-LABEL: store_i16_monotonic
; CHECK-NOT: sync
; CHECK: sth
  store atomic i16 42, i16* %mem monotonic, align 2
  ret void
}
; Release i32 store: lwsync barrier before a plain stw.
define void @store_i32_release(i32* %mem) {
; CHECK-LABEL: store_i32_release
; CHECK: lwsync
; CHECK: stw
  store atomic i32 42, i32* %mem release, align 4
  ret void
}
; Seq_cst i64 store: PPC32 has no native 64-bit atomics and must call a
; __sync_* library routine; PPC64 uses a full sync barrier then std.
define void @store_i64_seq_cst(i64* %mem) {
; CHECK-LABEL: store_i64_seq_cst
; CHECK: sync
; PPC32: __sync_
; PPC64-NOT: __sync_
; PPC64: std
  store atomic i64 42, i64* %mem seq_cst, align 8
  ret void
}

; Atomic CmpXchg
; Seq_cst cmpxchg: full sync before, lwsync after the ll/sc loop.
define i8 @cas_strong_i8_sc_sc(i8* %mem) {
; CHECK-LABEL: cas_strong_i8_sc_sc
; CHECK: sync
  %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst
; CHECK: lwsync
  %loaded = extractvalue { i8, i1} %val, 0
  ret i8 %loaded
}
; Acquire/acquire cmpxchg: no leading barrier, trailing acquire barrier.
define i16 @cas_weak_i16_acquire_acquire(i16* %mem) {
; CHECK-LABEL: cas_weak_i16_acquire_acquire
;CHECK-NOT: sync
  %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire
; CHECK: lwsync
  %loaded = extractvalue { i16, i1} %val, 0
  ret i16 %loaded
}
; Acq_rel/acquire cmpxchg: lwsync both before and after the ll/sc loop.
define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) {
; CHECK-LABEL: cas_strong_i32_acqrel_acquire
; CHECK: lwsync
  %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire
; CHECK: lwsync
  %loaded = extractvalue { i32, i1} %val, 0
  ret i32 %loaded
}
; Release/monotonic cmpxchg: leading lwsync only, no trailing barrier.
define i64 @cas_weak_i64_release_monotonic(i64* %mem) {
; CHECK-LABEL: cas_weak_i64_release_monotonic
; CHECK: lwsync
  %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic
; CHECK-NOT: [sync ]
  %loaded = extractvalue { i64, i1} %val, 0
  ret i64 %loaded
}

; AtomicRMW
; Monotonic atomicrmw needs no barriers at all.
define i8 @add_i8_monotonic(i8* %mem, i8 %operand) {
; CHECK-LABEL: add_i8_monotonic
; CHECK-NOT: sync
  %val = atomicrmw add i8* %mem, i8 %operand monotonic
  ret i8 %val
}
; Seq_cst atomicrmw: full sync before, lwsync after the ll/sc loop.
define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) {
; CHECK-LABEL: xor_i16_seq_cst
; CHECK: sync
  %val = atomicrmw xor i16* %mem, i16 %operand seq_cst
; CHECK: lwsync
  ret i16 %val
}
; Acq_rel atomicrmw: lwsync both before and after the ll/sc loop.
define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) {
; CHECK-LABEL: xchg_i32_acq_rel
; CHECK: lwsync
  %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel
; CHECK: lwsync
  ret i32 %val
}
; Release atomicrmw: leading lwsync only, no trailing barrier.
define i64 @and_i64_release(i64* %mem, i64 %operand) {
; CHECK-LABEL: and_i64_release
; CHECK: lwsync
  %val = atomicrmw and i64* %mem, i64 %operand release
; CHECK-NOT: [sync ]
  ret i64 %val
}