mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] Add volatile support to SIMemoryLegalizer
Treat a non-atomic volatile load and store as a relaxed atomic at system scope for the address spaces accessed. This will ensure all relevant caches will be bypassed. A volatile atomic is not changed and still only bypasses caches upto the level specified by the SyncScope operand. Differential Revision: https://reviews.llvm.org/D94214
This commit is contained in:
parent
29f461772d
commit
ccb53c0a97
@ -4736,18 +4736,48 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- nontemporal
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1 slc=1
|
||||
|
||||
load *none* *none* - local 1. ds_load
|
||||
store *none* *none* - global - !nontemporal
|
||||
store *none* *none* - global - !volatile & !nontemporal
|
||||
- generic
|
||||
- private 1. buffer/global/flat_store
|
||||
- constant
|
||||
- nontemporal
|
||||
- volatile & !nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
glc=1 slc=1
|
||||
@ -6008,18 +6038,48 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
|
||||
|
||||
1. buffer/global/flat_load
|
||||
glc=1 dlc=1
|
||||
2. s_waitcnt vmcnt(0)
|
||||
|
||||
- nontemporal
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_load
|
||||
slc=1
|
||||
|
||||
load *none* *none* - local 1. ds_load
|
||||
store *none* *none* - global - !nontemporal
|
||||
store *none* *none* - global - !volatile & !nontemporal
|
||||
- generic
|
||||
- private 1. buffer/global/flat_store
|
||||
- constant
|
||||
- nontemporal
|
||||
- volatile & !nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
2. s_waitcnt vscnt(0)
|
||||
|
||||
- Must happen before
|
||||
any following volatile
|
||||
global/generic
|
||||
load/store.
|
||||
- Ensures that
|
||||
volatile
|
||||
operations to
|
||||
different
|
||||
addresses will not
|
||||
be reordered by
|
||||
hardware.
|
||||
|
||||
- !volatile & nontemporal
|
||||
|
||||
1. buffer/global/flat_store
|
||||
slc=1
|
||||
|
@ -110,6 +110,7 @@ private:
|
||||
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
bool IsCrossAddressSpaceOrdering = false;
|
||||
bool IsVolatile = false;
|
||||
bool IsNonTemporal = false;
|
||||
|
||||
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
|
||||
@ -119,11 +120,13 @@ private:
|
||||
bool IsCrossAddressSpaceOrdering = true,
|
||||
AtomicOrdering FailureOrdering =
|
||||
AtomicOrdering::SequentiallyConsistent,
|
||||
bool IsVolatile = false,
|
||||
bool IsNonTemporal = false)
|
||||
: Ordering(Ordering), FailureOrdering(FailureOrdering),
|
||||
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
|
||||
InstrAddrSpace(InstrAddrSpace),
|
||||
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
|
||||
IsVolatile(IsVolatile),
|
||||
IsNonTemporal(IsNonTemporal) {
|
||||
// There is also no cross address space ordering if the ordering
|
||||
// address space is the same as the instruction address space and
|
||||
@ -171,7 +174,13 @@ public:
|
||||
}
|
||||
|
||||
/// \returns True if memory access of the machine instruction used to
|
||||
/// create this SIMemOpInfo is non-temporal, false otherwise.
|
||||
/// create this SIMemOpInfo is volatile, false otherwise.
|
||||
bool isVolatile() const {
|
||||
return IsVolatile;
|
||||
}
|
||||
|
||||
/// \returns True if memory access of the machine instruction used to
|
||||
/// create this SIMemOpInfo is nontemporal, false otherwise.
|
||||
bool isNonTemporal() const {
|
||||
return IsNonTemporal;
|
||||
}
|
||||
@ -259,10 +268,13 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const = 0;
|
||||
|
||||
/// Update \p MI memory instruction to indicate it is
|
||||
/// nontemporal. Return true iff the instruction was modified.
|
||||
virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
|
||||
const = 0;
|
||||
/// Update \p MI memory instruction of kind \p Op associated with address
|
||||
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
|
||||
/// true iff the instruction was modified.
|
||||
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace,
|
||||
SIMemOp Op, bool IsVolatile,
|
||||
bool IsNonTemporal) const = 0;
|
||||
|
||||
/// Inserts any necessary instructions at position \p Pos relative
|
||||
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
|
||||
@ -328,7 +340,10 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const override;
|
||||
|
||||
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
|
||||
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile,
|
||||
bool IsNonTemporal) const override;
|
||||
|
||||
bool insertWait(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicScope Scope,
|
||||
@ -378,7 +393,10 @@ public:
|
||||
SIAtomicScope Scope,
|
||||
SIAtomicAddrSpace AddrSpace) const override;
|
||||
|
||||
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
|
||||
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile,
|
||||
bool IsNonTemporal) const override;
|
||||
|
||||
bool insertWait(MachineBasicBlock::iterator &MI,
|
||||
SIAtomicScope Scope,
|
||||
@ -529,11 +547,13 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
|
||||
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
|
||||
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
||||
bool IsNonTemporal = true;
|
||||
bool IsVolatile = false;
|
||||
|
||||
// Validator should check whether or not MMOs cover the entire set of
|
||||
// locations accessed by the memory instruction.
|
||||
for (const auto &MMO : MI->memoperands()) {
|
||||
IsNonTemporal &= MMO->isNonTemporal();
|
||||
IsVolatile |= MMO->isVolatile();
|
||||
InstrAddrSpace |=
|
||||
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
|
||||
AtomicOrdering OpOrdering = MMO->getOrdering();
|
||||
@ -576,7 +596,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
|
||||
}
|
||||
}
|
||||
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
|
||||
IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
|
||||
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
|
||||
IsNonTemporal);
|
||||
}
|
||||
|
||||
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
|
||||
@ -703,14 +724,43 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIGfx6CacheControl::enableNonTemporal(
|
||||
const MachineBasicBlock::iterator &MI) const {
|
||||
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
|
||||
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile, bool IsNonTemporal) const {
|
||||
// Only handle load and store, not atomic read-modify-write insructions. The
|
||||
// latter use glc to indicate if the atomic returns a result and so must not
|
||||
// be used for cache control.
|
||||
assert(MI->mayLoad() ^ MI->mayStore());
|
||||
|
||||
// Only update load and store, not LLVM IR atomic read-modify-write
|
||||
// instructions. The latter are always marked as volatile so cannot sensibly
|
||||
// handle it as do not want to pessimize all atomics. Also they do not support
|
||||
// the nontemporal attribute.
|
||||
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
/// TODO: Do not enableGLCBit if rmw atomic.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
if (IsVolatile) {
|
||||
if (Op == SIMemOp::LOAD)
|
||||
Changed |= enableGLCBit(MI);
|
||||
|
||||
// Ensure operation has completed at system scope to cause all volatile
|
||||
// operations to be visible outside the program in a global order. Do not
|
||||
// request cross address space as only the global address space can be
|
||||
// observable outside the program, so no need to cause a waitcnt for LDS
|
||||
// address space operations.
|
||||
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
||||
Position::AFTER);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
@ -732,7 +782,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
||||
bool VMCnt = false;
|
||||
bool LGKMCnt = false;
|
||||
|
||||
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
||||
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
||||
SIAtomicAddrSpace::NONE) {
|
||||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
@ -959,13 +1010,45 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
|
||||
return Changed;
|
||||
}
|
||||
|
||||
bool SIGfx10CacheControl::enableNonTemporal(
|
||||
const MachineBasicBlock::iterator &MI) const {
|
||||
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
|
||||
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
||||
bool IsVolatile, bool IsNonTemporal) const {
|
||||
|
||||
// Only handle load and store, not atomic read-modify-write insructions. The
|
||||
// latter use glc to indicate if the atomic returns a result and so must not
|
||||
// be used for cache control.
|
||||
assert(MI->mayLoad() ^ MI->mayStore());
|
||||
|
||||
// Only update load and store, not LLVM IR atomic read-modify-write
|
||||
// instructions. The latter are always marked as volatile so cannot sensibly
|
||||
// handle it as do not want to pessimize all atomics. Also they do not support
|
||||
// the nontemporal attribute.
|
||||
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
Changed |= enableSLCBit(MI);
|
||||
/// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
|
||||
if (IsVolatile) {
|
||||
|
||||
if (Op == SIMemOp::LOAD) {
|
||||
Changed |= enableGLCBit(MI);
|
||||
Changed |= enableDLCBit(MI);
|
||||
}
|
||||
|
||||
// Ensure operation has completed at system scope to cause all volatile
|
||||
// operations to be visible outside the program in a global order. Do not
|
||||
// request cross address space as only the global address space can be
|
||||
// observable outside the program, so no need to cause a waitcnt for LDS
|
||||
// address space operations.
|
||||
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
||||
Position::AFTER);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
if (IsNonTemporal) {
|
||||
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
|
||||
Changed |= enableSLCBit(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
@ -988,7 +1071,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
||||
bool VSCnt = false;
|
||||
bool LGKMCnt = false;
|
||||
|
||||
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
||||
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
||||
SIAtomicAddrSpace::NONE) {
|
||||
switch (Scope) {
|
||||
case SIAtomicScope::SYSTEM:
|
||||
case SIAtomicScope::AGENT:
|
||||
@ -1191,12 +1275,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions do not have the nontemporal attribute.
|
||||
if (MOI.isNonTemporal()) {
|
||||
Changed |= CC->enableNonTemporal(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions already bypass caches to the scope specified by the
|
||||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
||||
// need additional treatment.
|
||||
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
|
||||
SIMemOp::LOAD, MOI.isVolatile(),
|
||||
MOI.isNonTemporal());
|
||||
return Changed;
|
||||
}
|
||||
|
||||
@ -1217,12 +1301,12 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions do not have the nontemporal attribute.
|
||||
if (MOI.isNonTemporal()) {
|
||||
Changed |= CC->enableNonTemporal(MI);
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Atomic instructions already bypass caches to the scope specified by the
|
||||
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
||||
// need additional treatment.
|
||||
Changed |= CC->enableVolatileAndOrNonTemporal(
|
||||
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
|
||||
MOI.isNonTemporal());
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -59,9 +59,11 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB3_2: ; %bb1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
entry:
|
||||
%trunc = trunc i32 %cond to i1
|
||||
br i1 %trunc, label %bb0, label %bb1
|
||||
@ -88,9 +90,11 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB4_2: ; %bb1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 1
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
entry:
|
||||
%trunc0 = trunc i32 %cond0 to i1
|
||||
%trunc1 = trunc i32 %cond1 to i1
|
||||
|
@ -12,7 +12,8 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB0_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB0_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -39,7 +40,8 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB1_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB1_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -68,7 +70,8 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB2_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB2_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -99,7 +102,8 @@ define i32 @divergent_if_nonboolean_condition1(i32 addrspace(1)* %ptr) {
|
||||
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; CHECK-NEXT: s_cbranch_execz BB3_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %if.true
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: BB3_2: ; %endif
|
||||
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -219,7 +223,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
|
||||
; CHECK-NEXT: s_cbranch_vccnz BB5_1
|
||||
; CHECK-NEXT: ; %bb.3: ; %bb4
|
||||
; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1
|
||||
; CHECK-NEXT: global_load_dword v2, v[0:1], off
|
||||
; CHECK-NEXT: global_load_dword v2, v[0:1], off glc
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2
|
||||
; CHECK-NEXT: s_branch BB5_1
|
||||
|
@ -15,8 +15,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 4, s1
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_kernel:
|
||||
@ -34,7 +36,9 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -61,8 +65,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_kernel:
|
||||
@ -79,7 +85,9 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -107,8 +115,9 @@ define void @store_load_vindex_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -124,9 +133,9 @@ define void @store_load_vindex_foo(i32 %idx) {
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%i = alloca [32 x float], align 4, addrspace(5)
|
||||
@ -176,10 +185,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 0x104, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
|
||||
@ -190,7 +202,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -200,7 +212,9 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 0x104, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 0x104, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -224,7 +238,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
||||
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
|
||||
@ -233,8 +247,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
|
||||
@ -251,9 +267,12 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -278,9 +297,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
|
||||
@ -288,8 +307,9 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -306,11 +326,12 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%padding = alloca [64 x i32], align 4, addrspace(5)
|
||||
@ -341,10 +362,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
|
||||
; GFX9-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s2 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, 0x4004, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
|
||||
@ -355,7 +379,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
|
||||
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -365,7 +389,9 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, 0x4004, s0
|
||||
; GFX10-NEXT: s_add_u32 s1, 0x4004, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -389,7 +415,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
||||
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
|
||||
@ -398,8 +424,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
|
||||
@ -416,9 +444,12 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -443,9 +474,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
|
||||
; GFX9-NEXT: scratch_load_dword v1, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
|
||||
@ -453,8 +484,9 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 15
|
||||
; GFX9-NEXT: scratch_store_dword v1, v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -471,11 +503,12 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off
|
||||
; GFX10-NEXT: scratch_load_dword v2, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: scratch_store_dword v0, v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%padding = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -501,11 +534,14 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 13
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, 0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_large_imm_offset_kernel:
|
||||
@ -520,8 +556,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
|
||||
; GFX10-NEXT: s_add_u32 s1, 4, 0
|
||||
; GFX10-NEXT: s_add_u32 s0, 4, s0
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%i = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -541,11 +580,13 @@ define void @store_load_large_imm_offset_foo() {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 13
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, 0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 15
|
||||
; GFX9-NEXT: s_add_u32 s0, s32, s0
|
||||
; GFX9-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -559,10 +600,11 @@ define void @store_load_large_imm_offset_foo() {
|
||||
; GFX10-NEXT: s_add_u32 s1, s32, 0
|
||||
; GFX10-NEXT: s_add_u32 s0, s32, s0
|
||||
; GFX10-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
%i = alloca [4096 x i32], align 4, addrspace(5)
|
||||
@ -586,7 +628,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
|
||||
; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
|
||||
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: store_load_vidx_sidx_offset:
|
||||
@ -601,7 +645,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
|
||||
; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
|
||||
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
|
||||
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_endpgm
|
||||
bb:
|
||||
%alloca = alloca [32 x i32], align 4, addrspace(5)
|
||||
@ -621,7 +667,8 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -632,9 +679,9 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile i64 15, i64 addrspace(5)* %arg, align 8
|
||||
@ -649,7 +696,8 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -660,9 +708,9 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile i64 15, i64 addrspace(5)* %arg, align 1
|
||||
@ -681,7 +729,8 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
|
||||
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -696,9 +745,9 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
|
||||
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
|
||||
@ -719,7 +768,8 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
|
||||
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -736,9 +786,9 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
|
||||
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
|
||||
|
@ -13,14 +13,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
@ -43,16 +44,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v2, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s0
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8
|
||||
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_med3_f32 v0, v0, v1, v2
|
||||
; VI-NEXT: flat_store_dword v[6:7], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -62,12 +64,13 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrs
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -98,19 +101,19 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
|
||||
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
|
||||
; SI-NEXT: v_min_f32_e32 v5, v2, v3
|
||||
; SI-NEXT: v_max_f32_e32 v2, v2, v3
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
|
||||
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: v_min_f32_e32 v2, v2, v3
|
||||
@ -138,21 +141,21 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4
|
||||
; VI-NEXT: v_min_f32_e32 v5, v4, v2
|
||||
; VI-NEXT: v_max_f32_e32 v2, v4, v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
|
||||
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
|
||||
; VI-NEXT: v_min_f32_e32 v2, v2, v3
|
||||
@ -167,17 +170,17 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
|
||||
@ -213,15 +216,16 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
|
||||
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
|
||||
; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
@ -245,17 +249,18 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e32 v4, s2, v7
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -266,13 +271,14 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
|
||||
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -310,17 +316,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
|
||||
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
@ -344,19 +350,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5]
|
||||
; VI-NEXT: flat_load_dword v7, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v3, v[4:5] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -367,15 +373,15 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
|
||||
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
|
||||
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
|
||||
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
|
||||
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
|
||||
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -415,16 +421,16 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
|
||||
; SI-NEXT: v_add_f32_e32 v4, 4.0, v4
|
||||
; SI-NEXT: v_min_f32_e32 v5, v2, v3
|
||||
; SI-NEXT: v_max_f32_e32 v2, v2, v3
|
||||
@ -454,20 +460,20 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
@ -482,16 +488,16 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out,
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
@ -536,22 +542,23 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_min_f32_e32 v2, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
@ -578,20 +585,20 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[4:5]
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
@ -599,6 +606,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_max_f32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@ -607,17 +615,18 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3

@ -810,8 +810,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -874,8 +877,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s8
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s9
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -954,9 +960,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_v:
|
||||
@ -1020,9 +1030,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1078,9 +1092,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_s:
|
||||
@ -1121,9 +1139,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1147,9 +1169,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s:
|
||||
@ -1158,9 +1184,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double i
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v0, s2
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v1, s3
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1235,9 +1265,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_v:
|
||||
@ -1299,9 +1333,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1346,9 +1384,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[14:15]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[12:13]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_v:
|
||||
@ -1378,9 +1420,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double i
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s2, vcc_lo
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s3, vcc_lo
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1404,9 +1450,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
|
||||
; GPRIDX-NEXT: s_set_gpr_idx_off
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s:
|
||||
@ -1415,9 +1465,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v0, v16
|
||||
; MOVREL-NEXT: v_movreld_b32_e32 v1, v17
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1460,9 +1514,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
|
||||
@ -1494,9 +1552,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%insert = insertelement <8 x double> %vec, double %val, i32 %idx
|
||||
@ -1978,24 +2040,25 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_nop 0
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GPRIDX-NEXT: v_mov_b32_e32 v3, s15
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_s_add_1:
|
||||
@ -2035,9 +2098,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %v
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v14, s14
|
||||
; MOVREL-NEXT: v_mov_b32_e32 v15, s15
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%idx.add = add i32 %idx, 1
|
||||
@ -2082,9 +2149,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13]
|
||||
; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
|
||||
; GPRIDX-NEXT: s_endpgm
|
||||
;
|
||||
; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1:
|
||||
@ -2117,9 +2188,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6
|
||||
; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
|
||||
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; MOVREL-NEXT: s_endpgm
|
||||
entry:
|
||||
%idx.add = add i32 %idx, 1
|
||||
|
@ -818,9 +818,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
|
||||
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
|
||||
; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8
|
||||
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX7-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s0, 1, s0
|
||||
@ -829,7 +832,6 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
|
||||
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
|
||||
; GFX7-NEXT: s_endpgm
|
||||
@ -848,9 +850,12 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 8, v1
|
||||
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; GFX8-NEXT: flat_load_dword v1, v[1:2]
|
||||
; GFX8-NEXT: flat_load_dword v2, v[3:4]
|
||||
; GFX8-NEXT: flat_load_dword v3, v[5:6]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[1:2] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v2, v[3:4] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v3, v[5:6] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, 8
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
@ -859,8 +864,7 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
|
||||
; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: s_nop 0
|
||||
; GFX8-NEXT: s_nop 1
|
||||
; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -874,16 +878,17 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54
|
||||
; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10_W32-NEXT: s_clause 0x2
|
||||
; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7]
|
||||
; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4
|
||||
; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8
|
||||
; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10_W32-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
|
||||
; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
|
||||
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1
|
||||
; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8
|
||||
@ -896,16 +901,17 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
|
||||
; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54
|
||||
; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10_W64-NEXT: s_clause 0x2
|
||||
; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7]
|
||||
; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4
|
||||
; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8
|
||||
; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4 glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8 glc dlc
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10_W64-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
|
||||
; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1]
|
||||
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1
|
||||
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8
|
||||
|
@ -14,10 +14,11 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -34,8 +35,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@ -48,9 +50,9 @@ define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
@ -79,10 +81,11 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -99,8 +102,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@ -113,9 +117,9 @@ define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
@ -147,8 +151,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -168,8 +173,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -183,9 +189,9 @@ define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, doubl
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
@ -217,8 +223,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -238,8 +245,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -253,9 +261,9 @@ define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, doubl
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
|
||||
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
@ -1056,12 +1064,12 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
@ -1078,11 +1086,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -1094,12 +1102,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
@ -1129,10 +1136,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX7-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2
|
||||
; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
@ -1150,8 +1158,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
|
||||
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
|
||||
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
|
||||
@ -1165,9 +1174,9 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
|
||||
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
|
||||
|
@ -16,11 +16,13 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: BB0_2: ; %bb
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,10 +15,12 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: BB0_2: ; %bb
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,6 +15,7 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -15,6 +15,7 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg0, 0

@ -14,9 +14,10 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x11
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
@ -27,10 +28,10 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dword v[0:1], v0, off
@ -59,6 +60,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: BB1_2: ; %bb1
; CI-NEXT: s_endpgm
;
@ -76,6 +78,7 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(i8* %ptr)

@ -14,9 +14,10 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_load_dword s0, s[4:5], 0x10
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; CI-NEXT: flat_store_dword v[0:1], v0
@ -27,10 +28,10 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dword v[0:1], v0, off
@ -59,6 +60,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: flat_store_dword v[0:1], v0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: BB1_2: ; %bb1
; CI-NEXT: s_endpgm
;
@ -76,6 +78,7 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)

@ -32,6 +32,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; CI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_trig_preop_f64:
@ -42,6 +43,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_trig_preop_f64:
@ -52,6 +54,7 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b)
store volatile double %result, double* undef
@ -65,6 +68,7 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7)
store volatile double %result, double* undef

@ -17,17 +17,23 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
|
||||
; GFX9-NEXT: ; %bb.1: ; %bb1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB0_2: ; %Flow
|
||||
; GFX9-NEXT: s_xor_b32 s0, s0, -1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
@ -36,16 +42,22 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
|
||||
; GFX9-NEXT: ; %bb.3: ; %bb0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB0_4: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -104,7 +116,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB1_2: ; %Flow
|
||||
; GFX9-NEXT: s_xor_b32 s0, s0, -1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
@ -123,7 +137,9 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB1_4: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -165,11 +181,13 @@ define void @localize_internal_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB2_2: ; %Flow
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
|
||||
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
|
||||
@ -180,11 +198,13 @@ define void @localize_internal_globals(i1 %cond) {
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_getpc_b64 s[6:7]
|
||||
; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: BB2_4: ; %bb2
|
||||
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -468,7 +468,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
|
||||
; GFX6-NEXT: s_mov_b32 s1, s3
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -478,7 +478,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(float addrspace(1)* inreg %ptr) {
|
||||
; GFX7-NEXT: s_mov_b32 s1, s3
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%val = load volatile float, float addrspace(1)* %ptr
|
||||
@ -493,7 +493,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -504,7 +504,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(float addrspace(1)* inreg
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4
|
||||
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
|
||||
@ -523,7 +523,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)*
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -537,7 +537,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(float addrspace(1)*
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: s_mov_b32 s2, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
|
||||
@ -556,7 +556,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)*
|
||||
; GFX6-NEXT: s_mov_b32 s2, 0
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -570,7 +570,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(float addrspace(1)*
|
||||
; GFX7-NEXT: s_mov_b32 s2, 0
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
|
||||
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
|
||||
@ -586,7 +586,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x4000
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -597,7 +597,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(float addrspace(1)* inreg
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x4000
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
@ -612,7 +612,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr)
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -622,7 +622,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(float addrspace(1)* %ptr)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4095
@ -637,7 +637,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)*
; GFX6-NEXT: s_mov_b32 s1, 4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, s0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -647,7 +647,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(float addrspace(1)*
; GFX7-NEXT: s_mov_b32 s1, 4
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s0
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296
@ -662,7 +662,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)*
; GFX6-NEXT: s_mov_b32 s1, s0
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -672,7 +672,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(float addrspace(1)*
; GFX7-NEXT: s_mov_b32 s1, s0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297
@ -687,7 +687,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b64 s[0:1], 0
; GFX6-NEXT: s_movk_i32 s4, 0x4000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -697,7 +697,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(float addrspace(1)* %ptr)
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: s_movk_i32 s4, 0x4000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i64 4096
@ -716,7 +716,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -730,7 +730,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -745,7 +745,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -755,7 +755,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(float addrspace(1)* %ptr
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -770,7 +770,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -780,7 +780,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset
@ -802,7 +802,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, s5
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -818,7 +818,7 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspac
; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, s5
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256
@ -836,7 +836,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inre
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -848,7 +848,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(float addrspace(1)* inre
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
@ -866,7 +866,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspa
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_movk_i32 s4, 0x3ffc
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -879,7 +879,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(float addrspa
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_movk_i32 s4, 0x3ffc
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset
@ -896,7 +896,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_addc_u32 s5, s3, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
@ -908,7 +908,7 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_addc_u32 s5, s3, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
|
@ -56,6 +56,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: BB0_3: ; %bb.2
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm

entry:
@ -129,6 +130,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: BB1_2: ; %bb.1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
entry:
%cond = icmp eq i32 %arg.cond, 0
@ -193,9 +195,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]

entry:
@ -257,9 +259,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_sub_u32 s32, s32, 0x2000
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%cond = icmp eq i32 %arg.cond, 0
|
@ -2,7 +2,7 @@

; GCN-LABEL: {{^}}kernel_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -17,7 +17,7 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}kernel_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -32,7 +32,7 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}kernel_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}func_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -62,7 +62,7 @@ define void @func_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}func_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -77,7 +77,7 @@ define void @func_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}func_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -92,7 +92,7 @@ define void @func_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}cs_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -107,7 +107,7 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}cs_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -122,7 +122,7 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}cs_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -137,7 +137,7 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {

; GCN-LABEL: {{^}}ps_ieee_mode_default:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
@ -152,7 +152,7 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {

; GCN-LABEL: {{^}}ps_ieee_mode_on:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
@ -167,7 +167,7 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {

; GCN-LABEL: {{^}}ps_ieee_mode_off:
; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
; GCN-NOT: [[VAL0]]
; GCN-NOT: [[VAL1]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
|
@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
@ -245,7 +245,7 @@ bb3:
|
||||
|
||||
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
|
||||
; GCN: buffer_store_dword [[BB4_K]]
|
||||
; GCN-NEXT: s_endpgm
|
||||
; GCN: s_endpgm
|
||||
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
|
||||
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
|
||||
bb0:
|
||||
|
@ -11,12 +11,12 @@
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
|
||||
|
||||
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 glc{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
|
||||
; GCN-NOT: s32
|
||||
|
||||
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
|
||||
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16 glc{{$}}
|
||||
; GCN-NOT: s32
|
||||
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
|
||||
; GCN-NOT: s32
|
||||
|
@ -87,8 +87,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
|
||||
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
@ -110,9 +108,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
|
||||
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
|
||||
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
|
||||
|
||||
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
|
||||
; GCN-NEXT: s_endpgm
|
||||
|
@ -57,10 +57,10 @@ define void @callee_with_stack() #0 {
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
|
||||
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}}
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}}
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
|
||||
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @callee_with_stack_no_fp_elim_all() #1 {
|
||||
%alloca = alloca i32, addrspace(5)
|
||||
@ -346,10 +346,10 @@ define void @no_new_vgpr_for_fp_csr() #1 {
|
||||
; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
|
||||
; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33
|
||||
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000
|
||||
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @realign_stack_no_fp_elim() #1 {
|
||||
%alloca = alloca i32, align 8192, addrspace(5)
|
||||
@ -366,6 +366,7 @@ define void @realign_stack_no_fp_elim() #1 {
|
||||
; GCN: v_writelane_b32 v1, s31, 1
|
||||
; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
|
||||
; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN: ;;#ASMSTART
|
||||
; MUBUF: v_readlane_b32 s4, v1, 0
|
||||
; MUBUF-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -376,7 +377,6 @@ define void @realign_stack_no_fp_elim() #1 {
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 8
|
||||
; GCN-NEXT: v_readlane_b32 s33, v1, 2
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[4:5]
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[0:1]
|
||||
define void @no_unused_non_csr_sgpr_for_fp() #1 {
|
||||
|
@ -399,9 +399,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
||||
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
|
||||
; VARABI: s_waitcnt
|
||||
; VARABI-NEXT: s_setpc_b64
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
|
||||
; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
|
||||
@ -545,7 +543,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
||||
; VARABI-NEXT: s_waitcnt
|
||||
; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
|
||||
; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; VARABI: buffer_load_dword v0, off, s[0:3], s32 glc{{$}}
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
@ -554,7 +552,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
||||
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4{{$}}
|
||||
; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; FIXEDABI: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_x_byval(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
@ -704,10 +702,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
|
||||
; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
|
||||
; VARABI: s_waitcnt
|
||||
; VARABI-NEXT: s_setpc_b64
|
||||
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; VARABI: s_setpc_b64
|
||||
|
||||
|
||||
; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
|
||||
@ -717,7 +713,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
||||
; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
|
||||
; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
|
||||
; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
|
||||
|
||||
define void @too_many_args_use_workitem_id_xyz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
@ -810,9 +806,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
|
||||
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
; GCN: s_setpc_b64
|
||||
; GCN: ScratchSize: 0
|
||||
define void @too_many_args_use_workitem_id_x_stack_yz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
|
@ -29,6 +29,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_kern_stack:
|
||||
@ -39,6 +40,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_kern_stack:
|
||||
@ -51,6 +53,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
@ -119,6 +122,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
@ -134,6 +138,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
@ -151,6 +156,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -190,6 +196,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX803-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_force_fp_kern_stack:
|
||||
@ -201,6 +208,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_force_fp_kern_stack:
|
||||
@ -214,6 +222,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
%x = alloca i32, align 4, addrspace(5)
|
||||
@ -286,6 +295,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX803-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
@ -302,6 +312,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX900-NEXT: s_movk_i32 s32, 0x400
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
@ -320,6 +331,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
|
||||
; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -336,10 +348,10 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
|
||||
; GFX803-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX803-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x40000
|
||||
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
||||
; GFX803-NEXT: ;;#ASMSTART
|
||||
; GFX803-NEXT: ;;#ASMEND
|
||||
@ -347,6 +359,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_endpgm
|
||||
;
|
||||
; GFX900-LABEL: test_sgpr_offset_kernel:
|
||||
@ -355,9 +368,9 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||
; GFX900-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX900-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_mov_b32 s6, 0x40000
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_mov_b32 s6, 0x40000
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
; GFX900-NEXT: ;;#ASMSTART
|
||||
; GFX900-NEXT: ;;#ASMEND
|
||||
@ -365,6 +378,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1010-LABEL: test_sgpr_offset_kernel:
|
||||
@ -376,7 +390,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX1010-NEXT: s_add_u32 s0, s0, s7
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX1010-NEXT: s_mov_b32 s6, 0x20000
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
|
||||
; GFX1010-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
|
||||
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
|
||||
@ -386,6 +400,7 @@ define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
|
||||
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
|
||||
; GFX1010-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
||||
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1010-NEXT: s_endpgm
|
||||
entry:
|
||||
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
||||
|
@ -1,3 +1,4 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -codegenprepare -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
|
||||
@ -36,7 +37,8 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 2.0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28
|
||||
; GCN-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GCN-NEXT: global_load_dword v0, v[0:1], off glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: BB0_2: ; %endif
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
|
||||
|
@ -137,7 +137,7 @@ done:
|
||||
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
|
||||
; GCN: {{^}}BB4_2:
|
||||
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
@ -177,7 +177,7 @@ done:
|
||||
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
|
||||
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
|
||||
; GCN: {{^BB[0-9]+}}_2:
|
||||
|
||||
define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
@ -215,7 +215,7 @@ done:
|
||||
; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
|
||||
; GCN: s_and_saveexec_b64
|
||||
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
|
||||
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
|
||||
; GCN: {{^BB[0-9]+}}_2:
|
||||
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
|
||||
entry:
|
||||
|
@ -233,12 +233,15 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
|
||||
; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(1)
|
||||
@ -262,14 +265,17 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
|
||||
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
|
||||
@ -387,10 +393,11 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
|
||||
; GCN-LABEL: chain_hi_to_lo_global_other_dep:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
|
||||
; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
|
||||
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -409,10 +416,12 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
|
||||
; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2
|
||||
; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1]
|
||||
; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
|
||||
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -64,7 +64,7 @@ define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, fl
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_add_fabs_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -81,7 +81,7 @@ define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fneg_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -98,7 +98,7 @@ define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fabs_fneg_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -117,7 +117,7 @@ define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, f
|
||||
|
||||
; There's no reason to commute this.
|
||||
; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -135,7 +135,7 @@ define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %ou
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
|
||||
; SI: buffer_store_dword [[REG]]
|
||||
@ -157,7 +157,7 @@ define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)
|
||||
; though we have negate modifier on src2.
|
||||
|
||||
; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
|
||||
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
|
||||
; SI: buffer_store_dword [[RESULT]]
|
||||
|
@ -567,7 +567,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
|
||||
; SI-NEXT: s_mov_b32 s5, s7
|
||||
; SI-NEXT: s_mov_b32 s6, s2
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -584,7 +584,7 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
|
||||
; VI-NEXT: s_mov_b32 s5, s7
|
||||
; VI-NEXT: s_mov_b32 s6, s2
|
||||
; VI-NEXT: s_mov_b32 s7, s3
|
||||
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -612,9 +612,13 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)*
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: test_copy_v4i8_volatile_store:
|
||||
@ -635,9 +639,13 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)*
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
|
||||
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
|
||||
|
@ -50,6 +50,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
|
||||
; CI-NEXT: s_mov_b32 s1, s0
|
||||
; CI-NEXT: ds_write_b32 v0, v2 offset:12
|
||||
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
|
||||
@ -66,6 +67,7 @@ define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.v
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
entry:
|
||||
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -237,6 +239,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
|
||||
; CI-NEXT: s_mov_b32 s1, s0
|
||||
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
|
||||
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
|
||||
@ -254,6 +257,7 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%neg = sub i32 0, %x.i
|
||||
|
@ -50,10 +50,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -62,8 +63,9 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -92,12 +94,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
|
||||
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
|
||||
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: ds_write_b32 v0, v1 offset:32
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -106,11 +108,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -138,12 +140,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
|
||||
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
|
||||
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: ds_write_b32 v0, v2
|
||||
; CI-NEXT: ds_write_b32 v0, v1 offset:32
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -152,11 +154,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ds_write_b32 v0, v1
|
||||
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
|
||||
; GFX9-NEXT: s_endpgm
|
||||
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -186,11 +188,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
|
||||
; CI-NEXT: s_mov_b32 s2, 0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, 0
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
|
||||
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
|
||||
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CI-NEXT: s_mov_b32 m0, -1
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
@ -200,9 +203,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
@ -308,10 +311,11 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
@ -320,8 +324,9 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
@ -730,10 +735,11 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
@ -742,8 +748,9 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
@ -27,11 +27,11 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 glc{{$}}

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@ -56,10 +56,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
@ -84,10 +84,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8

; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
@ -88,11 +88,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: v_mov_b32_e32 v2, 9
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: BB1_2: ; %bb1
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 10
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_co_br_user:
@ -113,11 +115,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX9-NEXT: v_mov_b32_e32 v2, 9
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 10
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_co_br_user:
@ -137,11 +141,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: BB1_2: ; %bb1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 10
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
bb:
%i1 = add i32 %i, %i
@ -35,7 +35,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i16_zext_private:
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}}
define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
@ -106,10 +106,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
@ -130,10 +130,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
@ -155,11 +155,11 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@ -231,12 +231,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -267,12 +271,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -305,12 +313,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -344,12 +356,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
@ -201,8 +201,9 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
%val = load volatile i8, i8* %fptr.offset
@ -210,7 +211,9 @@ define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
%val = load volatile i8, i8* %fptr.offset
@ -218,11 +221,11 @@ define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
%val = load volatile i8, i8* %fptr.offset
File diff suppressed because it is too large
@ -16,9 +16,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -40,10 +40,10 @@ define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %ou
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -73,9 +73,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalia
|
||||
|
||||
; (fadd x, (fmul y, z)) -> (fma y, z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -97,9 +97,9 @@ define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %ou
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -121,10 +121,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -154,9 +154,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* n
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -178,10 +178,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -211,9 +211,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* n
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI: buffer_store_dwordx2 [[RESULT]]
|
||||
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
|
||||
@ -237,10 +237,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalia
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -272,10 +272,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
|
||||
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
|
||||
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
@ -308,11 +308,11 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
|
||||
|
||||
; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
|
||||
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
|
||||
@ -349,11 +349,11 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace
|
||||
; -> (fma (fneg y), z, (fma (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 glc{{$}}
|
||||
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32 glc{{$}}
|
||||
|
||||
; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
|
||||
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
|
||||
|
@ -4,7 +4,7 @@
|
||||
declare double @llvm.maxnum.f64(double, double) nounwind readnone
|
||||
|
||||
; SI-LABEL: {{^}}test_fmax3_f64:
|
||||
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
|
||||
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
|
||||
; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
|
||||
; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
|
||||
; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
|
||||
|
@ -37,7 +37,9 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
|
||||
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -65,7 +67,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -168,7 +172,9 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
|
||||
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
|
||||
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -196,7 +202,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
|
||||
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -308,7 +316,9 @@ define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -806,7 +816,9 @@ define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float
|
||||
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1042,7 +1054,9 @@ define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float
|
||||
; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
|
||||
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1111,7 +1125,9 @@ define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrsp
|
||||
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1142,7 +1158,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1349,7 +1367,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)*
|
||||
|
||||
; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1434,7 +1454,9 @@ define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4
|
||||
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1859,7 +1881,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
|
||||
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -1881,7 +1905,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
|
||||
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
|
||||
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2316,7 +2342,9 @@ define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)
|
||||
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
|
||||
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2348,7 +2376,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %o
|
||||
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2379,7 +2409,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %o
|
||||
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2418,7 +2450,9 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
|
||||
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
|
||||
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -2453,7 +2487,9 @@ define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(
|
||||
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
|
||||
|
||||
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
@ -107,8 +107,8 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
|
||||
|
||||
; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
|
||||
; GCN: s_waitcnt
|
||||
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}}
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}}
|
||||
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
|
||||
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
|
||||
define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
|
||||
%val = load volatile i32, i32 addrspace(5)* %ptr
|
||||
ret void
|
||||
@ -162,11 +162,11 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
|
||||
; GCN: s_and_saveexec_b64
|
||||
|
||||
; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
|
||||
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
|
||||
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
|
||||
; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
|
||||
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
|
||||
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4{{$}}
|
||||
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
|
||||
|
||||
; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
|
||||
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
|
||||
|
@ -537,10 +537,10 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
|
||||
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}}
|
||||
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}}
|
||||
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}}
|
||||
|
||||
; GCN: ds_write_b32 v0, v0
|
||||
; GCN: s_setpc_b64
|
||||
|
@ -159,7 +159,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -168,7 +169,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -190,7 +190,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -199,7 +200,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -225,7 +225,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -234,7 +235,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -256,7 +256,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -265,7 +266,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
@ -350,7 +350,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -378,7 +379,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -410,7 +412,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -438,7 +441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -529,7 +533,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -557,7 +562,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -589,7 +595,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
|
||||
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
|
||||
; GFX9-NEXT: s_mov_b32 s33, s32
|
||||
@ -617,7 +624,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
|
||||
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s4
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
|
||||
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
|
||||
; GFX10-NEXT: s_mov_b32 s33, s32
|
||||
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
|
||||
@ -2877,6 +2885,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX9-NEXT: v_mov_b32_e32 v41, v1
|
||||
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX9-NEXT: global_store_dword v[40:41], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v42, 0
|
||||
@ -2912,6 +2921,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX10-NEXT: v_writelane_b32 v42, s31, 1
|
||||
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
|
||||
; GFX10-NEXT: global_store_dword v[40:41], v0, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
|
||||
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
|
||||
@ -2924,7 +2934,6 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
|
||||
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
||||
; GFX10-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_setpc_b64 s[4:5]
|
||||
%val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
|
||||
store volatile i32 %val, i32 addrspace(1)* %out
|
||||
@ -3105,7 +3114,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
|
||||
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_byte v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
@ -3145,13 +3156,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[4:5]
%in.val = alloca { i8, i32 }, align 4, addrspace(5)
%out.val = alloca { i8, i32 }, align 4, addrspace(5)
@ -19,7 +19,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask0:
|
||||
@ -37,7 +39,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
|
||||
@ -55,7 +59,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
@ -81,7 +87,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_f16_tfe_dmask1:
|
||||
@ -99,7 +107,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
|
||||
@ -117,7 +127,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { half, i32 } %v, 0
|
||||
@ -143,7 +155,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
@ -161,7 +175,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
|
||||
@ -179,7 +195,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -205,7 +223,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
@ -223,7 +243,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
|
||||
@ -241,7 +263,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -267,7 +291,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
@ -285,7 +311,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
|
||||
@ -306,7 +334,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <2 x half>, i32 } %v, 0
|
||||
@ -333,8 +363,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v[0:1], v2, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v3f16_tfe_dmask7:
|
||||
@ -353,8 +386,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_short v[0:1], v2, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
|
||||
@ -375,9 +411,12 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
|
||||
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <3 x half>, i32 } %v, 0
|
||||
@ -404,7 +443,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
@ -423,7 +464,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
|
||||
@ -448,7 +491,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
|
||||
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-UNPACKED-NEXT: s_endpgm
|
||||
%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.data = extractvalue { <4 x half>, i32 } %v, 0
|
||||
|
@ -14,6 +14,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; VI-LABEL: store_inline_imm_neg_0.0_i16:
@ -24,6 +25,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; SI-LABEL: store_inline_imm_neg_0.0_i16:
@ -34,6 +36,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; SI-NEXT: v_mov_b32_e32 v0, 0x8000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
store volatile i16 -32768, i16 addrspace(1)* %out
ret void
@ -14,6 +14,7 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_branch BB0_1
; IR-LABEL: @infinite_loop(
; IR-NEXT: entry:
@ -45,6 +46,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB1_2
; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
@ -87,6 +89,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB2_2
; SI-NEXT: ; %bb.3: ; %Flow
@ -105,6 +108,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: BB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB2_6
; SI-NEXT: BB2_7: ; %DummyReturnBlock
@ -156,6 +160,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execnz BB3_3
; SI-NEXT: ; %bb.4: ; %loop.exit.guard
@ -245,6 +245,7 @@ entry:
; FIXME: Should probably be masking high bits of load.
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: buffer_load_ubyte v0
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: buffer_load_ubyte v1
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: ASMSTART
@ -1577,7 +1577,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
|
||||
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
||||
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: global_load_dword v2, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
|
||||
@ -1585,7 +1586,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0xffff
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
|
||||
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
|
||||
@ -1604,7 +1604,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s2, 0xffff
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -1614,7 +1615,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; VI-NEXT: s_lshl_b32 s0, s1, 16
|
||||
; VI-NEXT: s_or_b32 s0, s1, s0
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
|
||||
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1632,7 +1632,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
|
||||
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; CI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; CI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
||||
; CI-NEXT: s_mov_b32 s3, 0
|
||||
; CI-NEXT: s_mov_b32 s2, 0xffff
|
||||
@ -1642,7 +1643,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
|
||||
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; CI-NEXT: s_or_b32 s0, s4, s1
|
||||
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; CI-NEXT: s_waitcnt vmcnt(1)
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
|
||||
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -101,10 +101,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -121,12 +122,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -137,8 +139,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -165,9 +168,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -180,11 +183,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
|
||||
; VI-NEXT: flat_store_dword v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -194,7 +197,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -219,9 +222,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -234,11 +237,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
|
||||
; VI-NEXT: flat_store_dword v[2:3], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -248,7 +251,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
@ -275,10 +278,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -295,12 +299,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -311,8 +316,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -342,10 +348,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -362,12 +369,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -378,8 +386,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -409,10 +418,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -429,12 +439,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -445,8 +456,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
@ -477,10 +489,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -497,12 +510,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
|
||||
; VI-NEXT: flat_store_dword v[4:5], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -513,8 +527,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
@ -114,9 +114,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
}
; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
@ -22,13 +22,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: s_mov_b32 s10, s2
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
|
||||
@ -50,13 +50,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: s_mov_b32 s10, s2
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
@ -75,13 +75,13 @@ define amdgpu_kernel void @maxnum_f16(
|
||||
; GFX9-NEXT: s_mov_b32 s15, s3
|
||||
; GFX9-NEXT: s_mov_b32 s10, s2
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
|
||||
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
|
@ -22,13 +22,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: s_mov_b32 s10, s2
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
|
||||
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
|
||||
@ -50,13 +50,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: s_mov_b32 s10, s2
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; VI-NEXT: v_min_f16_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
@ -75,13 +75,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
|
||||
; GFX9-NEXT: s_mov_b32 s15, s3
|
||||
; GFX9-NEXT: s_mov_b32 s10, s2
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
|
||||
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
|
||||
; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
|
||||
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
|
@ -536,13 +536,13 @@ entry:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUFF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}}
|
||||
; GFX900: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
|
||||
@ -554,15 +554,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
|
||||
@ -660,15 +660,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -681,15 +681,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -702,15 +702,15 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-NEXT: s_setpc_b64
|
||||
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
|
||||
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
|
||||
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
|
||||
entry:
|
||||
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
|
||||
@ -805,9 +805,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -824,9 +828,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -844,9 +852,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -997,8 +1009,8 @@ entry:
|
||||
; FIXME: Is there a cost to using the extload over not?
|
||||
; GCN-LABEL: {{^}}load_private_v2i16_split:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}}
|
||||
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32 glc{{$}}
|
||||
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}}
|
||||
; GFX900-NEXT: s_waitcnt
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
|
||||
|
@ -1333,7 +1333,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1342,9 +1342,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1353,9 +1353,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1365,7 +1365,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1382,7 +1382,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1391,9 +1391,9 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1402,9 +1402,9 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1414,7 +1414,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1431,7 +1431,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1440,9 +1440,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -1452,9 +1452,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1464,7 +1464,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1582,7 +1582,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1591,9 +1591,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1603,9 +1603,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1615,7 +1615,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1633,7 +1633,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1642,9 +1642,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1653,9 +1653,9 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1665,7 +1665,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1683,7 +1683,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GFX900-MUBUF: ; %bb.0: ; %entry
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1692,9 +1692,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -1705,9 +1705,9 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX803: ; %bb.0: ; %entry
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1717,7 +1717,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
|
||||
; GFX900-FLATSCR: ; %bb.0: ; %entry
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s0 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1914,7 +1914,8 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1925,9 +1926,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1938,9 +1940,10 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1951,7 +1954,8 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1975,7 +1979,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1986,9 +1991,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -1999,9 +2005,10 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2012,7 +2019,8 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2037,7 +2045,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2048,9 +2057,10 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2061,10 +2071,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2075,7 +2086,8 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2100,7 +2112,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2111,9 +2124,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -2125,9 +2139,10 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2138,7 +2153,8 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2164,7 +2180,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2175,9 +2192,10 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
||||
@ -2189,10 +2207,11 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095 glc
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -2203,7 +2222,8 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 glc
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -30,27 +30,32 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000
|
||||
; MUBUF-NEXT: s_mov_b32 s6, 0
|
||||
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: BB0_1: ; %loadstoreloop
|
||||
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1
|
||||
; MUBUF-NEXT: s_add_i32 s6, s6, 1
|
||||
; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120
|
||||
; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_cbranch_scc1 BB0_1
|
||||
; MUBUF-NEXT: ; %bb.2: ; %split
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
|
||||
; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
|
||||
; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
|
||||
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
|
||||
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: local_stack_offset_uses_sp:
|
||||
@ -62,25 +67,29 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
|
||||
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
|
||||
; FLATSCR-NEXT: s_mov_b32 s2, 0
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: BB0_1: ; %loadstoreloop
|
||||
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; FLATSCR-NEXT: s_add_u32 s3, 0x3000, s2
|
||||
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
|
||||
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
|
||||
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
|
||||
; FLATSCR-NEXT: ; %bb.2: ; %split
|
||||
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
|
||||
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
|
||||
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
|
||||
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
entry:
|
||||
%pin.low = alloca i32, align 8192, addrspace(5)
|
||||
@ -111,26 +120,30 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; MUBUF-NEXT: s_mov_b32 s4, 0
|
||||
; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000
|
||||
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: BB1_1: ; %loadstoreloop
|
||||
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3
|
||||
; MUBUF-NEXT: s_add_i32 s4, s4, 1
|
||||
; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
|
||||
; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_cbranch_scc1 BB1_1
|
||||
; MUBUF-NEXT: ; %bb.2: ; %split
|
||||
; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3
|
||||
; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen
|
||||
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4
|
||||
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 glc
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s5
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(1)
|
||||
; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
|
||||
; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
@ -146,6 +159,7 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
||||
; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v2, s33
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: BB1_1: ; %loadstoreloop
|
||||
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000
|
||||
@ -153,17 +167,19 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
|
||||
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
|
||||
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
|
||||
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
|
||||
; FLATSCR-NEXT: ; %bb.2: ; %split
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
|
||||
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s2
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
|
||||
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
|
||||
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
||||
|
@ -17,9 +17,9 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
|
||||
@ -52,10 +52,10 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
|
||||
@ -97,9 +97,9 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
|
||||
|
||||
; (fadd x, (fmul y, z)) -> (fma y, z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
|
||||
@ -128,9 +128,9 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
@ -158,10 +158,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
|
||||
@ -200,9 +200,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
@ -230,10 +230,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
|
||||
@ -272,9 +272,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
|
||||
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
|
||||
|
||||
@ -305,10 +305,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
|
||||
@ -349,10 +349,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
|
||||
|
||||
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
|
||||
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
|
||||
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
|
||||
@ -394,11 +394,11 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
|
||||
@ -436,11 +436,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
|
||||
; -> (fma (fneg y), z, (fma (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
|
||||
@ -478,11 +478,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
|
||||
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
|
||||
@ -529,11 +529,11 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
|
||||
; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
|
||||
|
||||
; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16 glc{{$}}
|
||||
|
||||
; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
|
||||
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
|
||||
|
@ -120,8 +120,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
|
||||
; GCN: s_waitcnt
|
||||
; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}}
|
||||
; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}}
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
|
||||
%src0.ext = fpext half %src0 to float
|
||||
|
@ -41,7 +41,7 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
|
||||
; GCN-LABEL: {{^}}madak_2_use_f32:
|
||||
; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
|
||||
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]],
|
||||
|
@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||
declare float @llvm.fabs.f32(float) nounwind readnone
|
||||
|
||||
; GCN-LABEL: {{^}}madmk_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
|
||||
define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
@ -28,7 +28,7 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}madmk_2_use_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
@ -61,7 +61,7 @@ define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, flo
|
||||
|
||||
; We don't get any benefit if the constant is an inline immediate.
|
||||
; GCN-LABEL: {{^}}madmk_inline_imm_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
|
||||
define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
@ -128,7 +128,7 @@ define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]]
|
||||
@ -150,7 +150,7 @@ define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalia
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32:
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}|
|
||||
define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
|
||||
|
396
test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Normal file
@ -0,0 +1,396 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[2:3], v0
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
; SKIP-CACHE-INV-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load volatile i32, i32* %in, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
|
||||
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
|
||||
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
%val = load volatile i32, i32* %val.gep, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_store_0(
|
||||
; GFX7-LABEL: flat_nontemporal_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
store volatile i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_nontemporal_store_1(
|
||||
; GFX7-LABEL: flat_nontemporal_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_nontemporal_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
|
||||
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[1:2]
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
store volatile i32 %val, i32* %out.gep
ret void
}

define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
|
||||
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1]
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32* %in, i32* %out) {
entry:
%val = load atomic volatile i32, i32* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32* %out
ret void
}

define amdgpu_kernel void @flat_volatile_workgroup_release_store(
|
||||
; GFX7-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: flat_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32* %out) {
entry:
store atomic volatile i32 %in, i32* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (new file, 458 lines)
@@ -0,0 +1,458 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @global_volatile_load_0(
|
||||
; GFX6-LABEL: global_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s4
|
||||
; GFX6-NEXT: s_mov_b32 s1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s4, s6
|
||||
; GFX6-NEXT: s_mov_b32 s5, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, s2
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(1)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_load_1(
|
||||
; GFX6-LABEL: global_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s6
|
||||
; GFX6-NEXT: s_mov_b32 s1, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, 0
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
|
||||
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX7-NEXT: flat_load_dword v2, v[2:3] glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(1)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_store_0(
|
||||
; GFX6-LABEL: global_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX6-NEXT: s_mov_b32 s4, s2
|
||||
; GFX6-NEXT: s_mov_b32 s5, s3
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_store_1(
|
||||
; GFX6-LABEL: global_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s6, 0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(1)* %out.gep
ret void
}

define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
|
||||
; GFX6-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s0, s4
|
||||
; GFX6-NEXT: s_mov_b32 s1, s5
|
||||
; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_mov_b32 s4, s6
|
||||
; GFX6-NEXT: s_mov_b32 s5, s7
|
||||
; GFX6-NEXT: s_mov_b32 s6, s2
|
||||
; GFX6-NEXT: s_mov_b32 s7, s3
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @global_volatile_workgroup_release_store(
|
||||
; GFX6-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: global_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll (new file, 440 lines)
@@ -0,0 +1,440 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @local_volatile_load_0(
|
||||
; GFX6-LABEL: local_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(3)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX6-LABEL: local_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(3)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX6-LABEL: local_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(3)* %out
ret void
}

define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX6-LABEL: local_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(3)* %out.gep
ret void
}

define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
; GFX6-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: ds_read_b32 v0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: ds_write_b32 v1, v0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: ds_read_b32 v0, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: ds_write_b32 v1, v0
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: buffer_gl0_inv
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-CU-NEXT: ds_read_b32 v0, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: ds_write_b32 v1, v0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
store i32 %val, i32 addrspace(3)* %out
ret void
}

define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX6-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: ds_write_b32 v0, v1
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: ds_write_b32 v0, v1
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (new file, 404 lines)
@@ -0,0 +1,404 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX6-LABEL: private_volatile_load_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s10, -1
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX6-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_load_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(5)* %in, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX6-LABEL: private_volatile_load_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_mov_b32 s10, -1
|
||||
; GFX6-NEXT: s_mov_b32 s11, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_load_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[8:9]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s8, s8, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s9, s9, 0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
|
||||
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
%val = load volatile i32, i32 addrspace(5)* %val.gep, align 4
store i32 %val, i32 addrspace(1)* %out
ret void
}

define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX6-LABEL: private_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s4, s4, s3
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
store volatile i32 %val, i32 addrspace(5)* %out
ret void
}

define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX6-LABEL: private_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
|
||||
; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
|
||||
; GFX6-NEXT: s_mov_b32 s6, -1
|
||||
; GFX6-NEXT: s_mov_b32 s7, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s4, s4, s3
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX6-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: private_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
||||
; GFX7-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX7-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: private_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-WGP-NEXT: s_clause 0x1
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-CU-LABEL: private_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
|
||||
; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
|
||||
; GFX10-CU-NEXT: s_clause 0x1
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
||||
; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
|
||||
; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
|
||||
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: private_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s4, s3
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s5, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_endpgm
|
||||
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(1)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
store volatile i32 %val, i32 addrspace(5)* %out.gep
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
@ -154,8 +154,10 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v3, v0, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v1
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v3, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v2
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
|
||||
; GFX9-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
|
@ -234,8 +234,8 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
|
||||
; GCN-NEXT: ; %bb.8: ; %case0
|
||||
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
|
||||
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], 0
|
||||
; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
|
||||
; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
|
||||
; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
|
||||
|
@ -186,6 +186,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
|
||||
; GCN-NEXT: BB1_6: ; %bb31
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
; IR-LABEL: @nested_loop_conditions(
|
||||
; IR-NEXT: bb:
|
||||
|
@ -48,6 +48,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; MUBUF-NEXT: BB0_3: ; %bb.2
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
|
||||
@ -83,6 +84,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; FLATSCR-NEXT: BB0_3: ; %bb.2
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
|
||||
entry:
|
||||
@ -151,6 +153,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; MUBUF-NEXT: BB1_2: ; %bb.1
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_endpgm
|
||||
;
|
||||
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
|
||||
@ -181,6 +184,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
|
||||
; FLATSCR-NEXT: BB1_2: ; %bb.1
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_endpgm
|
||||
entry:
|
||||
%cond = icmp eq i32 %arg.cond, 0
|
||||
@ -243,9 +247,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
||||
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s7
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
|
||||
@ -280,9 +284,9 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
|
||||
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 16
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s5
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
||||
|
||||
entry:
|
||||
@ -342,9 +346,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
||||
; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
|
||||
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000
|
||||
; MUBUF-NEXT: s_mov_b32 s33, s7
|
||||
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
|
||||
@ -374,9 +378,9 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
|
||||
; FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80
|
||||
; FLATSCR-NEXT: s_mov_b32 s33, s3
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%cond = icmp eq i32 %arg.cond, 0
|
||||
|
@ -34,7 +34,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 {
|
||||
; GCN-LABEL: {{^}}load_from_undef:
|
||||
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
|
||||
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}}
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen glc{{$}}
|
||||
define amdgpu_kernel void @load_from_undef() #0 {
|
||||
%ld = load volatile i32, i32 addrspace(5)* undef
|
||||
ret void
|
||||
@ -43,7 +43,7 @@ define amdgpu_kernel void @load_from_undef() #0 {
|
||||
; GCN-LABEL: {{^}}load_from_inttoptr:
|
||||
; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
|
||||
; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}}
|
||||
; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124 glc{{$}}
|
||||
define amdgpu_kernel void @load_from_inttoptr() #0 {
|
||||
%ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*)
|
||||
ret void
|
||||
|
@ -44,7 +44,7 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
|
||||
; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
|
||||
|
||||
; SI-LABEL: @rsqrt_fmul
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
|
||||
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
|
||||
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
|
||||
|
||||
|
@ -24,19 +24,19 @@ define amdgpu_kernel void @select_f16(
|
||||
; SI-NEXT: s_mov_b32 s11, s3
|
||||
; SI-NEXT: s_mov_b32 s14, s2
|
||||
; SI-NEXT: s_mov_b32 s15, s3
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s0, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_waitcnt vmcnt(3)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
@ -65,15 +65,17 @@ define amdgpu_kernel void @select_f16(
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: s_mov_b32 s14, s2
|
||||
; VI-NEXT: s_mov_b32 s15, s3
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -112,16 +114,16 @@ define amdgpu_kernel void @select_f16_imm_a(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, 0.5, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
@ -147,14 +149,15 @@ define amdgpu_kernel void @select_f16_imm_a(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -191,16 +194,16 @@ define amdgpu_kernel void @select_f16_imm_b(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0.5, v0
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
@ -226,14 +229,15 @@ define amdgpu_kernel void @select_f16_imm_b(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(2)
|
||||
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -270,16 +274,16 @@ define amdgpu_kernel void @select_f16_imm_c(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
||||
@ -305,15 +309,16 @@ define amdgpu_kernel void @select_f16_imm_c(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -350,16 +355,16 @@ define amdgpu_kernel void @select_f16_imm_d(
|
||||
; SI-NEXT: s_mov_b32 s19, s11
|
||||
; SI-NEXT: s_mov_b32 s6, s10
|
||||
; SI-NEXT: s_mov_b32 s7, s11
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_waitcnt vmcnt(2)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
|
||||
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
|
||||
@ -385,15 +390,16 @@ define amdgpu_kernel void @select_f16_imm_d(
|
||||
; VI-NEXT: s_mov_b32 s19, s11
|
||||
; VI-NEXT: s_mov_b32 s6, s10
|
||||
; VI-NEXT: s_mov_b32 s7, s11
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
|
||||
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
|
||||
; VI-NEXT: s_mov_b32 s8, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
@ -294,11 +294,12 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 a
|
||||
; GCN-NEXT: s_mov_b32 s14, 0
|
||||
; GCN-NEXT: s_mov_b32 s15, s3
|
||||
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
|
||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0
|
||||
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
|
||||
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
|
||||
@ -1640,6 +1641,7 @@ define amdgpu_kernel void @test_mul2(i32 %p) {
|
||||
; GCN-NEXT: s_lshl_b32 s0, s0, 1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: test_mul2:
|
||||
|
@ -82,15 +82,17 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -101,17 +103,19 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v3, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v3, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dword v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_store_dword v[0:1], v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -119,14 +123,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
|
||||
@ -134,15 +140,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
@ -804,15 +811,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; SI-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2
|
||||
; SI-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3
|
||||
; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -823,17 +832,19 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_ushort v3, v[0:1]
|
||||
; VI-NEXT: flat_load_ushort v4, v[0:1]
|
||||
; VI-NEXT: flat_load_ushort v3, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_ushort v4, v[0:1] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
|
||||
; VI-NEXT: flat_store_short v[0:1], v2
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_store_short v[0:1], v3
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -841,14 +852,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
|
||||
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
|
||||
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
|
||||
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_store_short v0, v2, s[0:1]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
|
||||
@ -856,15 +869,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
|
||||
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
|
||||
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
|
||||
; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: global_store_short v0, v2, s[0:1]
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
@ -212,6 +212,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
|
||||
; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
; SI-NEXT: s_mov_b64 s[12:13], 0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_branch BB3_1
|
||||
; SI-NEXT: BB3_8: ; %loop.exit.guard4
|
||||
; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
@ -284,6 +285,7 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
|
||||
; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
; FLAT-NEXT: s_mov_b64 s[12:13], 0
|
||||
; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; FLAT-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLAT-NEXT: s_branch BB3_1
|
||||
; FLAT-NEXT: BB3_8: ; %loop.exit.guard4
|
||||
; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
|
||||
|
@ -351,15 +351,16 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
|
||||
; SI-NEXT: s_sext_i32_i8 s0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_sext_v4i8_to_v4i32:
|
||||
@ -376,11 +377,15 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%cast = bitcast i32 %a to <4 x i8>
|
||||
%ext = sext <4 x i8> %cast to <4 x i32>
|
||||
@ -417,9 +422,13 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
|
||||
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_sext_v4i8_to_v4i32:
|
||||
@ -442,9 +451,13 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
||||
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%a = load i32, i32 addrspace(1)* %in
|
||||
%cast = bitcast i32 %a to <4 x i8>
|
||||
@ -475,16 +488,17 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
|
||||
; SI-NEXT: s_sext_i32_i16 s6, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: s_sext_i32_i16 s7, s7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_sext_v4i16_to_v4i32:
|
||||
@ -500,13 +514,17 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; VI-NEXT: s_ashr_i32 s4, s7, 16
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; VI-NEXT: s_sext_i32_i16 s7, s7
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%cast = bitcast i64 %a to <4 x i16>
|
||||
%ext = sext <4 x i16> %cast to <4 x i32>
|
||||
@ -541,9 +559,13 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
|
||||
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_sext_v4i16_to_v4i32:
|
||||
@ -565,9 +587,13 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
|
||||
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
%a = load i64, i64 addrspace(1)* %in
|
||||
%cast = bitcast i64 %a to <4 x i16>
|
||||
|
@ -13,6 +13,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -59,6 +60,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
@ -112,6 +114,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -158,6 +161,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
@ -211,6 +215,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 9
|
||||
; VI-NEXT: s_mov_b32 flat_scratch_lo, s5
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: s_endpgm
|
||||
; VI-NEXT: .section .rodata,#alloc
|
||||
; VI-NEXT: .p2align 6
|
||||
@ -257,6 +262,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 9
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_endpgm
|
||||
; GFX9-NEXT: .section .rodata,#alloc
|
||||
; GFX9-NEXT: .p2align 6
|
||||
|
@ -165,6 +165,7 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
|
||||
; GCN: s_mov_b32 s34, s32
|
||||
; GCN: v_mov_b32_e32 v32, 0
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN-NEXT: s_add_u32 s32, s32, 0x30000
|
||||
|
||||
@ -237,10 +238,10 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
|
||||
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
|
||||
; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
; GCN: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN: s_setpc_b64 s[30:31]
|
||||
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
; Use all clobberable registers, so BP has to spill to a VGPR.
@ -646,9 +646,13 @@ entry:
|
||||
; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
@ -665,9 +669,12 @@ entry:
|
||||
; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
|
||||
; GCN: s_waitcnt
|
||||
; GFX900-MUBUF: buffer_store_dword
|
||||
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095
|
||||
; GFX900-FLATSCR: scratch_store_dword
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095
|
||||
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
|
||||
entry:
|
||||
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
||||
|
@ -13,10 +13,11 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[8:9]
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -33,11 +34,12 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -163,11 +165,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -180,11 +182,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -208,11 +210,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -225,11 +227,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -252,10 +254,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -268,11 +270,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0

@ -295,10 +297,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -311,10 +313,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1

@ -338,11 +340,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 1.0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm

@ -357,11 +359,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0

@ -383,11 +385,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0

@ -406,11 +409,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v1, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v1, v2
; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0

@ -437,11 +441,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2

@ -460,13 +465,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v4, v2
; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0

@ -491,11 +497,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16

@ -514,11 +521,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16

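Every hunk in the block above shows the same pattern: the volatile loads in these tests now carry glc (global_load_dword ... glc on GFX9, flat_load_dword ... glc on VI), and each one is followed by its own s_waitcnt vmcnt(0) instead of a single shared wait. A minimal sketch of the kind of input that exercises this (hypothetical IR, not one of the tests touched by this patch):

define amdgpu_kernel void @volatile_global_load_sketch(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
  ; The volatile load is expected to be selected as a glc load followed by
  ; s_waitcnt vmcnt(0), matching the GFX9/VI checks above; the plain store
  ; is left unchanged.
  %v = load volatile i32, i32 addrspace(1)* %in, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}
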
@ -44,11 +44,12 @@ define amdgpu_ps float @valley_partially_undef_copy() #0 {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s3, 0xf000
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; CHECK-NEXT: BB1_1: ; %bb9

@ -102,6 +103,7 @@ define amdgpu_kernel void @partially_undef_copy() #0 {
; CHECK-NEXT: s_mov_b32 s2, -1
; CHECK-NEXT: v_mov_b32_e32 v0, v6
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_nop
; CHECK-NEXT: ;;#ASMEND

@ -42,12 +42,16 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(
; GCN-LABEL: {{^}}test_use_s_v_s:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SA:[0-9]+]]:[[SB:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: buffer_load_dword [[VA0:v[0-9]+]]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
; SI-NEXT: s_waitcnt vmcnt(0)

; GCN-NOT: v_mov_b32

; VI: buffer_load_dword [[VA0:v[0-9]+]]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]]
; VI-NEXT: s_waitcnt vmcnt(0)

; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]]

@ -3,7 +3,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}mac_vvv:
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4
; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]]

@ -82,19 +82,19 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT: v_mac_f32_e32 v3, v0, v2

@ -121,17 +121,18 @@ define amdgpu_kernel void @madak_f16_use_2(
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mac_f16_e32 v3, v0, v2
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: buffer_store_short v3, off, s[8:11], 0

@ -114,9 +114,9 @@ define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)*
; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GCN-NEXT: v_mov_b32_e32 v5, v2
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm