mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[NVPTX] Restructure shfl intrinsics and add variants that return a predicate.
Also, amend constraints for non-sync variants that are no longer available on sm_70+ with PTX6.4+. Differential Revision: https://reviews.llvm.org/D68892 llvm-svn: 374790
This commit is contained in:
parent
eba5934469
commit
463904dd22
@ -276,6 +276,26 @@ class NVVM_MMA_SUPPORTED<list<WMMA_REGS> frags, string layout_a, string layout_b
|
||||
);
|
||||
}
|
||||
|
||||
// Helper record that derives the names and type lists for one shfl intrinsic
// variant from <sync, mode, type, return_pred>.
//   sync        - 1 for the shfl.sync form, 0 for the plain form.
//   mode        - shuffle mode string, spliced verbatim into the names.
//   type        - "i32" or "f32"; selects the shuffled value type.
//   return_pred - 1 if the variant also returns an i1 predicate.
class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
|
||||
// Suffix has the shape "[sync_]<mode>_<type>[p]"; the trailing "p" marks the
// predicate-returning variants.
string Suffix = !if(sync, "sync_", "")
|
||||
# mode # "_"
|
||||
# type
|
||||
# !if(return_pred, "p", "");
|
||||
|
||||
// TableGen record name, builtin name and IR-level intrinsic name all share
// Suffix; the IR name uses '.' wherever Suffix has '_'.
string Name = "int_nvvm_shfl_" # Suffix;
|
||||
string Builtin = "__nvvm_shfl_" # Suffix;
|
||||
string IntrName = "llvm.nvvm.shfl." # !subst("_",".", Suffix);
|
||||
// Exactly one of the two lists below holds a single element and the other is
// empty. Iterating `foreach _ = ...` over each therefore behaves like an
// if/else: variants without a predicate result get a GCCBuiltin, variants
// with one do not.
list<int> withGccBuiltin = !if(return_pred, [], [1]);
|
||||
list<int> withoutGccBuiltin = !if(return_pred, [1], []);
|
||||
// LLVM type of the shuffled value. Only "i32" and "f32" have a !cond arm.
LLVMType OpType = !cond(
|
||||
!eq(type,"i32"): llvm_i32_ty,
|
||||
!eq(type,"f32"): llvm_float_ty);
|
||||
// Result list: the shuffled value, plus an i1 when return_pred is set.
list<LLVMType> RetTy = !if(return_pred, [OpType, llvm_i1_ty], [OpType]);
|
||||
// Argument list: the value followed by two i32 operands. Sync variants take
// one extra leading i32 (the thread mask, going by the operand naming used
// for shfl.sync elsewhere in this change).
list<LLVMType> ArgsTy = !if(sync,
|
||||
[llvm_i32_ty, OpType, llvm_i32_ty, llvm_i32_ty],
|
||||
[OpType, llvm_i32_ty, llvm_i32_ty]);
|
||||
}
|
||||
|
||||
let TargetPrefix = "nvvm" in {
|
||||
def int_nvvm_prmt : GCCBuiltin<"__nvvm_prmt">,
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
@ -3955,90 +3975,27 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
|
||||
//
|
||||
// SHUFFLE
|
||||
//
|
||||
|
||||
// shfl.down.b32 dest, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_down_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_down_i32">;
|
||||
def int_nvvm_shfl_down_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_down_f32">;
|
||||
|
||||
// shfl.up.b32 dest, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_up_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_up_i32">;
|
||||
def int_nvvm_shfl_up_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_up_f32">;
|
||||
|
||||
// shfl.bfly.b32 dest, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_bfly_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_bfly_i32">;
|
||||
def int_nvvm_shfl_bfly_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_bfly_f32">;
|
||||
|
||||
// shfl.idx.b32 dest, val, lane, mask_and_clamp
|
||||
def int_nvvm_shfl_idx_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_idx_i32">;
|
||||
def int_nvvm_shfl_idx_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_idx_f32">;
|
||||
|
||||
// Synchronizing shfl variants available in CUDA-9.
|
||||
// On sm_70 these don't have to be convergent, so we may eventually want to
|
||||
// implement non-convergent variant of this intrinsic.
|
||||
|
||||
// shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp
|
||||
def int_nvvm_shfl_sync_down_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
|
||||
def int_nvvm_shfl_sync_down_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
|
||||
|
||||
// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_up_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
|
||||
def int_nvvm_shfl_sync_up_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
|
||||
|
||||
// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_bfly_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
|
||||
def int_nvvm_shfl_sync_bfly_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
|
||||
|
||||
// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_idx_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
|
||||
def int_nvvm_shfl_sync_idx_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
|
||||
// Generate intrinsics for all variants of shfl instruction.
// The nested loops enumerate 2 (sync) x 4 (mode) x 2 (type) x 2 (return_pred)
// = 32 combinations and emit one intrinsic def for each.
|
||||
foreach sync = [0, 1] in {
|
||||
foreach mode = ["up", "down", "bfly", "idx"] in {
|
||||
foreach type = ["i32", "f32"] in {
|
||||
foreach return_pred = [0, 1] in {
|
||||
// Single-element foreach: binds `i` to the SHFL_INFO record for this
// combination so its fields can be referenced below.
foreach i = [SHFL_INFO<sync, mode, type, return_pred>] in {
|
||||
// Runs once when the variant has no predicate result, zero times otherwise:
// define the intrinsic together with its GCCBuiltin name.
foreach _ = i.withGccBuiltin in {
|
||||
def i.Name : GCCBuiltin<i.Builtin>,
|
||||
Intrinsic<i.RetTy, i.ArgsTy,
|
||||
[IntrInaccessibleMemOnly, IntrConvergent],
|
||||
i.IntrName>;
|
||||
}
|
||||
// Runs once for predicate-returning variants: same intrinsic definition, but
// without a GCCBuiltin.
foreach _ = i.withoutGccBuiltin in {
|
||||
def i.Name : Intrinsic<i.RetTy, i.ArgsTy,
|
||||
[IntrInaccessibleMemOnly, IntrConvergent], i.IntrName>;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// VOTE
|
||||
|
@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
|
||||
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
|
||||
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
|
||||
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
|
||||
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
|
||||
|
||||
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
|
||||
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
|
||||
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
|
||||
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
|
||||
|
||||
// Non-sync shfl instructions are not available on sm_70+ in PTX6.4+.
// hasSHFL is therefore true unless BOTH conditions hold: SM version >= 70 and
// PTX version >= 64. The C++ condition text is split across two adjacent
// string literals.
|
||||
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
|
||||
"&& Subtarget->getPTXVersion() >= 64)">;
|
||||
|
||||
def useShortPtr : Predicate<"useShortPointers()">;
|
||||
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
|
||||
|
||||
|
@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> {
|
||||
[]);
|
||||
}
|
||||
|
||||
// Yields the list of values to iterate over for the "threadmask is an
// immediate" flag when generating shfl instructions: both 0 and 1 for sync
// variants, only 0 for non-sync variants (which have no threadmask operand,
// so a single iteration suffices).
class THREADMASK_INFO<bit sync> {
|
||||
list<bit> ret = !if(sync, [0,1], [0]);
|
||||
}
|
||||
|
||||
//-----------------------------------
|
||||
// Synchronization and shuffle functions
|
||||
//-----------------------------------
|
||||
@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
|
||||
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
|
||||
Requires<[hasPTX60, hasSM30]>;
|
||||
|
||||
|
||||
// shfl.{up,down,bfly,idx}.b32
|
||||
multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
|
||||
// The last two parameters to shfl can be regs or imms. ptxas is smart
|
||||
// enough to inline constant registers, so strictly speaking we don't need to
|
||||
// handle immediates here. But it's easy enough, and it makes our ptx more
|
||||
// readable.
|
||||
def reg : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
|
||||
[(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def imm1 : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
|
||||
[(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def imm2 : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
|
||||
[(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
|
||||
|
||||
def imm3 : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins regclass:$src, i32imm:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
|
||||
[(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
|
||||
// One shfl / shfl.sync machine instruction. The operand lists, assembly
// string and selection pattern are all computed from the template arguments:
//   sync           - 1 for shfl.sync (adds a threadmask input), 0 otherwise.
//   mode           - shuffle mode string, spliced into the mnemonic and the
//                    intrinsic name.
//   reg            - "i32" or "f32"; selects the register class of $src/$dst.
//   return_pred    - 1 to add an Int1Regs $pred output.
//   offset_imm, mask_imm, threadmask_imm
//                  - 1 to make that operand an i32imm, 0 for Int32Regs.
// The base NVPTXInst is instantiated with empty placeholders that are
// overridden by the `let` statements below.
class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
|
||||
bit offset_imm, bit mask_imm, bit threadmask_imm>
|
||||
: NVPTXInst<(outs), (ins), "?", []> {
|
||||
// Register class for the shuffled value.
NVPTXRegClass rc = !cond(
|
||||
!eq(reg, "i32"): Int32Regs,
|
||||
!eq(reg, "f32"): Float32Regs);
|
||||
// Name of the intrinsic record to match. It is rebuilt here from the same
// pieces as the intrinsic definitions ("int_nvvm_shfl_[sync_]<mode>_<reg>[p]")
// and must stay in step with them, because it is resolved by name via !cast.
string IntrName = "int_nvvm_shfl_"
|
||||
# !if(sync, "sync_", "")
|
||||
# mode
|
||||
# "_" # reg
|
||||
# !if(return_pred, "p", "");
|
||||
Intrinsic Intr = !cast<Intrinsic>(IntrName);
|
||||
// Inputs, in intrinsic argument order: [threadmask,] src, offset, mask.
// Each of threadmask/offset/mask is either an immediate or an Int32Regs
// register depending on its *_imm flag; non-sync variants contribute an
// empty (ins) in place of threadmask.
let InOperandList = !con(
|
||||
!if(sync,
|
||||
!dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
|
||||
(ins)),
|
||||
(ins rc:$src),
|
||||
!dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
|
||||
!dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
|
||||
);
|
||||
// Outputs: the shuffled value, plus the predicate when requested.
let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
|
||||
// Assembly text: "shfl.[sync.]<mode>.b32\t$dst[|$pred], $src, $offset, $mask
// [, $threadmask];". Note that $threadmask is printed last even though it is
// the first input operand.
let AsmString = "shfl."
|
||||
# !if(sync, "sync.", "")
|
||||
# mode # ".b32\t"
|
||||
# "$dst"
|
||||
# !if(return_pred, "|$pred", "") # ", "
|
||||
# "$src, $offset, $mask"
|
||||
# !if(sync, ", $threadmask", "")
|
||||
# ";"
|
||||
;
|
||||
// Selection pattern derived from the operand lists rather than written out:
// the output list has its `outs` operator replaced by `set`, the input list
// has `ins` replaced by the intrinsic, and every i32imm is rewritten to `imm`
// so immediate operands appear as `imm` in the pattern. The two pieces are
// joined with !con into a single (set <outs...>, (Intr <ins...>)) dag.
let Pattern = [!con(
|
||||
!foreach(tmp, OutOperandList,
|
||||
!subst(outs, set,
|
||||
!subst(i32imm, imm, tmp))),
|
||||
(set !foreach(tmp, InOperandList,
|
||||
!subst(ins, Intr,
|
||||
!subst(i32imm, imm, tmp))))
|
||||
)];
|
||||
}
|
||||
|
||||
defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
|
||||
defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
|
||||
defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
|
||||
defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
|
||||
defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
|
||||
defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
|
||||
defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
|
||||
defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
|
||||
|
||||
multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
|
||||
// Threadmask and the last two parameters to shfl.sync can be regs or imms.
|
||||
// ptxas is smart enough to inline constant registers, so strictly speaking we
|
||||
// don't need to handle immediates here. But it's easy enough, and it makes
|
||||
// our ptx more readable.
|
||||
def rrr : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def rri : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, imm:$mask))]>;
|
||||
|
||||
def rir : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
imm:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def rii : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
imm:$offset, imm:$mask))]>;
|
||||
|
||||
def irr : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def iri : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, imm:$mask))]>;
|
||||
|
||||
def iir : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
imm:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def iii : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
imm:$offset, imm:$mask))]>;
|
||||
// Instantiate an anonymous SHFL_INSTR for every combination of
// sync x mode x register type x return_pred x offset_imm x mask_imm, and for
// sync variants additionally x threadmask_imm. That is 64 non-sync plus 128
// sync instructions.
foreach sync = [0, 1] in {
|
||||
foreach mode = ["up", "down", "bfly", "idx"] in {
|
||||
foreach regclass = ["i32", "f32"] in {
|
||||
foreach return_pred = [0, 1] in {
|
||||
foreach offset_imm = [0, 1] in {
|
||||
foreach mask_imm = [0, 1] in {
|
||||
// [0, 1] for sync variants, [0] for non-sync ones (no threadmask operand).
foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
|
||||
def : SHFL_INSTR<sync, mode, regclass, return_pred,
|
||||
offset_imm, mask_imm, threadmask_imm>,
|
||||
// All variants require hasSM30; non-sync variants are additionally gated on
// hasSHFL. NOTE(review): the sync variants are not gated on a PTX version
// here (e.g. hasPTX60) -- confirm against the PTX ISA whether shfl.sync
// needs one.
Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// On sm_70 these don't have to be convergent, so we may eventually want to
|
||||
// implement a non-convergent variant of these intrinsics.
|
||||
defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
|
||||
defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
|
||||
defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
|
||||
defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
|
||||
defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
|
||||
defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
|
||||
defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
|
||||
defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
|
||||
|
||||
|
||||
// vote.{all,any,uni,ballot}
|
||||
multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
|
||||
def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
|
||||
|
172
test/CodeGen/NVPTX/shfl-p.ll
Normal file
172
test/CodeGen/NVPTX/shfl-p.ll
Normal file
@ -0,0 +1,172 @@
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
|
||||
|
||||
declare {i32, i1} @llvm.nvvm.shfl.down.i32p(i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.down.f32p(float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.up.i32p(i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.up.f32p(float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.bfly.i32p(i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.bfly.f32p(float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.idx.i32p(i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.idx.f32p(float, i32, i32)
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.rrr
|
||||
define {i32, i1} @shfl.i32.rrr(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
;; NOTE(review): the non-sync intrinsic takes only (value, offset, mask) -- see
;; the three-argument declarations at the top of this file -- so there is no
;; threadmask operand to turn into an immediate. This case passes the same
;; all-register arguments as shfl.i32.rrr and expects the same output.
; CHECK-LABEL: .func{{.*}}shfl.i32.irr
|
||||
define {i32, i1} @shfl.i32.irr(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.rri
|
||||
define {i32, i1} @shfl.i32.rri(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 1)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.iri
|
||||
define {i32, i1} @shfl.i32.iri(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 2)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.rir
|
||||
define {i32, i1} @shfl.i32.rir(i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.iir
|
||||
define {i32, i1} @shfl.i32.iir(i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.i32.rii
|
||||
define {i32, i1} @shfl.i32.rii(i32 %a) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 2)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
;; Both the offset (2) and the mask (3) are immediates here and must be
;; printed as literals in the instruction.
;; NOTE(review): parameter %b is never used in the body. The predicate
;; register captured as OUTP is also not referenced by any later check.
; CHECK-LABEL: .func{{.*}}shfl.i32.iii
|
||||
define {i32, i1} @shfl.i32.iii(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 3)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
;; Same intrinsics, but for float
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.rrr
|
||||
define {float, i1} @shfl.f32.rrr(float %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
;; NOTE(review): the non-sync intrinsic takes only (value, offset, mask) -- see
;; the three-argument declarations at the top of this file -- so there is no
;; threadmask operand to turn into an immediate. This case passes the same
;; all-register arguments as shfl.f32.rrr and expects the same output.
; CHECK-LABEL: .func{{.*}}shfl.f32.irr
|
||||
define {float, i1} @shfl.f32.irr(float %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.rri
|
||||
define {float, i1} @shfl.f32.rri(float %a, i32 %b) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 1)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.iri
|
||||
define {float, i1} @shfl.f32.iri(float %a, i32 %b) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 2)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.rir
|
||||
define {float, i1} @shfl.f32.rir(float %a, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.iir
|
||||
define {float, i1} @shfl.f32.iir(float %a, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.f32.rii
|
||||
define {float, i1} @shfl.f32.rii(float %a) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 2)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
;; Both the offset (2) and the mask (3) are immediates here and must be
;; printed as literals in the instruction.
;; NOTE(review): parameter %b is never used in the body. The predicate
;; register captured as OUTP is also not referenced by any later check.
; CHECK-LABEL: .func{{.*}}shfl.f32.iii
|
||||
define {float, i1} @shfl.f32.iii(float %a, i32 %b) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 3)
|
||||
ret {float, i1} %val
|
||||
}
|
180
test/CodeGen/NVPTX/shfl-sync-p.ll
Normal file
180
test/CodeGen/NVPTX/shfl-sync-p.ll
Normal file
@ -0,0 +1,180 @@
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
|
||||
|
||||
declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)
|
||||
declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)
|
||||
declare {float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32)
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rrr
|
||||
define {i32, i1} @shfl.sync.i32.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.irr
|
||||
define {i32, i1} @shfl.sync.i32.irr(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rri
|
||||
define {i32, i1} @shfl.sync.i32.rri(i32 %mask, i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 1)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iri
|
||||
define {i32, i1} @shfl.sync.i32.iri(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rir
|
||||
define {i32, i1} @shfl.sync.i32.rir(i32 %mask, i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iir
|
||||
define {i32, i1} @shfl.sync.i32.iir(i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.rii
|
||||
define {i32, i1} @shfl.sync.i32.rii(i32 %mask, i32 %a) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
;; All three of threadmask (1), offset (2) and mask (3) are immediates. The
;; expected instruction prints them as "2, 3, 1": the threadmask is the first
;; intrinsic argument but the last assembly operand.
;; NOTE(review): parameter %b is never used in the body.
; CHECK-LABEL: .func{{.*}}shfl.sync.i32.iii
|
||||
define {i32, i1} @shfl.sync.i32.iii(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
;; Same intrinsics, but for float
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rrr
|
||||
define {float, i1} @shfl.sync.f32.rrr(i32 %mask, float %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.irr
|
||||
define {float, i1} @shfl.sync.f32.irr(float %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rri
|
||||
define {float, i1} @shfl.sync.f32.rri(i32 %mask, float %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iri
|
||||
define {float, i1} @shfl.sync.f32.iri(float %a, i32 %b) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rir
|
||||
define {float, i1} @shfl.sync.f32.rir(i32 %mask, float %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iir
|
||||
define {float, i1} @shfl.sync.f32.iir(float %a, i32 %c) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.rii
|
||||
define {float, i1} @shfl.sync.f32.rii(i32 %mask, float %a) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
|
||||
ret {float, i1} %val
|
||||
}
|
||||
|
||||
;; All three of threadmask (1), offset (2) and mask (3) are immediates. The
;; expected instruction prints them as "2, 3, 1": the threadmask is the first
;; intrinsic argument but the last assembly operand.
;; NOTE(review): parameter %b is never used in the body.
; CHECK-LABEL: .func{{.*}}shfl.sync.f32.iii
|
||||
define {float, i1} @shfl.sync.f32.iii(float %a, i32 %b) {
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)
|
||||
ret {float, i1} %val
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user