mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 11:02:59 +02:00
[NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins.
Differential Revision: https://reviews.llvm.org/D38090 llvm-svn: 313820
This commit is contained in:
parent
8ad966131c
commit
bbbe792995
@ -3736,4 +3736,48 @@ def int_nvvm_shfl_idx_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_idx_f32">;
|
||||
|
||||
// Synchronizing shfl variants available in CUDA-9.
|
||||
// On sm_70 these don't have to be convergent, so we may eventually want to
|
||||
// implement non-convergent variants of these intrinsics.
|
||||
|
||||
// shfl.sync.down.b32 dest, threadmask, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_down_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
|
||||
def int_nvvm_shfl_sync_down_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
|
||||
|
||||
// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_up_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
|
||||
def int_nvvm_shfl_sync_up_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
|
||||
|
||||
// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_bfly_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
|
||||
def int_nvvm_shfl_sync_bfly_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
|
||||
|
||||
// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
|
||||
def int_nvvm_shfl_sync_idx_i32 :
|
||||
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
|
||||
def int_nvvm_shfl_sync_idx_f32 :
|
||||
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
|
||||
GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
|
||||
}
|
||||
|
@ -111,8 +111,80 @@ defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
|
||||
defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
|
||||
defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
|
||||
|
||||
} // isConvergent = 1
|
||||
multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
|
||||
// Threadmask and the last two parameters to shfl.sync can be regs or imms.
|
||||
// ptxas is smart enough to inline constant registers, so strictly speaking we
|
||||
// don't need to handle immediates here. But it's easy enough, and it makes
|
||||
// our ptx more readable.
|
||||
def rrr : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def rri : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, imm:$mask))]>;
|
||||
|
||||
def rir : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
imm:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def rii : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
|
||||
imm:$offset, imm:$mask))]>;
|
||||
|
||||
def irr : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def iri : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
Int32Regs:$offset, imm:$mask))]>;
|
||||
|
||||
def iir : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
imm:$offset, Int32Regs:$mask))]>;
|
||||
|
||||
def iii : NVPTXInst<
|
||||
(outs regclass:$dst),
|
||||
(ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
|
||||
!strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
|
||||
[(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
|
||||
imm:$offset, imm:$mask))]>;
|
||||
}
|
||||
|
||||
// On sm_70 these don't have to be convergent, so we may eventually want to
|
||||
// implement non-convergent variants of these intrinsics.
|
||||
defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
|
||||
defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
|
||||
defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
|
||||
defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
|
||||
defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
|
||||
defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
|
||||
defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
|
||||
defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
|
||||
|
||||
} // isConvergent = 1
|
||||
|
||||
//-----------------------------------
|
||||
// Explicit Memory Fence Functions
|
||||
|
94
test/CodeGen/NVPTX/shfl-sync.ll
Normal file
94
test/CodeGen/NVPTX/shfl-sync.ll
Normal file
@ -0,0 +1,94 @@
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
|
||||
|
||||
declare i32 @llvm.nvvm.shfl.sync.down.i32(i32, i32, i32, i32)
|
||||
; Operand order is (threadmask, val, offset, mask_and_clamp): the i32 thread
; mask comes first and the float value being shuffled is the second operand.
declare float @llvm.nvvm.shfl.sync.down.f32(i32, float, i32, i32)
|
||||
declare i32 @llvm.nvvm.shfl.sync.up.i32(i32, i32, i32, i32)
|
||||
; Operand order is (threadmask, val, offset, mask_and_clamp): the i32 thread
; mask comes first and the float value being shuffled is the second operand.
declare float @llvm.nvvm.shfl.sync.up.f32(i32, float, i32, i32)
|
||||
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32)
|
||||
; Operand order is (threadmask, val, offset, mask_and_clamp): the i32 thread
; mask comes first and the float value being shuffled is the second operand.
declare float @llvm.nvvm.shfl.sync.bfly.f32(i32, float, i32, i32)
|
||||
declare i32 @llvm.nvvm.shfl.sync.idx.i32(i32, i32, i32, i32)
|
||||
; Operand order is (threadmask, val, lane, mask_and_clamp): the i32 thread
; mask comes first and the float value being shuffled is the second operand.
declare float @llvm.nvvm.shfl.sync.idx.f32(i32, float, i32, i32)
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.rrr
|
||||
define i32 @shfl.sync.rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 %c)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.irr
|
||||
define i32 @shfl.sync.irr(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 %c)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.rri
|
||||
define i32 @shfl.sync.rri(i32 %mask, i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 1, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 1)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.iri
|
||||
define i32 @shfl.sync.iri(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[B:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 2, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 2)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.rir
|
||||
define i32 @shfl.sync.rir(i32 %mask, i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, [[C]], [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 %c)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.iir
|
||||
define i32 @shfl.sync.iir(i32 %a, i32 %c) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[C:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, [[C]], 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 %c)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.rii
|
||||
define i32 @shfl.sync.rii(i32 %mask, i32 %a) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, 2, [[MASK]];
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 2)
|
||||
ret i32 %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}shfl.sync.iii
|
||||
define i32 @shfl.sync.iii(i32 %a, i32 %b) {
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]]
|
||||
; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, 3, 1;
|
||||
; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
|
||||
%val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 3)
|
||||
ret i32 %val
|
||||
}
|
Loading…
Reference in New Issue
Block a user