// Only the thread with warpid = 0 reads mean.[i / num_rows]; every other thread
// needs a placeholder value of type 'T. The Unchecked.defaultof<'T> used for that
// placeholder is what triggers the compile error described below.
let t = if warpid = 0 then mean.[i / num_rows] else (Unchecked.defaultof<'T>)
__syncthreads()
// Shuffle t with source lane 0 and width 32, so v receives the value read above.
let v = __shfl t 0 32
I want to get the default value of 'T, but the above snippet gives a compile error due to Unchecked.defaultof<'T>. What would be the preferred way of doing a warp shuffle in Alea?
Right now I have a problem where many threads read from the same location once, so I am trying to test whether it would be more efficient to have only the first thread read from that spot and then shuffle the value to the others in the warp. (Edit: Not at all. The cache is doing its job nicely.)
It is a good idea to support Unchecked.defaultof; I will check this, thanks.
Currently there are two ways to get a default value of type 'T:
Use Alea.CUDA.Intrinsic.__default_value<'T>() (see here). Intrinsic is an auto-open module, so if you have opened the Alea.CUDA namespace, you can use __default_value() directly in your code.
The second way is to open the namespace Alea.CUDA.Utilities and use the auto-opened NumericLiteralG module (see here); then, in your inline generic function, you can directly write literals such as 0G, 1G, etc.
For your second question, I have pasted below some source code of the helper warp shuffle types, which includes the broadcasting usage. These helper static methods are available in the module Alea.CUDA.Intrinsic:
/// Static helpers that wrap the warp shuffle intrinsics. Each operation has two
/// overloads: one taking an explicit width, and one that uses `__warp_size()`.
[<AbstractClass; Sealed>]
type WarpShuffle private () =

    /// Calls `__shfl` with the given source lane and width.
    [<ReflectedDefinition>]
    static member Broadcast(input:'T, srcLane:int, width:int) =
        __shfl input srcLane width

    /// Calls `__shfl` with the given source lane, using `__warp_size()` as the width.
    [<ReflectedDefinition>]
    static member Broadcast(input:'T, srcLane:int) =
        __shfl input srcLane (__warp_size())

    /// Calls `__shfl_up` with the given delta and width.
    [<ReflectedDefinition>]
    static member Up(input:'T, delta:int, width:int) =
        __shfl_up input delta width

    /// Calls `__shfl_up` with the given delta, using `__warp_size()` as the width.
    [<ReflectedDefinition>]
    static member Up(input:'T, delta:int) =
        __shfl_up input delta (__warp_size())

    /// Calls `__shfl_down` with the given delta and width.
    [<ReflectedDefinition>]
    static member Down(input:'T, delta:int, width:int) =
        __shfl_down input delta width

    /// Calls `__shfl_down` with the given delta, using `__warp_size()` as the width.
    [<ReflectedDefinition>]
    static member Down(input:'T, delta:int) =
        __shfl_down input delta (__warp_size())

    /// Calls `__shfl_xor` with the given lane mask and width.
    [<ReflectedDefinition>]
    static member Xor(input:'T, laneMask:int, width:int) =
        __shfl_xor input laneMask width

    /// Calls `__shfl_xor` with the given lane mask, using `__warp_size()` as the width.
    [<ReflectedDefinition>]
    static member Xor(input:'T, laneMask:int) =
        __shfl_xor input laneMask (__warp_size())
// Helpers built on the raw shuffle intrinsics (`__shfl_raw`, `__shfl_up_raw`,
// `__shfl_down_raw`). The last argument of each raw call is the control operand
// that the original code named `shflC`.
///[omit]
[<AbstractClass; Sealed>]
type FullWarpShuffle private () =

    /// Calls `__shfl_raw` with a control operand of `logicWarpThreads - 1`.
    [<ReflectedDefinition>]
    static member Broadcast(input:'T, srcLane:int, logicWarpThreads:int) =
        __shfl_raw input srcLane (logicWarpThreads - 1)

    /// Calls `__shfl_raw` with a control operand of `__warp_size() - 1`.
    [<ReflectedDefinition>]
    static member Broadcast(input:'T, srcLane:int) =
        __shfl_raw input srcLane (__warp_size() - 1)

    /// Calls `__shfl_up_raw` with a control operand of 0.
    // NOTE(review): presumably 0 is the lower clamp for an upward shuffle over a
    // full warp; confirm against the PTX `shfl.up` description.
    [<ReflectedDefinition>]
    static member Up(input:'T, srcOffset:int) =
        __shfl_up_raw input srcOffset 0

    /// Calls `__shfl_down_raw` with a control operand of `__warp_size() - 1`.
    [<ReflectedDefinition>]
    static member Down(input:'T, srcOffset:int) =
        __shfl_down_raw input srcOffset (__warp_size() - 1)

    /// Calls `__shfl_down_raw` with a control operand of `warpThreads - 1`.
    [<ReflectedDefinition>]
    static member Down(input:'T, srcOffset:int, warpThreads:int) =
        __shfl_down_raw input srcOffset (warpThreads - 1)
The code above uses __shfl_raw, for which the online documentation is out of date. It is the raw version of the PTX instruction shfl.idx, where shflC contains two packed values: a mask for logically splitting warps into sub-segments, and an upper bound for clamping the source lane index. Read more here.