Tags: c++, cuda, type-conversion, type-safety

Are conversions from float* to float3* in CUDA safe?


I have just started diving into CUDA code, and it's a bit of a blast from the past: lots of raw pointer access and type conversion through pointers using reinterpret_cast. There is a specific case I wanted to check on; I have seen the following instance of type punning in the code:

__device__ void func(__restrict__ float* const points, size_t size, __restrict__ float* outputPoints) {

    for (size_t index = 0; index < size; index += 3) {
        float3* const point = reinterpret_cast<float3* const>(points + index);
        float3* const output = reinterpret_cast<float3* const>(outputPoints + index);
        // operations using point;
    }
}

In CUDA you are provided a struct float3 which looks like:

struct float3 {
    float x, y, z;
};

Is this behaviour guaranteed to be safe? It is obviously type punning, but I am worried that padding or alignment issues might break this kind of access. I would also appreciate insight into how the CUDA compiler deals with this, because I know it performs some very heavy optimizations. Can those cause problems?


Solution

  • CUDA guarantees that the size of those built-in vector types will be consistent between host and device, with no padding inserted (no such guarantees exist for user-defined structures and classes).

    There are basic alignment requirements on the device: the storage you read must be aligned to the size of the read. So you couldn't read a float3 from an arbitrary byte boundary, but you will be safe reading from a 32 bit aligned boundary, and the memory allocation APIs which CUDA exposes on the host and device guarantee the necessary alignment to make the code you posted safe.
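
    As a quick illustration of those two points, here is a small host-side check (my own sketch, not part of the original question or answer) that the float3 layout is packed and that cudaMalloc hands back storage that is suitably aligned for it:

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // float3 is three packed floats: 12 bytes, 4-byte (float) alignment, no padding.
        static_assert(sizeof(float3) == 3 * sizeof(float), "unexpected padding in float3");
        static_assert(alignof(float3) == alignof(float), "unexpected float3 alignment");

        float* p = nullptr;
        cudaMalloc(&p, 1024 * sizeof(float));
        // Allocations returned by cudaMalloc are aligned to at least 256 bytes, so
        // reinterpreting the pointer as float3* starts on a valid boundary.
        printf("float3 aligned: %d\n",
               int(reinterpret_cast<std::uintptr_t>(p) % alignof(float3) == 0));
        cudaFree(p);
        return 0;
    }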

    The code you have posted (when modified to defeat dead code removal) basically just emits three 32 bit loads and three 32 bit stores. CUDA only has a limited number of native transaction sizes, and they don't map to a 96 bit per-thread request, so there is absolutely no optimization to be gained by doing this:

    __device__ void func(float* const points, size_t size, float* outputPoints) {
    
        for (size_t index = 0; index < size; index += 3) {
            float3* point = reinterpret_cast<float3*>(points + index);
            float3* output = reinterpret_cast<float3*>(outputPoints + index);
    
            float3 val = *point;
            val.x += 1.f; val.y += 2.f; val.z += 3.f;
            *output = val;
        }
    }
    

    which does this:

    $ nvcc -arch=sm_75 -std=c++11 -dc -ptx fffloat3.cu 
    $ tail -40 fffloat3.ptx 
        // .globl   _Z4funcPfmS_
    .visible .func _Z4funcPfmS_(
        .param .b64 _Z4funcPfmS__param_0,
        .param .b64 _Z4funcPfmS__param_1,
        .param .b64 _Z4funcPfmS__param_2
    )
    {
        .reg .pred  %p<3>;
        .reg .f32   %f<7>;
        .reg .b64   %rd<14>;
    
    
        ld.param.u64    %rd11, [_Z4funcPfmS__param_0];
        ld.param.u64    %rd8, [_Z4funcPfmS__param_1];
        ld.param.u64    %rd12, [_Z4funcPfmS__param_2];
        setp.eq.s64 %p1, %rd8, 0;
        mov.u64     %rd13, 0;
        @%p1 bra    BB6_2;
    
    BB6_1:
        ld.f32  %f1, [%rd11];
        ld.f32  %f2, [%rd11+4];
        ld.f32  %f3, [%rd11+8];
        add.f32     %f4, %f1, 0f3F800000;
        add.f32     %f5, %f2, 0f40000000;
        add.f32     %f6, %f3, 0f40400000;
        st.f32  [%rd12], %f4;
        st.f32  [%rd12+4], %f5;
        st.f32  [%rd12+8], %f6;
        add.s64     %rd12, %rd12, 12;
        add.s64     %rd11, %rd11, 12;
        add.s64     %rd13, %rd13, 3;
        setp.lt.u64 %p2, %rd13, %rd8;
        @%p2 bra    BB6_1;
    
    BB6_2:
        ret;
    }
    

    i.e. all that casting is both syntactically bogus and pointless.

    If you were to change to float2, which is a 64 bit request per thread and can be vectorized, the compiler emits vector loads and stores instead.
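
    For reference, here is a sketch of that float2 variant (my reconstruction rather than the code actually compiled, keeping the same add-constants body and size as a count of individual floats):

    __device__ void func(float* const points, size_t size, float* outputPoints) {

        for (size_t index = 0; index < size; index += 2) {
            float2* point = reinterpret_cast<float2*>(points + index);
            float2* output = reinterpret_cast<float2*>(outputPoints + index);

            float2 val = *point;
            val.x += 1.f; val.y += 2.f;
            *output = val;
        }
    }

    Compiled the same way, it produces: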

    .visible .func _Z4funcPfmS_(
        .param .b64 _Z4funcPfmS__param_0,
        .param .b64 _Z4funcPfmS__param_1,
        .param .b64 _Z4funcPfmS__param_2
    )
    {
        .reg .pred  %p<3>;
        .reg .f32   %f<7>;
        .reg .b64   %rd<14>;
    
    
        ld.param.u64    %rd12, [_Z4funcPfmS__param_0];
        ld.param.u64    %rd8, [_Z4funcPfmS__param_1];
        ld.param.u64    %rd11, [_Z4funcPfmS__param_2];
        setp.eq.s64 %p1, %rd8, 0;
        mov.u64     %rd13, 0;
        @%p1 bra    BB6_2;
    
    BB6_1:
        ld.v2.f32   {%f1, %f2}, [%rd12];
        add.f32     %f5, %f2, 0f40000000;
        add.f32     %f6, %f1, 0f3F800000;
        st.v2.f32   [%rd11], {%f6, %f5};
        add.s64     %rd12, %rd12, 8;
        add.s64     %rd11, %rd11, 8;
        add.s64     %rd13, %rd13, 2;
        setp.lt.u64 %p2, %rd13, %rd8;
        @%p2 bra    BB6_1;
    
    BB6_2:
        ret;
    }
    

    Note that the loads and stores now use the vectorized versions of the instructions. The same happens with float4:

        // .globl   _Z4funcPfmS_
    .visible .func _Z4funcPfmS_(
        .param .b64 _Z4funcPfmS__param_0,
        .param .b64 _Z4funcPfmS__param_1,
        .param .b64 _Z4funcPfmS__param_2
    )
    {
        .reg .pred  %p<3>;
        .reg .f32   %f<12>;
        .reg .b64   %rd<14>;
    
    
        ld.param.u64    %rd12, [_Z4funcPfmS__param_0];
        ld.param.u64    %rd8, [_Z4funcPfmS__param_1];
        ld.param.u64    %rd11, [_Z4funcPfmS__param_2];
        setp.eq.s64 %p1, %rd8, 0;
        mov.u64     %rd13, 0;
        @%p1 bra    BB6_2;
    
    BB6_1:
        ld.v4.f32   {%f1, %f2, %f3, %f4}, [%rd12];
        add.f32     %f9, %f3, 0f40400000;
        add.f32     %f10, %f2, 0f40000000;
        add.f32     %f11, %f1, 0f3F800000;
        st.v4.f32   [%rd11], {%f11, %f10, %f9, %f4};
        add.s64     %rd12, %rd12, 8;
        add.s64     %rd11, %rd11, 8;
        add.s64     %rd13, %rd13, 2;
        setp.lt.u64 %p2, %rd13, %rd8;
        @%p2 bra    BB6_1;
    
    BB6_2:
        ret;
    }
    

    TLDR: Your concerns are valid, but the APIs and the compilers will handle sensible cases sensibly. There are, however, alignment and hardware limitations you should be familiar with before trying to write "optimal" code, because it is possible to write a lot of pointless nonsense unless you know exactly what you are doing.