Tags: c, cuda, thrust

Efficiency of CUDA vector types (float2, float3, float4)


I'm trying to understand the integrate_functor in particles_kernel.cu from the CUDA SDK's particles example:

struct integrate_functor
{
    float deltaTime;    
    //constructor for functor
    //...

    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float4 posData = thrust::get<2>(t);
        volatile float4 velData = thrust::get<3>(t);

        float3 pos = make_float3(posData.x, posData.y, posData.z);
        float3 vel = make_float3(velData.x, velData.y, velData.z);

        // update position and velocity
        // ...

        // store new position and velocity
        thrust::get<0>(t) = make_float4(pos, posData.w);
        thrust::get<1>(t) = make_float4(vel, velData.w);
    }
};

The functor calls make_float4(pos, posData.w), but make_float4 is defined in vector_functions.h as

static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)
{
    float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} 

Are CUDA vector types (float3 and float4) more efficient on the GPU, and how does the compiler resolve this call to make_float4?
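For context, the only make_float4 in vector_functions.h is the four-float version above, so presumably an extra overload along these lines is in play somewhere (a hypothetical sketch, not verbatim SDK code):

// hypothetical (float3, float) overload, in the spirit of the samples' helper_math.h
static __inline__ __host__ __device__ float4 make_float4(float3 a, float w)
{
    return make_float4(a.x, a.y, a.z, w);   // forwards to the four-float version
}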


Solution

  • I'm expanding njuffa's comment into a worked example. In that example, I'm simply adding two arrays in three different ways: loading the data as float, float2 or float4.

    These are the timings on a GT540M and on a Kepler K20c card:

    GT540M
    float  - Elapsed time:  74.1 ms
    float2 - Elapsed time:  61.0 ms
    float4 - Elapsed time:  56.1 ms
    
    Kepler K20c
    float  - Elapsed time:  4.4 ms 
    float2 - Elapsed time:  3.3 ms 
    float4 - Elapsed time:  3.2 ms
    

    As can be seen, loading the data as float4 is the fastest approach.

    Below is the disassembled code for the three kernels (compiled for compute capability 2.1).

    add_float

            Function : _Z9add_floatPfS_S_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R2, SR_TID.X;                               /* 0x2c00000084009c04 */
    /*0010*/         SHL R2, R2, 0x2;                                /* 0x6000c00008209c03 */
    /*0018*/         S2R R0, SR_CTAID.X;                             /* 0x2c00000094001c04 */
    /*0020*/         SHL R0, R0, 0x2;                                /* 0x6000c00008001c03 */
    /*0028*/         IMAD R0, R0, c[0x0][0x8], R2;                   /* 0x2004400020001ca3 */
    /*0030*/         ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT;  /* 0x1b0e4000b001dc03 */
    /*0038*/     @P0 BRA.U 0xd8;                                     /* 0x40000002600081e7 */
    /*0040*/    @!P0 ISCADD R2, R0, c[0x0][0x24], 0x2;               /* 0x400040009000a043 */
    /*0048*/    @!P0 ISCADD R10, R0, c[0x0][0x20], 0x2;              /* 0x400040008002a043 */
    /*0050*/    @!P0 ISCADD R0, R0, c[0x0][0x28], 0x2;               /* 0x40004000a0002043 */
    /*0058*/    @!P0 LD R8, [R2];                                    /* 0x8000000000222085 */
    /*0060*/    @!P0 LD R6, [R2+0x4];                                /* 0x800000001021a085 */
    /*0068*/    @!P0 LD R4, [R2+0x8];                                /* 0x8000000020212085 */
    /*0070*/    @!P0 LD R9, [R10];                                   /* 0x8000000000a26085 */
    /*0078*/    @!P0 LD R7, [R10+0x4];                               /* 0x8000000010a1e085 */
    /*0080*/    @!P0 LD R5, [R10+0x8];                               /* 0x8000000020a16085 */
    /*0088*/    @!P0 LD R3, [R10+0xc];                               /* 0x8000000030a0e085 */
    /*0090*/    @!P0 LD R2, [R2+0xc];                                /* 0x800000003020a085 */
    /*0098*/    @!P0 FADD R8, R9, R8;                                /* 0x5000000020922000 */
    /*00a0*/    @!P0 FADD R6, R7, R6;                                /* 0x500000001871a000 */
    /*00a8*/    @!P0 FADD R4, R5, R4;                                /* 0x5000000010512000 */
    /*00b0*/    @!P0 ST [R0], R8;                                    /* 0x9000000000022085 */
    /*00b8*/    @!P0 FADD R2, R3, R2;                                /* 0x500000000830a000 */
    /*00c0*/    @!P0 ST [R0+0x4], R6;                                /* 0x900000001001a085 */
    /*00c8*/    @!P0 ST [R0+0x8], R4;                                /* 0x9000000020012085 */
    /*00d0*/    @!P0 ST [R0+0xc], R2;                                /* 0x900000003000a085 */
    /*00d8*/         EXIT;                                           /* 0x8000000000001de7 */
    

    add_float2

            Function : _Z10add_float2P6float2S0_S0_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
    /*0008*/         S2R R2, SR_TID.X;                               /* 0x2c00000084009c04 */
    /*0010*/         SHL R2, R2, 0x1;                                /* 0x6000c00004209c03 */
    /*0018*/         S2R R0, SR_CTAID.X;                             /* 0x2c00000094001c04 */
    /*0020*/         SHL R0, R0, 0x1;                                /* 0x6000c00004001c03 */
    /*0028*/         IMAD R0, R0, c[0x0][0x8], R2;                   /* 0x2004400020001ca3 */
    /*0030*/         ISETP.GE.U32.AND P0, PT, R0, c[0x0][0x2c], PT;  /* 0x1b0e4000b001dc03 */
    /*0038*/     @P0 BRA.U 0xa8;                                     /* 0x40000001a00081e7 */
    /*0040*/    @!P0 ISCADD R10, R0, c[0x0][0x20], 0x3;              /* 0x400040008002a063 */
    /*0048*/    @!P0 ISCADD R11, R0, c[0x0][0x24], 0x3;              /* 0x400040009002e063 */
    /*0050*/    @!P0 ISCADD R0, R0, c[0x0][0x28], 0x3;               /* 0x40004000a0002063 */
    /*0058*/    @!P0 LD.64 R4, [R10];                                /* 0x8000000000a120a5 */
    /*0060*/    @!P0 LD.64 R8, [R11];                                /* 0x8000000000b220a5 */
    /*0068*/    @!P0 LD.64 R2, [R10+0x8];                            /* 0x8000000020a0a0a5 */
    /*0070*/    @!P0 LD.64 R6, [R11+0x8];                            /* 0x8000000020b1a0a5 */
    /*0078*/    @!P0 FADD R9, R5, R9;                                /* 0x5000000024526000 */
    /*0080*/    @!P0 FADD R8, R4, R8;                                /* 0x5000000020422000 */
    /*0088*/    @!P0 FADD R3, R3, R7;                                /* 0x500000001c30e000 */
    /*0090*/    @!P0 FADD R2, R2, R6;                                /* 0x500000001820a000 */
    /*0098*/    @!P0 ST.64 [R0], R8;                                 /* 0x90000000000220a5 */
    /*00a0*/    @!P0 ST.64 [R0+0x8], R2;                             /* 0x900000002000a0a5 */
    /*00a8*/         EXIT;                                           /* 0x8000000000001de7 */
    

    add_float4

            Function : _Z10add_float4P6float4S0_S0_j
    .headerflags    @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)"
    /*0000*/         MOV R1, c[0x1][0x100];                  /* 0x2800440400005de4 */
    /*0008*/         NOP;                                    /* 0x4000000000001de4 */
    /*0010*/         MOV R3, c[0x0][0x2c];                   /* 0x28004000b000dde4 */
    /*0018*/         S2R R0, SR_CTAID.X;                     /* 0x2c00000094001c04 */
    /*0020*/         SHR.U32 R3, R3, 0x2;                    /* 0x5800c0000830dc03 */
    /*0028*/         S2R R2, SR_TID.X;                       /* 0x2c00000084009c04 */
    /*0030*/         IMAD R0, R0, c[0x0][0x8], R2;           /* 0x2004400020001ca3 */
    /*0038*/         ISETP.GE.U32.AND P0, PT, R0, R3, PT;    /* 0x1b0e00000c01dc03 */
    /*0040*/     @P0 BRA.U 0x98;                             /* 0x40000001400081e7 */
    /*0048*/    @!P0 ISCADD R2, R0, c[0x0][0x20], 0x4;       /* 0x400040008000a083 */
    /*0050*/    @!P0 ISCADD R3, R0, c[0x0][0x24], 0x4;       /* 0x400040009000e083 */
    /*0058*/    @!P0 ISCADD R0, R0, c[0x0][0x28], 0x4;       /* 0x40004000a0002083 */
    /*0060*/    @!P0 LD.128 R8, [R2];                        /* 0x80000000002220c5 */
    /*0068*/    @!P0 LD.128 R4, [R3];                        /* 0x80000000003120c5 */
    /*0070*/    @!P0 FADD R7, R11, R7;                       /* 0x500000001cb1e000 */
    /*0078*/    @!P0 FADD R6, R10, R6;                       /* 0x5000000018a1a000 */
    /*0080*/    @!P0 FADD R5, R9, R5;                        /* 0x5000000014916000 */
    /*0088*/    @!P0 FADD R4, R8, R4;                        /* 0x5000000010812000 */
    /*0090*/    @!P0 ST.128 [R0], R4;                        /* 0x90000000000120c5 */
    /*0098*/         EXIT;                                   /* 0x8000000000001de7 */
    

    As can be seen, and as mentioned by njuffa, different load instructions are used in the three cases: LD, LD.64 and LD.128, respectively. The wider loads move the same amount of data with fewer instructions, which reduces instruction overhead and makes it easier to saturate the memory bandwidth.
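    The same 128-bit accesses can also be obtained without changing a kernel's float* signature by reinterpreting the pointers as float4* inside the kernel. A minimal sketch (assuming the pointers are 16-byte aligned, which cudaMalloc guarantees for the allocation start, and that N is a multiple of 4):

    __global__ void add_float_as_float4(const float *d_a, const float *d_b, float *d_c, unsigned int N) {

        const int tid = threadIdx.x + blockIdx.x * blockDim.x;

        if (tid < N/4) {

            // reinterpreting the float arrays as float4 lets the compiler emit LD.128/ST.128
            float4 a = reinterpret_cast<const float4 *>(d_a)[tid];
            float4 b = reinterpret_cast<const float4 *>(d_b)[tid];

            float4 c;
            c.x = a.x + b.x;
            c.y = a.y + b.y;
            c.z = a.z + b.z;
            c.w = a.w + b.w;

            reinterpret_cast<float4 *>(d_c)[tid] = c;
        }
    }

    This is the same access pattern as the add_float4 kernel in the full code below, just with the cast moved inside the kernel.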

    Finally, the code:

    #include <cstdio>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    
    #define BLOCKSIZE 256
    
    /*******************/
    /* iDivUp FUNCTION */
    /*******************/
    int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
       if (code != cudaSuccess) 
       {
          fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
          if (abort) exit(code);
       }
    }
    
    /********************/
    /* ADD_FLOAT KERNEL */
    /********************/
    __global__ void add_float(float *d_a, float *d_b, float *d_c, unsigned int N) {
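        // Each thread handles four consecutive floats; as the disassembly above shows, the compiler emits scalar LD/ST instructions here.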
    
        const int tid = 4 * threadIdx.x + blockIdx.x * (4 * blockDim.x);
    
        if (tid < N) {
    
            float a1 = d_a[tid];
            float b1 = d_b[tid];
    
            float a2 = d_a[tid+1];
            float b2 = d_b[tid+1];
    
            float a3 = d_a[tid+2];
            float b3 = d_b[tid+2];
    
            float a4 = d_a[tid+3];
            float b4 = d_b[tid+3];
    
            float c1 = a1 + b1;
            float c2 = a2 + b2;
            float c3 = a3 + b3;
            float c4 = a4 + b4;
    
            d_c[tid] = c1;
            d_c[tid+1] = c2;
            d_c[tid+2] = c3;
            d_c[tid+3] = c4;
    
            //if ((tid < 1800) && (tid > 1790)) {
                //printf("%i %i %i %f %f %f\n", tid, threadIdx.x, blockIdx.x, a1, b1, c1);
                //printf("%i %i %i %f %f %f\n", tid+1, threadIdx.x, blockIdx.x, a2, b2, c2);
                //printf("%i %i %i %f %f %f\n", tid+2, threadIdx.x, blockIdx.x, a3, b3, c3);
                //printf("%i %i %i %f %f %f\n", tid+3, threadIdx.x, blockIdx.x, a4, b4, c4);
            //}
    
        }
    
    }
    
    /*********************/
    /* ADD_FLOAT2 KERNEL */
    /*********************/
    __global__ void add_float2(float2 *d_a, float2 *d_b, float2 *d_c, unsigned int N) {
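        // Each thread handles two float2 elements; the compiler emits 64-bit LD.64/ST.64 accesses here.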
    
        const int tid = 2 * threadIdx.x + blockIdx.x * (2 * blockDim.x);
    
        if (tid < N/2) {    // N counts floats, so there are N/2 float2 elements
    
            float2 a1 = d_a[tid];
            float2 b1 = d_b[tid];
    
            float2 a2 = d_a[tid+1];
            float2 b2 = d_b[tid+1];
    
            float2 c1;
            c1.x = a1.x + b1.x;
            c1.y = a1.y + b1.y;
    
            float2 c2;
            c2.x = a2.x + b2.x;
            c2.y = a2.y + b2.y;
    
            d_c[tid] = c1;
            d_c[tid+1] = c2;
    
        }
    
    }
    
    /*********************/
    /* ADD_FLOAT4 KERNEL */
    /*********************/
    __global__ void add_float4(float4 *d_a, float4 *d_b, float4 *d_c, unsigned int N) {
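        // Each thread handles one float4 element; the compiler emits 128-bit LD.128/ST.128 accesses here.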
    
        const int tid = 1 * threadIdx.x + blockIdx.x * (1 * blockDim.x);
    
        if (tid < N/4) {
    
            float4 a1 = d_a[tid];
            float4 b1 = d_b[tid];
    
            float4 c1;
            c1.x = a1.x + b1.x;
            c1.y = a1.y + b1.y;
            c1.z = a1.z + b1.z;
            c1.w = a1.w + b1.w;
    
            d_c[tid] = c1;
    
        }
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main() {
    
        const int N = 4*10000000;
    
        const float a = 3.f;
        const float b = 5.f;
    
        // --- float
    
        thrust::device_vector<float> d_A(N, a);
        thrust::device_vector<float> d_B(N, b);
        thrust::device_vector<float> d_C(N);
    
        float time;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        add_float<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_A.data()), thrust::raw_pointer_cast(d_B.data()), thrust::raw_pointer_cast(d_C.data()), N);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Elapsed time:  %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        thrust::host_vector<float> h_float = d_C;
        for (int i=0; i<N; i++) {
            if (h_float[i] != (a+b)) {
                printf("Error for add_float at %i: result is %f\n",i, h_float[i]);
                return -1;
            }
        }
    
        // --- float2
    
        thrust::device_vector<float> d_A2(N, a);
        thrust::device_vector<float> d_B2(N, b);
        thrust::device_vector<float> d_C2(N);
    
        cudaEventRecord(start, 0);
        add_float2<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float2*)thrust::raw_pointer_cast(d_A2.data()), (float2*)thrust::raw_pointer_cast(d_B2.data()), (float2*)thrust::raw_pointer_cast(d_C2.data()), N);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Elapsed time:  %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        thrust::host_vector<float> h_float2 = d_C2;
        for (int i=0; i<N; i++) {
            if (h_float2[i] != (a+b)) {
                printf("Error for add_float2 at %i: result is %f\n",i, h_float2[i]);
                return -1;
            }
        }
    
        // --- float4
    
        thrust::device_vector<float> d_A4(N, a);
        thrust::device_vector<float> d_B4(N, b);
        thrust::device_vector<float> d_C4(N);
    
        cudaEventRecord(start, 0);
        add_float4<<<iDivUp(N/4, BLOCKSIZE), BLOCKSIZE>>>((float4*)thrust::raw_pointer_cast(d_A4.data()), (float4*)thrust::raw_pointer_cast(d_B4.data()), (float4*)thrust::raw_pointer_cast(d_C4.data()), N);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Elapsed time:  %3.1f ms \n", time); gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        thrust::host_vector<float> h_float4 = d_C4;
        for (int i=0; i<N; i++) {
            if (h_float4[i] != (a+b)) {
                printf("Error for add_float4 at %i: result is %f\n",i, h_float4[i]);
                return -1;
            }
        }
    
        return 0;
    }