c performance algorithm opencl measurement

C vs OpenCL, how to compare results of time measurement?

So, in the other post I questioned about C time measurement. Now, I wanna know how to compare the result of the C "function" vs the OpenCL "function"

This is the code of the host OpenCL and C

#define PROGRAM_FILE "sum.cl"
#define KERNEL_FUNC "float_sum"
#define ARRAY_SIZE 1000000


#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <CL/cl.h>

int main()
{
    /* OpenCL Data structures */

    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_program program;
    cl_kernel kernel;    
    cl_command_queue queue;
    cl_mem vec_buffer, result_buffer;

    cl_event prof_event;;

    /* ********************* */

    /* C Data Structures / Data types */
    FILE *program_handle; //Kernel file handle
    char *program_buffer; //Kernel buffer

    float *vec, *non_parallel;
    float result[ARRAY_SIZE];

    size_t program_size; //Kernel file size

    cl_ulong time_start, time_end, total_time;

    int i;
    /* ****************************** */

    /* Errors */
    cl_int err;
    /* ****** */

    non_parallel = (float*)malloc(ARRAY_SIZE * sizeof(float));
    vec          = (float*)malloc(ARRAY_SIZE * sizeof(float));

    //Initialize the vector of floats
    for(i = 0; i < ARRAY_SIZE; i++)
    vec[i] = i + 1;

    /************************* C Function **************************************/
    clock_t start, end;

    start = clock();

    for( i = 0; i < ARRAY_SIZE; i++) 
    {
    non_parallel[i] = vec[i] * vec[i];
    }
    end = clock();
    printf( "Number of seconds: %f\n", (clock()-start)/(double)CLOCKS_PER_SEC );

    free(non_parallel);
    /***************************************************************************/




    clGetPlatformIDs(1, &platform, NULL);//Just want NVIDIA platform
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

    // Context error?
    if(err)
    {
    perror("Cannot create context");
    return 1;
    }

    //Read the kernel file
    program_handle = fopen(PROGRAM_FILE,"r");
    fseek(program_handle, 0, SEEK_END);
    program_size = ftell(program_handle);
    rewind(program_handle);

    program_buffer = (char*)malloc(program_size + 1);
    program_buffer[program_size] = '\0';
    fread(program_buffer, sizeof(char), program_size, program_handle);
    fclose(program_handle);

    //Create the program
    program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, 
                    &program_size, &err);

    if(err)
    {
    perror("Cannot create program");
    return 1;
    }

    free(program_buffer);

    clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    kernel = clCreateKernel(program, KERNEL_FUNC, &err);

    if(err)
    {
    perror("Cannot create kernel");
    return 1;
    }

    queue = clCreateCommandQueue(context, device, CL_QUEU_PROFILING_ENABLE, &err);

    if(err)
    {
    perror("Cannot create command queue");
    return 1;
    }

    vec_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                sizeof(float) * ARRAY_SIZE, vec, &err);
    result_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*ARRAY_SIZE, NULL, &err);

    if(err)
    {
    perror("Cannot create the vector buffer");
    return 1;
    }

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &vec_buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &result_buffer);

    size_t global_size = ARRAY_SIZE;
    size_t local_size = 0;

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &prof_event);

    clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(float)*ARRAY_SIZE, &result, 0, NULL, NULL);
    clFinish(queue);



     clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
           sizeof(time_start), &time_start, NULL);
     clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
           sizeof(time_end), &time_end, NULL);
     total_time += time_end - time_start;

    printf("\nAverage time in nanoseconds = %lu\n", total_time/ARRAY_SIZE);



    clReleaseMemObject(vec_buffer);
    clReleaseMemObject(result_buffer);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);

    free(vec);

    return 0;
}

And the kernel is:

__kernel void float_sum(__global float* vec,__global float* result){
    int gid = get_global_id(0);
    result[gid] = vec[gid] * vec[gid];
}

Now, the results are:

Number of seconds: 0.010000 <- This is the for the C code

Average time in nanoseconds = 140737284 <- OpenCL function

0,1407 seconds is the time of the OpenCL time kernel execution, and it's more than the C function, is it correct? Beacause I think OpenCL should be fastest than C non parallel algorithm...

Solution

Executing parallel code on the GPU is not necessarily faster that executing on the CPU. Take into account that you also have to transfer the data to and from the GPU memory in addition to the computations.

In your example you are transferring 2 * N items and doing an O(N) operation in parallel, which is a very inefficient use of the GPU. Therefore, it's quite likely that the CPU is indeed faster for this particular computation.