In my previous post I asked about measuring time in C. Now I want to know how to compare the result of the C "function" with that of the OpenCL "function".
This is the host code, which contains both the OpenCL version and the plain C version:
#define PROGRAM_FILE "sum.cl"
#define KERNEL_FUNC "float_sum"
#define ARRAY_SIZE 1000000
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <CL/cl.h>
int main()
{
/* OpenCL Data structures */
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_mem vec_buffer, result_buffer;
cl_event prof_event;;
/* ********************* */
/* C Data Structures / Data types */
FILE *program_handle; //Kernel file handle
char *program_buffer; //Kernel buffer
float *vec, *non_parallel;
float result[ARRAY_SIZE];
size_t program_size; //Kernel file size
cl_ulong time_start, time_end, total_time;
int i;
/* ****************************** */
/* Errors */
cl_int err;
/* ****** */
non_parallel = (float*)malloc(ARRAY_SIZE * sizeof(float));
vec = (float*)malloc(ARRAY_SIZE * sizeof(float));
//Initialize the vector of floats
for(i = 0; i < ARRAY_SIZE; i++)
vec[i] = i + 1;
/************************* C Function **************************************/
clock_t start, end;
start = clock();
for( i = 0; i < ARRAY_SIZE; i++)
{
non_parallel[i] = vec[i] * vec[i];
}
end = clock();
printf( "Number of seconds: %f\n", (clock()-start)/(double)CLOCKS_PER_SEC );
free(non_parallel);
/***************************************************************************/
clGetPlatformIDs(1, &platform, NULL);//Just want NVIDIA platform
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
// Context error?
if(err)
{
perror("Cannot create context");
return 1;
}
//Read the kernel file
program_handle = fopen(PROGRAM_FILE,"r");
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
//Create the program
program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer,
&program_size, &err);
if(err)
{
perror("Cannot create program");
return 1;
}
free(program_buffer);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err)
{
perror("Cannot create kernel");
return 1;
}
queue = clCreateCommandQueue(context, device, CL_QUEU_PROFILING_ENABLE, &err);
if(err)
{
perror("Cannot create command queue");
return 1;
}
vec_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * ARRAY_SIZE, vec, &err);
result_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float)*ARRAY_SIZE, NULL, &err);
if(err)
{
perror("Cannot create the vector buffer");
return 1;
}
clSetKernelArg(kernel, 0, sizeof(cl_mem), &vec_buffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &result_buffer);
size_t global_size = ARRAY_SIZE;
size_t local_size = 0;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &prof_event);
clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(float)*ARRAY_SIZE, &result, 0, NULL, NULL);
clFinish(queue);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time += time_end - time_start;
printf("\nAverage time in nanoseconds = %lu\n", total_time/ARRAY_SIZE);
clReleaseMemObject(vec_buffer);
clReleaseMemObject(result_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
free(vec);
return 0;
}
And the kernel is:
/*
 * Element-wise square: the work-item with global id g along dimension 0
 * reads vec[g] and stores vec[g] * vec[g] into result[g].
 */
__kernel void float_sum(__global float* vec, __global float* result)
{
    const int idx = get_global_id(0);
    const float value = vec[idx];

    result[idx] = value * value;
}
Now, the results are:
Number of seconds: 0.010000 <- This is the for the C code
Average time in nanoseconds = 140737284 <- OpenCL function
0.1407 seconds is the execution time of the OpenCL kernel, and that is more than the C function takes. Is that correct? Because I think OpenCL should be faster than the non-parallel C algorithm...
Executing parallel code on the GPU is not necessarily faster than executing it on the CPU. Take into account that, in addition to the computation itself, you also have to transfer the data to and from GPU memory.
In your example you are transferring 2 * N items and doing an O(N) operation in parallel, which is a very inefficient use of the GPU. Therefore, it's quite likely that the CPU is indeed faster for this particular computation.