OpenCL kernel works with certain data types but not others

I'm trying to learn OpenCL, and currently I'm practising making different kernels. In my attempt to make a dot product using reduction methods, I ran into an issue I don't understand. When I run my code with int inputs and output, it works fine. When I change all the int types to float types (for the inputs and outputs) it gives me a result that is close but is slightly off. Can anyone figure out why this is or what is causing it?

Here's the host code

#include <stdio.h>
#include <CL/cl.h>
#include <string.h>
#include <stdlib.h>

/*
*   IF ERROR, PRINT OUT OPENCL ERROR CODE
*/
void errorCheck(cl_int error){
    if(error < CL_SUCCESS){
        printf("Failed - OpenCL error #: %d\n", error);
        //exit(1);
    }
}

int main(){

cl_int err;
cl_uint numPlatforms = 0;
cl_uint numDevices = 0;
cl_platform_id * platformIDs = NULL;
cl_device_id * deviceIDs = NULL;
cl_context context = NULL;
cl_command_queue cmdQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;

FILE *program_handle;
char *program_buffer;
size_t program_size;

cl_mem a_buff = NULL;
cl_mem b_buff = NULL;
cl_mem c_buff = NULL;
int * a = NULL;
int * b = NULL;
int * c = NULL;
int correct_dot = 0;
int ELEMENTS = 1024;

const int datasize = sizeof(int)*ELEMENTS;

a = (int *)malloc(datasize);
b = (int *)malloc(datasize);
c = (int *)malloc(datasize);

int i;
for(i = 0; i<ELEMENTS; i++){
    a[i] = i;
    b[i] = i;
    correct_dot += (int)i*i;
}

/*
**RETRIEVE NUMBER OF PLATFORMS
*/
err = clGetPlatformIDs(0, NULL, &numPlatforms);
errorCheck(err);

/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF PLATFORMS
*/
platformIDs = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
err = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
errorCheck(err);

/*
**RETRIEVE NUMBER OF DEVICES
*/
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
errorCheck(err);

/*
**ALLOCATE SPACE FOR/RETRIEVE IDS OF DEVICES
*/
deviceIDs = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
err = clGetDeviceIDs(platformIDs[0], CL_DEVICE_TYPE_GPU, numDevices, deviceIDs, NULL);
errorCheck(err);

/*
**GET INFO ON DEVICE
*/
cl_uint * info;
size_t info_size;

err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, 0, NULL, &info_size);
info = (cl_uint *) malloc(info_size);
err = clGetDeviceInfo(deviceIDs[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, info_size, info, NULL);
errorCheck(err);


/*
**CREATE CONTEXT
*/
context = clCreateContext(NULL, numDevices, deviceIDs, NULL, NULL, &err);
errorCheck(err);

/*
**CREATE COMMAND QUEUE
*/
cmdQueue = clCreateCommandQueue(context, deviceIDs[0], 0, &err);
errorCheck(err);

/*
**CREATE BUFFERS
*/
a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, a, &err);
errorCheck(err);
b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, datasize, b, &err);
errorCheck(err);
c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, datasize, c, &err);
errorCheck(err);

err = clEnqueueWriteBuffer(cmdQueue, a_buff, CL_TRUE, 0, datasize, a, 0, NULL, NULL);
errorCheck(err);
err = clEnqueueWriteBuffer(cmdQueue, b_buff, CL_TRUE, 0, datasize, b, 0, NULL, NULL);
errorCheck(err);

/*
**READ PROGRAM FROM FILE
*/
program_handle = fopen("kernels/mvdot.cl", "r");
if(program_handle == NULL) {
    perror("Couldn't find the program file");
    exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);

/*
**CREATE PROGRAM
*/
program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err);
errorCheck(err);

/*
**BUILD PROGRAM
*/
err = clBuildProgram(program, 1, deviceIDs, NULL, NULL, NULL);
errorCheck(err);
if (err == CL_BUILD_PROGRAM_FAILURE) {
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    // Allocate memory for the log
    char *log = (char *) malloc(log_size);
    // Get the log
    clGetProgramBuildInfo(program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    // Print the log
    printf("%s\n", log);
}

/*
**CREATE KERNEL
*/
kernel = clCreateKernel(program, "mvdot", &err);
errorCheck(err);

/*
**SET UP KERNEL ARGS
*/
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &a_buff);  //input A
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &b_buff); //input B
err |= clSetKernelArg(kernel, 2, datasize, NULL);           //local scratch 
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &c_buff); //output C
err |= clSetKernelArg(kernel, 4, sizeof(cl_int), (void *) &ELEMENTS);   //size
errorCheck(err);

/*
**ENQUEUE KERNEL FOR EXECUTION
*/
const size_t global_work_size = ELEMENTS;
const size_t local_work_size = info[0];
err = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
errorCheck(err);


/*
**READ OUTPUT
*/
err = clEnqueueReadBuffer(cmdQueue, c_buff, CL_TRUE, 0, datasize, c, 0, NULL, NULL);
errorCheck(err);

/*
**CHECK OUTPUT
*/
int our_dot = 0;
printf("C = [");
//elements/info[0] gives # of work groups
for(i = 0; i<ELEMENTS; i++){
    printf(" %d ", c[i]);
    our_dot += c[i];
}
printf("]\n");

printf("Our dot = %d\n", our_dot);
printf("Correct dot = %d\n", correct_dot);

if(our_dot == correct_dot){
    printf("PASSED\n");
} else {
    printf("FAILED\n");
}

/*
**FREE RESOURCES
*/
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);

clReleaseMemObject(a_buff);
clReleaseMemObject(b_buff);
clReleaseMemObject(c_buff);

free(platformIDs);
free(deviceIDs);
free(program_buffer);

free(a);
free(b);
free(c);

}

And here's the kernel code.

__kernel void mvdot(
            __global int* a,
        __global int* b,
            __local int* scratch,
            __global int* c,
        __const int size)
{

    int g_id = get_global_id(0);
    int l_id = get_local_id(0);
    int local_size = get_local_size(0);
    int b_id = get_group_id(0);

    //move the product of the two inputs to scratch
    //if our thread is outside of the range of the inputs
    //pad the scratch with 0
    scratch[l_id] = (g_id < size) ? (a[g_id] * b[g_id]) : 0;

    //Use local memory to compute the sum of the products
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = 2; i<=local_size; i*=2) {
        int offset = local_size/i;
        scratch[l_id] = (l_id < offset) ? (scratch[l_id] + scratch[l_id + offset]) : scratch[l_id];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    //Move the final sum for each group to the output
    if(l_id == 0){
        c[b_id] = scratch[0];
    }

}

I'm sure it's something very silly, but I'm just not understanding it.

Solution

There's a race condition inside your local memory reduction, since different work-items read and write the same location on successive loop iterations. You need a barrier on each iteration, and you should also avoid rewriting values when no work is to be performed (since this will conflict with the other work-item that is trying to read the location).

The typical way to write this kind of work-group reduction looks something like this:

//Use local memory to compute the sum of the products
for (int offset = local_size/2; offset > 0; offset/=2) {
    barrier(CLK_LOCAL_MEM_FENCE);
    if (l_id < offset)
      scratch[l_id] = scratch[l_id] + scratch[l_id + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);

If you know the SIMD width of the hardware you are executing on, you can unroll the last few iterations of the loop and execute them without a barrier (synchronisation is no longer required if the work-items are executing in lock-step).

As pointed out by @DarkZeros in the comments, the culprit here is the precision of floating point (although the race condition above should still be dealt with). You can verify this by changing the data-type to double - doing this results in your code passing the test on my GPUs.