Search code examples
c++openclxeon-phi

Kernel doesn't wait for events


I have a problem with my kernel invocation. My code looks like this:

std::vector<cl::Event> events;
...

queue.enqueueWriteBuffer(arrayFirst, CL_FALSE, 0, sizeOfArray, NULL, NULL, &arrayEvent);
events.push_back(arrayEvent);

queue.enqueueWriteBuffer(arraySecond, CL_FALSE, 0, sizeOfArraySecond, this->arraySecond, NULL, &arraySecondEvent);
events.push_back(arraySecondEvent);

kernel(cl::EnqueueArgs(queue, events, cl::NDRange(512), cl::NDRange(128)), arrayFirst, arraySecond);

When I run it, the kernel code is never executed. However, if I change the "make_kernel" invocation to this:

kernel(cl::EnqueueArgs(queue, arraySecondEvent, cl::NDRange(512), cl::NDRange(128)), arrayFirst, arraySecond);

the kernel does run, but then I can't be sure that the memory for "arrayFirst" has been set up correctly before the kernel starts. I checked the documentation of the OpenCL 1.2 C++ wrapper and found that the constructor should look like this:

cl::EnqueueArgs::EnqueueArgs(CommandQueue &queue,
const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global,
NDRange local) //page 42

but when I try to pass the address of the events vector, I get a compilation error saying that there is no suitable constructor for the following arguments.

Error:

error: no instance of constructor "cl::EnqueueArgs::EnqueueArgs" matches the argument list
            argument types are: (cl::CommandQueue, std::vector<cl::Event, std::allocator<cl::Event>> *, cl::NDRange, cl::NDRange)
              valueOfImageKernel(cl::EnqueueArgs(valueOfImageQueue, &events, cl::NDRange(512), cl::NDRange(128)),

Does anyone have an idea what I'm doing wrong?


Solution

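  • The problem appears to be caused by the line below, which you don't need: queue.enqueueWriteBuffer(arrayFirst, CL_FALSE, 0, sizeOfArray, NULL, NULL, &arrayEvent); It passes NULL as the host pointer, so there is nothing to write; the call most likely fails (clEnqueueWriteBuffer rejects a NULL ptr) and arrayEvent is never set to a valid event, which in turn breaks the kernel launch that waits on it. You only need to write to a buffer when you actually have data to put there, which is not the case here.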

    Below is a full working example, tested on a Xeon Phi (error handling is omitted for clarity):

    #include <iostream>
    #include <vector>
    #include <string>
    
    #include <CL/cl.hpp>
    
    int main()
    {
        const char *kernel_str{
            "kernel void k1(global int *data1, global int *data2){"
            " int local_id = get_local_id(0);"
            " data1[local_id] = data2[local_id] + data2[local_id];"
            "}" };
    
        cl_int err = CL_SUCCESS;
    
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        cl::Platform plat;
        for (auto &p : platforms)
        {
            std::vector<cl::Device> devices;
            p.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
            if (!devices.empty())
            {
                plat = p;
                break;
            }
        }
        if (plat() == 0)
        {
            std::cout << "No OpenCL platform found.";
            return -1;
        }
    
        cl_context_properties properties[] =
        { CL_CONTEXT_PLATFORM, (cl_context_properties)(plat)(), 0 };
        cl::Context context(CL_DEVICE_TYPE_ACCELERATOR, properties);
    
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    
        cl::Program::Sources source(1, std::make_pair(kernel_str, strlen(kernel_str)));
        cl::Program program = cl::Program(context, source);
        err = program.build(devices);
    
        cl::CommandQueue queue(context, devices[0], 0, &err);
    
        size_t sizeOfArray = 512, sizeOfArraySecond = 512;
        std::vector<int> varrayFirst(sizeOfArray);
        std::vector<int> varraySecond(sizeOfArraySecond);
        for (size_t x = 0; x < sizeOfArraySecond; ++x)
            varraySecond[x] = x;
    
        cl::Buffer arrayFirst(context, CL_MEM_WRITE_ONLY, sizeOfArray*sizeof(varrayFirst[0]));
        cl::Buffer arraySecond(context, CL_MEM_READ_ONLY, sizeOfArraySecond*sizeof(varraySecond[0]));
    
        cl::Event arraySecondEvent;
    
        std::vector<cl::Event> events;
        err = queue.enqueueWriteBuffer(arraySecond, CL_FALSE, 0, sizeOfArraySecond*sizeof(varraySecond[0]), &varraySecond[0], NULL, &arraySecondEvent);
        events.push_back(arraySecondEvent);
    
        cl::make_kernel<cl::Buffer&, cl::Buffer&> kernel(program, "k1");
    
        cl::Event ev = kernel(cl::EnqueueArgs(queue, events, cl::NDRange(512), cl::NDRange(128)), arrayFirst, arraySecond);
    
        std::vector<cl::Event> evs(1, ev);
    
        err = queue.enqueueReadBuffer(arrayFirst, CL_TRUE, 0, sizeOfArray*sizeof(varrayFirst[0]), &varrayFirst[0], &evs); // final blocking read
    
        std::cout << "Outputting first 10 values: " << std::endl;
        for (int x = 0; x < 10; ++x)
            std::cout << varrayFirst[x] << ", ";
        std::cout << std::endl;
    
        return 0;
    }
    

    Output:

    Outputting first 10 values:
    0, 2, 4, 6, 8, 10, 12, 14, 16, 18,