Search code examples
gpuprofilingopenclsycldpc++

How to measure execution time of code in device+OpenCL+GPU


I try to measure the execution time of my code on CPU and GPU. for measuring the time on CPU, I used std::chrono::high_resolution_clock::now() and std::chrono::high_resolution_clock::now(), std::chrono::duration_caststd::chrono::nanoseconds(end - begin) and for measuring the time on GPU device, I read these links: 1- https://github.com/intel/pti-gpu/blob/master/chapters/device_activity_tracing/OpenCL.md 2- https://docs.oneapi.com/versions/latest/dpcpp/iface/event.html 3- https://developer.codeplay.com/products/computecpp/ce/guides/computecpp-profiler/step-by-step-profiler-guide?version=2.2.1 and so on so for... The problem is that, I confused and I can not understand how can I measure the execution time of code on GPU with using profiling. I do not know even where should I put in my code and I did lots of mistake. my code is:

#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
#include <CL/opencl.h>

using namespace tbb;

template<class Tin, class Tout, class Function>
class Map {
private:
    Function fun;
public:
    Map() {}
    Map(Function f):fun(f) {}

    std::vector<Tout> operator()(bool use_tbb, std::vector<Tin>& v) {
        std::vector<Tout> r(v.size());
        if(use_tbb) {
            // Start measuring time
            auto begin = std::chrono::high_resolution_clock::now();
            tbb::parallel_for(tbb::blocked_range<Tin>(0, v.size()),
                [&](tbb::blocked_range<Tin> t) {
                for(int index = t.begin(); index < t.end(); ++index) {
                    r[index] = fun(v[index]);
                }
            });
            // Stop measuring time and calculate the elapsed time
            auto end = std::chrono::high_resolution_clock::now();
            auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
            printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
            return r;
        } else {
            sycl::queue gpuQueue { sycl::gpu_selector() };
            sycl::range<1> n_item { v.size() };
            sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
            sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);

            //Try To use Profiling to measure the execution time of code on GPU device!
            cl_queue_properties props[3] = { CL_QUEUE_PROPERTIES,
                                             CL_QUEUE_PROFILING_ENABLE, 0 };
            cl_int status = CL_SUCCESS;
            cl_command_queue queue =clCreateCommandQueueWithProperties(context, device, props, &status);
            assert(status == CL_SUCCESS);
            cl_event event = nullptr;
            status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr,
                global_size, local_size, 0, nullptr, &event);
            assert(status == CL_SUCESS);
            status = clSetEventCallback(event, CL_COMPLETE, EventNotify, nullptr);
            assert(status == CL_SUCESS);
            void CL_CALLBACK EventNotify(cl_event event,
                cl_int event_status,
                void* user_data) {
                cl_int status = CL_SUCCESS;
                cl_ulong start = 0, end = 0;

                assert(event_status == CL_COMPLETE);

                status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                    sizeof(cl_ulong), &start, nullptr);
                assert(status == CL_SUCCESS);
                status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                    sizeof(cl_ulong), &end, nullptr);
                assert(status == CL_SUCCESS);
            }

            gpuQueue.submit([&](sycl::handler& h) {
                //local copy of fun
                auto f = fun;
                sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
                sycl::accessor out_accessor(out_buffer, h, sycl::write_only);
                h.parallel_for(n_item, [=](sycl::id<1> index) {
                    out_accessor[index] = f(in_accessor[index]);
                });
            }).wait();
        }
        return r;
    }
};

template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) {
    return Map<Tin, Tout, Function>(f);
}

//typedef int(*func)(int x);
//define different functions
/*
auto function = [](int x){ return x; };
auto functionTimesTwo = [](int x){ return (x*2); };
auto functionDivideByTwo = [](int x){ return (x/2); };
auto lambdaFunction = [](int x){return (++x);};
*/

int main(int argc, char *argv[]) {
    std::vector<int> v = { 1,2,3,4,5,6,7,8,9 };
    auto f = [](int x) {return (++x); };
    //Array of functions
    /*func functions[] =
        {
            function,
            functionTimesTwo,
            functionDivideByTwo,
            lambdaFunction
        };

    for(int i = 0; i< sizeof(functions); i++){
        auto m1 = make_map<int, int>(functions[i]);
*/
    auto m1 = make_map<int, int>(f);
    std::vector<int> r = m1(true, v);
    //print the result
    for(auto &e:r) {
        std::cout << e << " ";
    }
    return 0;
}

Solution

  • A good start is to format your code so you have consistent indentation. I have done that for you here. If you are using Visual Studio Community, select the text and press Ctrl+K and then Ctrl+F.

    Now to the profiling. Here is a simple Clock class that is easy to use for profiling:

    #include <chrono>
    class Clock {
    private:
        typedef chrono::high_resolution_clock clock;
        chrono::time_point<clock> t;
    public:
        Clock() { start(); }
        void start() { t = clock::now(); }
        double stop() const { return chrono::duration_cast<chrono::duration<double>>(clock::now()-t).count(); }
    };
    

    With that prepared, here is how you profile your GPU kernel:

    Clock clock;
    clock.start();
    status = clEnqueueNDRangeKernel(queue, kernel, dim, nullptr, global_size, local_size, 0, nullptr, &event);
    clFinish(queue); // important: this will block until kernel execution is finished
    std::cout << "Execution time: " << clock.stop() << " seconds" << std::endl;
    

    To get a more accurate measurement, you can measure kernel execution time multiple times in a loop and calculate the average.