I read this link https://github.com/intel/pti-gpu
and I tried to use Device Activity Tracing for OpenCL(TM), but I am confused and I do not know how should I measure the time on the accelerators with using Device Activity documentation. for measuring the performance of CPU I used chrono, but I am interested in to using profiling for measuring the performance of CPU and GPU in different devices. my program:
#include <CL/sycl.hpp>
#include <iostream>
#include <tbb/tbb.h>
#include <tbb/parallel_for.h>
#include <vector>
#include <string>
#include <queue>
#include<tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <chrono>
using namespace tbb;
template<class Tin, class Tout, class Function>
class Map {
private:
Function fun;
public:
Map() {}
Map(Function f):fun(f) {}
std::vector<Tout> operator()(bool use_tbb, std::vector<Tin>& v) {
std::vector<Tout> r(v.size());
if(use_tbb){
// Start measuring time
auto begin = std::chrono::high_resolution_clock::now();
tbb::parallel_for(tbb::blocked_range<Tin>(0, v.size()),
[&](tbb::blocked_range<Tin> t) {
for (int index = t.begin(); index < t.end(); ++index){
r[index] = fun(v[index]);
}
});
// Stop measuring time and calculate the elapsed time
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9);
return r;
} else {
sycl::queue gpuQueue{sycl::gpu_selector()};
sycl::range<1> n_item{v.size()};
sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);
gpuQueue.submit([&](sycl::handler& h){
//local copy of fun
auto f = fun;
sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
sycl::accessor out_accessor(out_buffer, h, sycl::write_only);
h.parallel_for(n_item, [=](sycl::id<1> index) {
out_accessor[index] = f(in_accessor[index]);
});
}).wait();
}
return r;
}
};
template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) { return Map<Tin, Tout, Function>(f);}
typedef int(*func)(int x);
//define different functions
auto function = [](int x){ return x; };
auto functionTimesTwo = [](int x){ return (x*2); };
auto functionDivideByTwo = [](int x){ return (x/2); };
auto lambdaFunction = [](int x){return (++x);};
int main(int argc, char *argv[]) {
std::vector<int> v = {1,2,3,4,5,6,7,8,9};
//auto f = [](int x){return (++x);};
//Array of functions
func functions[] =
{
function,
functionTimesTwo,
functionDivideByTwo,
lambdaFunction
};
for(int i = 0; i< sizeof(functions); i++){
auto m1 = make_map<int, int>(functions[i]);
//auto m1 = make_map<int, int>(f);
std::vector<int> r = m1(true, v);
//print the result
for(auto &e:r) {
std::cout << e << " ";
}
}
return 0;
}
First of all, SYCL Kernel wont support function pointers. So, you can change the code accordingly.
One way to achieve profiling in GPU is by following the steps below: 1.Enable profiling mode for the command queue of the target device 2.Introduce the event for the target device activity 3.Set the callback to be notified when the activity is completed 4.Read the profiling data inside the callback
Basically, you need to use CL_PROFILING_COMMAND_START and CL_PROFILING_COMMAND_END (command identified by event start and end execution on the device) inside the call back.
You can find the detailed steps here https://github.com/intel/pti-gpu/blob/master/chapters/device_activity_tracing/OpenCL.md
I would also advice you to check the samples for pti-gpu using Device Activity Tracing. Check the URL for the same https://github.com/intel/pti-gpu/tree/master/samples