Is it possible to copy a class that contains pointers to its own members using SYCL and offload it to the graphics card? Basically, I am trying to reference members to avoid unnecessary memory usage. I managed to get my example running locally, but when offloading to a graphics card the following errors occurred:
Value: 700
Description: an illegal memory access was encountered
Function: operator()
Source Location: /root/intel-llvm-mirror/sycl/plugins/cuda/pi_cuda.cpp:2612
SYCL exception: Native API failed. Native API returns: -999 (Unknown PI error) -999 (Unknown PI error)
Value: 700
Description: an illegal memory access was encountered
Function: wait
Source Location: /root/intel-llvm-mirror/sycl/plugins/cuda/pi_cuda.cpp:653
Value: 700
Description: an illegal memory access was encountered
Function: wait
Source Location: /root/intel-llvm-mirror/sycl/plugins/cuda/pi_cuda.cpp:653
Value: 700
Description: an illegal memory access was encountered
Function: get_next_transfer_stream
Source Location: /root/intel-llvm-mirror/sycl/plugins/cuda/pi_cuda.cpp:513
I understand that this error is related to copying a memory pointer to another device, but given that I've provided an explicit copy constructor for the class, I would expect the copy to be a deep copy. Does SYCL fundamentally lack support for copying an object that contains pointers to a graphics card? I've attached my code below:
#include <array>
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>
using namespace sycl; // (optional) avoids the need for "sycl::" before SYCL names
/// Single integer payload stored in the arrays of MyClass.
///
/// No copy constructor is declared on purpose: the implicitly generated one
/// performs the same member-wise copy and keeps the type trivially copyable,
/// which a user-provided copy constructor would not.
struct my1 {
    int value = 1;

    // Default constructor: `value` keeps its in-class initializer (1).
    my1() = default;

    // Constructor with a value. Deliberately not `explicit`, so that
    // `std::array<my1, N>{1, 2, 3}` style initialization keeps working.
    my1(int v) : value(v) {}
};
// Pair of raw, non-owning pointers to two my1 objects.
// MyClass points `a` at an element of its `_a` array and `b` at the
// element of `_b` with the same index.
struct my2 {
    struct my1* a;  // first referenced my1
    struct my1* b;  // second referenced my1
};
template <std::size_t N>
class MyClass {
MyClass(std::array<my1, N> a, std::array<my1, N> b) {
for (std::size_t i = 0; i < N; i++) {
_a[i].value = a[i].value;
_b[i].value = b[i].value;
_c[i].a = &_a[i];
_c[i].b = &_b[i];
float addValue(int i) const {
if (i >= N || i >= N)
// throw std::runtime_error("Index out of bounds");
return 0;
return (_c[i].a)->value + (_c[i].b)->value;
void modifyAValue(int i, float a) {
if (i >= N)
_a[i].value = a;
void modifyBValue(int i, float b) {
if (i >= N)
_b[i].value = b;
MyClass(const MyClass& other) {
_c = std::array<my2, N>();
for (std::size_t i = 0; i < N; i++) {
_a[i].value = other._a[i].value;
_b[i].value = other._b[i].value;
_c[i].a = &_a[i];
_c[i].b = &_b[i];
std::array<my1, N> _a;
std::array<my1, N> _b;
std::array<my2, N> _c;
// Declares every MyClass<N> to be device copyable for SYCL by specializing
// the trait to std::true_type.
// NOTE(review): this only tells SYCL that the type may be copied to the
// device; it does not by itself make the my2 pointers stored inside MyClass
// refer to device memory -- confirm against the SYCL device-copyable rules.
template <std::size_t N>
struct sycl::is_device_copyable<MyClass<N>> : std::true_type {};
int main() {
// Create a SYCL queue
sycl::queue myQueue;
// Create an instance of MyClass
int N = 10;
std::array<my1, 10> a{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::array<my1, 10> b{10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
MyClass<a.size()> myObject(a, b);
std::vector<int> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
std::vector<float> result(input.size());
// Create a SYCL buffer to hold the result
sycl::buffer<float, 1> resultBuffer(, sycl::range<1>(input.size()));
sycl::buffer<int, 1> inputBuffer(, sycl::range<1>(input.size()));
sycl::buffer<MyClass<10>> buffer(&myObject, 1);
try {
// Submit a SYCL kernel
myQueue.submit([&](sycl::handler& cgh) {
auto resultAcc = resultBuffer.template get_access<sycl::access::mode::write>(cgh);
auto inputAcc = inputBuffer.template get_access<sycl::access::mode::read>(cgh);
auto myObjectAcc = buffer.template get_access<sycl::access::mode::read>(cgh);
cgh.parallel_for<class MyKernel>(
sycl::range<1>(input.size()), [=](sycl::id<1> idx) {
// Call a member function on the instance of MyClass
resultAcc[idx] = myObjectAcc[0].addValue(inputAcc[idx]);
// Wait for the kernel to complete and get the result
} catch (const sycl::exception& e) {
std::cerr << "SYCL exception: " << e.what() << std::endl;
return 1;
} catch (const std::runtime_error& e) {
std::cerr << "Runtime error: " << e.what() << std::endl;
return 1;
// Print the result (accumulated sum)
float sum = 0.0f;
for (float val : result) {
std::cout << val << std::endl;
sum += val;
std::cout << "Result: " << sum << std::endl;
return 0;
In this example, I have a class that contains three arrays. Two of these arrays store information (my1), and the third one stores references to elements from the two information arrays (my2, which holds two pointers to my1 objects).
std::array<my1, N> _a;
std::array<my1, N> _b;
std::array<my2, N> _c;
As I understand it, the errors occur because the kernel is attempting to access host memory. This is unexpected, as the copy constructor should have reassigned the pointer values after the copy operation:
// Copy constructor of MyClass (excerpt): copies the values element by
// element and points every `_c[i]` at this object's own `_a[i]` / `_b[i]`
// rather than at the elements of `other`.
MyClass(const MyClass& other) {
    _c = std::array<my2, N>();
    for (std::size_t i = 0; i < N; i++) {
        _a[i].value = other._a[i].value;
        _b[i].value = other._b[i].value;
        _c[i].a = &_a[i];
        _c[i].b = &_b[i];
    }
}
I wouldn't mind using USM to allocate device memory, but I would prefer to use pointers or references to access the member variables inside the class. This approach helps avoid unnecessary memory allocation by referencing device memory directly (in short, I'd like to use pointers or references as member variables to access device memory).
After some research, I found that it is impossible to get this kind of data into SYCL device code without using USM or buffers. Below is the example code I ended up with for this question:
template <std::size_t N>
class MyClass {
MyClass(std::array<int, N> aValue, std::array<int, N> bValue,sycl::queue& queue): _queue(queue) {
_a = sycl::malloc_shared<my1*>(N, _queue);
_b = sycl::malloc_shared<my1*>(N, _queue);
_c = sycl::malloc_shared<my2*>(N, _queue);
for(int i = 0;i<N;i++)
_a[i] = sycl::malloc_shared<my1>( 1, _queue);
_a[i]->value = aValue[i];
_b[i] = sycl::malloc_shared<my1>( 1, _queue);
_b[i]->value = bValue[i];
_c[i] = sycl::malloc_shared<my2>( 1, _queue);
_c[i]->a = _a[i];
_c[i]->b = _b[i];
float addValue(int i) const {
if (i >= N || i >= N)
// throw std::runtime_error("Index out of bounds");
return 0;
return (_c[i]->a)->value + (_c[i]->b)->value;
void modifyAValue(int i, float a) {
if (i >= N)
_a[i]->value = a;
void modifyBValue(int i, float b) {
if (i >= N)
_b[i]->value = b;
MyClass(const MyClass& other) {
for (int i = 0;i<N;i++)
_a[i]->value = other._a[i]->value;
_b[i]->value = other._b[i]->value;
~MyClass() {
for (int i = 0;i<N;i++)
sycl::free(_a[i], _queue);
sycl::free(_b[i], _queue);
sycl::free(_c[i], _queue);
sycl::free(_a, _queue);
sycl::free(_b, _queue);
sycl::free(_c, _queue);
my1** _a;
my1** _b;
my2** _c;
sycl::queue& _queue;