I am trying to access a data structure with multiple levels of indirection on the GPU. The example hierarchy that I have now is A contains B contains C. Each contains data. A contains a pointer to B, B contains a pointer to C. When a heap-allocated data structure containing accessors is deallocated, the SYCL implementation segfaults in the accessors' destructors. When BView is destroyed, the segfault occurs.
I am using the ComputeCPP implementation for Ubuntu. This seems like a bug in the runtime because the buffers associated with the accessors in BView
are still valid at the time of BView's destruction. There are not any other errors thrown. I also tried leaking BView
to bypass the error. However, since BView's accessors hold a reference to the buffers for B and C, a deadlock results. This further indicates that the buffers referenced by BView's accessors are valid. Is it a violation of the SYCL spec to heap-allocate accessors or buffers? Maybe this could be causing the issues, since AView
deallocates without any problems.
#include "SYCL/sycl.hpp"
#include <vector>
#include <utility>
#include <iostream>
#include <memory>
struct C {
int m_cData;
C() : m_cData(0) {}
~C() {
std::cout << "C deallocating" << std::endl;
}
};
struct B {
int m_bData;
std::shared_ptr<C> m_c;
B() : m_bData(0), m_c(std::make_shared<C>()) {}
~B() {
std::cout << "B deallocating" << std::endl;
}
};
struct BBuff {
cl::sycl::buffer<B> m_bBuff;
cl::sycl::buffer<C> m_cBuff;
BBuff(const std::shared_ptr<B>& b) : m_bBuff(b, cl::sycl::range<1>(1)),
m_cBuff(b->m_c, cl::sycl::range<1>(1)) {}
~BBuff() {
std::cout << "BBuff deallocating" << std::endl;
}
};
template<cl::sycl::access::target target>
struct BView
{
cl::sycl::accessor<B, 1, cl::sycl::access::mode::read_write, target,
cl::sycl::access::placeholder::true_t> m_bDataAcc;
cl::sycl::accessor<C, 1, cl::sycl::access::mode::read_write, target,
cl::sycl::access::placeholder::true_t> m_cAcc;
BView(const std::shared_ptr<BBuff>& bBuff) : m_bDataAcc(bBuff->m_bBuff), m_cAcc(bBuff->m_cBuff)
{
}
void RequireForHandler(cl::sycl::handler& cgh) {
cgh.require(m_bDataAcc);
cgh.require(m_cAcc);
}
~BView()
{
std::cout << "BView deallocating" << std::endl;
}
};
struct A {
int m_aData;
std::shared_ptr<B> m_b;
A() : m_aData(0), m_b(std::make_shared<B>()) {}
~A()
{
std::cout << "A deallocating" << std::endl;
}
};
template<cl::sycl::access::target target>
struct ABuff {
cl::sycl::buffer<A> m_aBuff;
std::shared_ptr<BBuff> m_bBuff;
std::shared_ptr<BView<target>> m_bViewBuffData;
std::shared_ptr<cl::sycl::buffer<BView<target>>> m_bViewBuff;
ABuff(const std::shared_ptr<A>& a): m_aBuff(a, cl::sycl::range<1>(1)),
m_bBuff(std::make_shared<BBuff>(a->m_b)) {
m_bViewBuffData = std::make_shared<BView<target>>(m_bBuff);
m_bViewBuff = std::make_shared<cl::sycl::buffer<BView<target>>>(m_bViewBuffData, cl::sycl::range<1>(1));
}
~ABuff()
{
std::cout << "ABuff deallocating" << std::endl;
}
};
template<cl::sycl::access::target target>
struct AView {
cl::sycl::accessor<BView<target>, 1, cl::sycl::access::mode::read_write, target,
cl::sycl::access::placeholder::true_t> m_bAcc;
cl::sycl::accessor<A, 1, cl::sycl::access::mode::read_write, target,
cl::sycl::access::placeholder::true_t> m_aDataAcc;
ABuff<target>* m_aBuff;
AView(ABuff<target>* aBuff): m_bAcc(*aBuff->m_bViewBuff), m_aDataAcc(aBuff->m_aBuff),
m_aBuff(aBuff) {}
void RequireForHandler(cl::sycl::handler& cgh) {
m_aBuff->m_bViewBuffData->RequireForHandler(cgh);
cgh.require(m_bAcc);
cgh.require(m_aDataAcc);
}
};
class init_first_block;
int main(int argc, char** argv)
{
std::shared_ptr<A> a = std::make_shared<A>();
try
{
cl::sycl::queue workQueue;
ABuff<cl::sycl::access::target::global_buffer> aGlobalBuff(a);
AView<cl::sycl::access::target::global_buffer> aAccDevice(&aGlobalBuff);
workQueue.submit([&aAccDevice](cl::sycl::handler &cgh) {
aAccDevice.RequireForHandler(cgh);
cgh.single_task<class init_first_block>([aAccDevice]() {
aAccDevice.m_aDataAcc[0].m_aData = 1;
aAccDevice.m_bAcc[0].m_bDataAcc[0].m_bData = 2;
aAccDevice.m_bAcc[0].m_cAcc[0].m_cData = 3;
});
});
workQueue.wait();
}
catch (...)
{
std::cout << "Failure running nested accessor test" << std::endl;
}
std::cout << "A data: " << a->m_aData << std::endl;
std::cout << "B data: " << a->m_b->m_bData << std::endl;
std::cout << "C data: " << a->m_b->m_c->m_cData << std::endl;
return 0;
}
As mentioned above, there is a segfault when deallocating m_cAcc
in BView
. Here is the stacktrace From the looks of it, the entire memory of the shared_ptr in the accessor to the buffer being accessed (m_cBuff) is invalid (not the memory pointed to, the actual data in the shared_ptr including the count). How can this be? BView
is not deallocated multiple times, copied, moved etc.
I gave a presentation about some experiments in this area long time ago that explains the problem of the different memory views between host and device https://github.com/keryell/ronan/blob/gh-pages/Talks/2016/2016-03-13-PPoPP-SYCL-triSYCL/2016-03-13-PPoPP-SYCL-triSYCL-expose.pdf
More interestingly, Intel has a recent proposal to solve this that you could look at/contribute to: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc