I have two index sets, one in the range [0, N], one in the range [0, M], where N != M. The indices are used to refer to values in different thrust::device_vector
s.
Essentially, I want to create one GPU thread for every combination of these indices, so N*M threads. Each thread should compute a value based on the index-combination and store the result in another thrust::device_vector
, at a unique index also based on the input combination.
This seems to be a fairly standard problem, but I was unable to find a way to do this in thrust. The documentation only ever mentions problems, where element i of a vector needs to compute something with element i of another vector. There is the thrust::permutation_iterator
, but as far as I understand it only gives me the option to reorder data, and I have to specify the order as well.
Some code:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// Create functor I want to call on every combination
struct myFunctor
{
thrust::device_vector<int> const& m_vec1;
thrust::device_vector<int> const& m_vec2;
thrust::device_vector<int>& m_result;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
: m_vec1(vec1), m_vec2(vec2), m_result(result)
{
}
__host__ __device__
void operator()(size_t i, size_t j) const
{
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
}
} func(vec1_dev, vec2_dev, result_dev);
// How do I create N*M threads, each of which calls func(i, j) ?
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
I'm fairly sure this is very easy to achieve, I guess I'm just missing the right search terms. Anyways, all help appreciated :)
Presumably in your functor operator instead of this:
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec1[j];
^ ^
you meant this:
m_result[i + j * m_vec1.size()] = m_vec1[i] + m_vec2[j];
^ ^
I think there are probably many ways to tackle this, but so as to not argue about things that are not germane to the question, I'll try and stay as close to your given code as I can.
Operations like []
on a vector are not possible in device code. Therefore we must convert your functor to work on raw data pointers, rather than thrust vector operations directly.
With those caveats, and a slight modification in how we handle your i
and j
indices, I think what you're asking is not difficult.
The basic strategy is to create a result vector that is of length N*M
just as you suggest, then create the indices i
and j
within the functor operator. In so doing, we need only pass one index to the functor, using e.g. thrust::transform
or thrust::for_each
to create our output:
$ cat t79.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>
#include <iostream>
struct myFunctor
{
const int *m_vec1;
const int *m_vec2;
int *m_result;
size_t v1size;
myFunctor(thrust::device_vector<int> const& vec1, thrust::device_vector<int> const& vec2, thrust::device_vector<int>& result)
{
m_vec1 = thrust::raw_pointer_cast(vec1.data());
m_vec2 = thrust::raw_pointer_cast(vec2.data());
m_result = thrust::raw_pointer_cast(result.data());
v1size = vec1.size();
}
__host__ __device__
void operator()(const size_t x) const
{
size_t i = x%v1size;
size_t j = x/v1size;
m_result[i + j * v1size] = m_vec1[i] + m_vec2[j];
}
};
int main()
{
// Initialize some data
const int N = 2;
const int M = 3;
thrust::host_vector<int> vec1_host(N);
thrust::host_vector<int> vec2_host(M);
vec1_host[0] = 1;
vec1_host[1] = 5;
vec2_host[0] = -3;
vec2_host[1] = 42;
vec2_host[2] = 9;
// Copy to device
thrust::device_vector<int> vec1_dev = vec1_host;
thrust::device_vector<int> vec2_dev = vec2_host;
// Allocate device memory to copy results to
thrust::device_vector<int> result_dev(vec1_host.size() * vec2_host.size());
// How do I create N*M threads, each of which calls func(i, j) ?
thrust::for_each_n(thrust::device, thrust::counting_iterator<size_t>(0), (N*M), myFunctor(vec1_dev, vec2_dev, result_dev));
// Copy results back
thrust::host_vector<int> result_host = result_dev;
for(int i : result_host)
std::cout << i << ", ";
std::cout << std::endl;
// Expected output:
// -2, 2, 43, 47, 10, 14
return 0;
}
$ nvcc -std=c++11 -arch=sm_61 -o t79 t79.cu
$ ./t79
-2, 2, 43, 47, 10, 14,
$
In retrospect, I think this is more or less exactly what @eg0x20 was suggesting.