Given two key-value lists, I am trying to combine the two sides by matching the keys and applying a function to the two values when the keys match. In my case I want to multiply the values. A small example to make it more clear:
Left keys: { 1, 2, 4, 5, 6 }
Left values: { 3, 4, 1, 2, 1 }
Right keys: { 1, 3, 4, 5, 6, 7 };
Right values: { 2, 1, 1, 4, 1, 2 };
Expected output keys: { 1, 4, 5, 6 }
Expected output values: { 6, 1, 8, 1 }
I have been able to implement this on the CPU using C++ using the next code:
int main() {
int leftKeys[5] = { 1, 2, 4, 5, 6 };
int leftValues[5] = { 3, 4, 1, 2, 1 };
int rightKeys[6] = { 1, 3, 4, 5, 6, 7 };
int rightValues[6] = { 2, 1, 1, 4, 1, 2 };
int leftIndex = 0, rightIndex = 0;
std::vector<std::tuple<int, int>> result;
while (leftIndex < 5 && rightIndex < 6) {
if (leftKeys[leftIndex] < rightKeys[rightIndex]) {
leftIndex++;
}
if (leftKeys[leftIndex] > rightKeys[rightIndex]) {
rightIndex++;
}
result.push_back(std::make_tuple(leftKeys[leftIndex], leftValues[leftIndex] * rightValues[rightIndex]));
leftIndex++;
rightIndex++;
}
// Print results
for (int i = 0; i < result.size(); i++) {
std::cout << "Key: " << std::get<0>(result[i]) << "; Value: " << std::get<1>(result[i]) << "\n";
}
}
However, I have the input keys and values in Thrust's device_vector
s and I need the results on the GPU as well. Therefore it would be more efficient if I did not need to copy all inputs to the host and all outputs back to the device.
The problem is that I cannot find a Thrust function that can be used to combine two lists using a set of keys (and apply a function to both values). Does such a function exist or is there an easy way to implement it myself of should I just do this on the host?
Update:
The following assumptions can be made about the input:
Update 2:
While implementing the second approach in @Robert's answer I get stuck at the transformation. My code so far is below:
struct multiply_transformation : public thrust::binary_function<std::tuple<int, int>, std::tuple<int, int>, std::tuple<int, int>>
{
__host__ __device__
thrust::tuple<int, int> operator()(thrust::tuple<int, int> d_left, thrust::tuple<int, int> d_right)
{
if (thrust::get<0>(d_left) == thrust::get<0>(d_right)) {
return thrust::make_tuple(thrust::get<0>(d_left), thrust::get<1>(d_left) * thrust::get<1>(d_right));
}
return thrust::make_tuple(-1, -1);
}
};
thrust::device_vector<int> d_mergedKeys(h_leftCount + h_rightCount);
thrust::device_vector<int> d_mergedValues(h_leftCount + h_rightCount);
thrust::merge_by_key(d_leftCountKeys.begin(), d_leftCountKeys.begin() + h_leftCount,
d_rightCountKeys.begin(), d_rightCountKeys.begin() + h_rightCount,
d_leftCounts.begin(), d_rightCounts.begin(), d_mergedKeys.begin(), d_mergedValues.begin());
typedef thrust::tuple<int, int> IntTuple;
thrust::zip_iterator<IntTuple> d_zippedCounts(thrust::make_tuple(d_mergedKeys.begin(), d_mergedValues.begin()));
thrust::zip_iterator<IntTuple> d_zippedCountsOffset(d_zippedCounts + 1);
multiply_transformation transformOperator;
thrust::device_vector<IntTuple> d_transformedResult(h_leftCount + h_rightCount);
thrust::transform(d_zippedCounts, d_zippedCounts + h_leftCount + h_rightCount - 1, d_zippedCountsOffset, d_transformedResult.begin(), transformOperator);
However, I get the error that no overloaded function thrust::transform
matches the argument list. In the above code h_leftCount
and h_rightCount
are the sizes of the left and right inputs. d_leftCountKeys
, d_rightCountKeys
, d_leftCounts
, and d_rightCounts
are thrust::device_vector<int>
.
Well, I'm not sure this is the best method (@m.s. usually comes up with better approaches than I), but one possible approach would be (method 1):
I don't know what your skill level is with thrust but I can provide a trivial worked example of the above if desired.
Another possible approach (method 2):
My sense is the second method might be faster, but I haven't carefully thought through it. In any event, it's better to benchmark test cases than to work off of (my) intuition.
Based on a comment below, here is a description of what is happening starting with the 2nd step of method 2, using your example dataset:
The output of step 1 (the merge_by_key operation) would look like something like this:
keys: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
Let's construct two versions, the first being "the item" and the second being "the next item to the right":
keys1: { 1, 1, 2, 3, 4, 4, 5, 5, 6, 6 };
values1: { 3, 2, 4, 1, 1, 1, 2, 4, 1, 1 };
keys2: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values2: { 2, 4, 1, 1, 1, 2, 4, 1, 1, 2 };
The actual "construction" is trivial. keys1 is just [keys.begin(), keys.end()-1), and keys2 is just [keys.begin()+1, keys.end()). And likewise for values1 and values2.
We'll zip keys1 and values1 together and we'll zip keys2 and values2 together. Then we'll pass these two zipped entities to a transform operation that has a special functor that will do the following:
If keys1 == keys2, do the desired math operation on the values1 and values2 quantities, and place a one in the marker array. If not, place a 0 in a marker array. The output of this operation would be:
keys: { 1, 2, 3, 4, 4, 5, 5, 6, 6, 7 };
values: { 6, 4, 1, 1, 1, 8, 4, 1, 1, 2 };
marker: { 1, 0, 0, 1, 0, 1, 0, 1, 0, 0 };
Now zip the 3 vectors above together, and pass that to remove_if. The remove_if functor would indicate removal of any items for which marker == 0, leaving:
keys: { 1, 4, 5, 6 };
values: { 6, 1, 8, 1 };
marker: { 1, 1, 1, 1 };
Here is a fully worked example demonstrating both methods:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/set_operations.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
#include <assert.h>
struct mark_mpy_func
{
template <typename T1, typename T2>
__host__ __device__
int operator()(T1 &z1, T2 &z2){
int res = 0;
if (thrust::get<0>(z1) == thrust::get<0>(z2)){
res = thrust::get<1>(z1) * thrust::get<1>(z2);
thrust::get<2>(z1) = 1;}
return res;
}
};
struct mtest_func
{
__host__ __device__
bool operator()(int t){
if (t == 0) return true;
return false;
}
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 1
thrust::device_vector<int> Lkeysvo(Lsize);
thrust::device_vector<int> Lvalsvo(Lsize);
thrust::device_vector<int> Rkeysvo(Rsize);
thrust::device_vector<int> Rvalsvo(Rsize);
size_t Lsizeo = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Lkeysvo.begin(), Lvalsvo.begin()).first - Lkeysvo.begin();
size_t Rsizeo = thrust::set_intersection_by_key(Rkeysv.begin(), Rkeysv.end(), Lkeysv.begin(), Lkeysv.end(), Rvalsv.begin(), Rkeysvo.begin(), Rvalsvo.begin()).first - Rkeysvo.begin();
assert(Lsizeo == Rsizeo);
thrust::device_vector<int> res1(Lsizeo);
thrust::transform(Lvalsvo.begin(), Lvalsvo.begin()+Lsizeo, Rvalsvo.begin(), res1.begin(), thrust::multiplies<int>());
std::cout << "Method 1 result:" << std::endl << "keys: ";
thrust::copy_n(Lkeysvo.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res1.begin(), Lsizeo, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> marker(Lsize + Rsize - 1);
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin(), marker.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1, marker.end())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), marker.begin(), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 1 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$
If it is acceptable to use a marker value (say, -1) in the output data to inform the remove_if operation, then the separate marker array can be dispensed with. Here's a modified version of method 2 that works this way:
$ cat t1007.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/merge.h>
#include <thrust/remove.h>
#define MARK_VAL -1
struct mark_mpy_func
{
template <typename T1, typename T2>
__host__ __device__
int operator()(T1 &z1, T2 &z2){
int res = MARK_VAL;
if (thrust::get<0>(z1) == thrust::get<0>(z2)){
res = thrust::get<1>(z1) * thrust::get<1>(z2);}
return res;
}
};
struct mtest_func
{
template <typename T>
__host__ __device__
bool operator()(T t){
if (thrust::get<1>(t) == MARK_VAL) return true;
return false;
}
};
int main(){
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector<int> Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector<int> Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector<int> Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector<int> Rvalsv(Rvals, Rvals+Rsize);
// method 2
thrust::device_vector<int> Mkeysv(Lsize + Rsize);
thrust::device_vector<int> Mvalsv(Lsize + Rsize);
thrust::merge_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), Lvalsv.begin(), Rvalsv.begin(), Mkeysv.begin(), Mvalsv.begin());
thrust::device_vector<int> res2(Lsize + Rsize - 1);
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), Mvalsv.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, Mvalsv.end()-1)), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin()+1, Mvalsv.begin()+1)), res2.begin(), mark_mpy_func());
size_t rsize2 = thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple( Mkeysv.begin(), res2.begin())), thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.end()-1, res2.end())), mtest_func()) - thrust::make_zip_iterator(thrust::make_tuple(Mkeysv.begin(), res2.begin()));
std::cout << "Method 2 result:" << std::endl << "keys: ";
thrust::copy_n(Mkeysv.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl << "vals: ";
thrust::copy_n(res2.begin(), rsize2, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1007 t1007.cu
$ ./t1007
Method 2 result:
keys: 1,4,5,6,
vals: 6,1,8,1,
$