thrust::sort_by_key() does not work for zip_iterator value

I am trying to use sort_by_key() to sort key-value pairs, where the value is a zip_iterator. But using this, only the keys are being sorted and not the value.

Here is my example code

#include <stdio.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/tuple.h>
#include <thrust/device_vector.h>

typedef thrust::device_vector<int>::iterator W_itr;
typedef thrust::tuple<W_itr, W_itr> W_itr_tuple;
typedef thrust::tuple<int, int> W_tuple;
struct Min_op
{
    __host__ __device__ 
    W_tuple operator()(const W_tuple& a, const W_tuple& b) const
    {
        int aw = thrust::get<0>(a), bw = thrust::get<0>(b);
        return aw < bw ? a : b;
    }
};

void printArray(int* a, int size)
{
    printf("[ %d", a[0]);
    for (int i = 1;i < size;i++)
    {
        printf(",\t%d", a[i]);
    }
    printf(" ]");
}


int main()
{
    int a[5] = { 5, 4, 1, 4, 4 },
        b[5] = { 1, 4, 2, 5, 6 },
        c[5] = { 10, 11, 12,13,14 };

    int* da, * db, * dc, * da_copy, * db_copy, * dc_copy;
    int size = sizeof(int) * 5;
    cudaMalloc(&da, size);
    cudaMalloc(&db, size);
    cudaMalloc(&dc, size);
    cudaMalloc(&da_copy, size);
    cudaMalloc(&db_copy, size);
    cudaMalloc(&dc_copy, size);
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice);

    thrust::device_ptr<int> da_ptr(da), db_ptr(db), dc_ptr(dc), db_copy_ptr(db_copy), dc_copy_ptr(dc_copy);
    thrust::device_vector<int> b_vec(db, db + 5), c_vec(dc, dc + 5), b_vec_copy(db_copy_ptr, db_copy_ptr+5), c_vec_copy(dc_copy_ptr, dc_copy_ptr+5);

    thrust::zip_iterator<W_itr_tuple> zip1(thrust::make_tuple(b_vec.begin(), c_vec.begin())), 
        zip2(thrust::make_tuple(b_vec_copy.begin(), c_vec_copy.begin())); // Question: Any easier way to make a zip_iterator with only raw pointers on device?

    thrust::sort_by_key(da, da + 5, zip1);


    cudaMemcpy(a, da, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, db, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

    printf("After Sort\n a = ");
    printArray(a, 5);
    printf("\n b = ");
    printArray(b, 5); // Question: Order has not changed at all. How to solve this?
    printf("\n c = ");
    printArray(c, 5); // Question: Order has not changed at all. How to solve this?

    // auto result = thrust::reduce_by_key
    // (
    //  da, da + 5, 
    //  zip1, 
    //  da_copy,
    //  zip2, 
    //  thrust::equal_to<int>(), 
    //  Min_op()
    // );
    // size = (result.first - da_copy) * sizeof(int); // Question: I am unable to get this compiled. Basically I want to use the zip_iterator to reduce using my custom operator. How to get this done?

    cudaMemcpy(a, da_copy, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, db_copy, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(c, dc_copy, size, cudaMemcpyDeviceToHost);

    printf("\n\nAfter Reduce\n a = ");
    printArray(a, 5);
    printf("\n b = ");
    printArray(b, 5);
    printf("\n c = ");
    printArray(b, 5);

    return 0;
}

And the output that I obtain here is

After Sort
 a = [ 1,       4,      4,      4,      5 ]
 b = [ 1,       4,      2,      5,      6 ]
 c = [ 10,      11,     12,     13,     14 ]

...

As you can see, a is sorted correctly, but the b and c do not change as I expected them to.

I have put my queries in comments, which are the following:

Assuming that my implementation is wrong, what is the correct/best way to declare a zip_iterator from raw device pointers?
How to get the zip_iteration sorted?
How to use the reduce_by_key() function as I intend to? If I uncomment my code, it does not compile?

Solution

There are three main problems with your code:

Vectors are owning containers, so thrust::device_vector<int> b_vec(db, db + 5) generates a copy, on which you work. Later when transferring the result to the host, you don't use this copy, but the original, unsorted data instead.
The types of fancy iterators are often complicated and easy to screw up. Use factory functions like thrust::make_zip_iterator and auto instead to create easier to read code with less pitfalls.
When using Thrust functionality you need to consistently use the wrapped thrust::device_ptr variables instead of the raw pointers. Not doing so can cause a dispatch to the CPU which will then cause runtime errors due to the pointers pointing to inaccessible device memory.

In the following you can find the fixed code (+ error checking and some stylistic C++ defaults):

#include <cstdio>

#include <thrust/device_ptr.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}



struct Min_op
{
    using W_tuple = thrust::tuple<int, int>;

    __host__ __device__ 
    W_tuple operator()(const W_tuple& a, const W_tuple& b) const
    {
        int aw = thrust::get<0>(a);
        int bw = thrust::get<0>(b);
        return aw < bw ? a : b;
    }
};

void printArray(int* a, int size)
{
    std::printf("[ %d", a[0]);
    for (int i = 1;i < size;i++)
    {
        std::printf(",\t%d", a[i]);
    }
    std::printf(" ]");
}

int main()
{
    int a[] = { 5, 4, 1, 4, 4 };
    int b[] = { 1, 4, 2, 5, 6 };
    int c[] = { 10, 11, 12, 13, 14 };
    
    int* da{}; int* db{}; int* dc{};
    int* da_copy{};
    int* db_copy{};
    int* dc_copy{};

    static_assert(sizeof(a) == sizeof(b) && sizeof(b) == sizeof(c));
    constexpr int size = sizeof(a);
    constexpr int n_elements = size / sizeof(a[0]);

    gpuErrchk(cudaMalloc(&da, size));
    gpuErrchk(cudaMalloc(&db, size));
    gpuErrchk(cudaMalloc(&dc, size));
    gpuErrchk(cudaMalloc(&da_copy, size));
    gpuErrchk(cudaMalloc(&db_copy, size));
    gpuErrchk(cudaMalloc(&dc_copy, size));

    gpuErrchk(cudaMemcpy(da, a, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice));

    using dptr = thrust::device_ptr<int>;
    dptr da_ptr{da}; dptr db_ptr{db}; dptr dc_ptr{dc};
    dptr da_copy_ptr{da_copy};
    dptr db_copy_ptr{db_copy};
    dptr dc_copy_ptr{dc_copy};

    auto zip1 = thrust::make_zip_iterator
    (
      thrust::make_tuple(db_ptr, dc_ptr)
    );
    auto zip2 = thrust::make_zip_iterator
    (
      thrust::make_tuple(db_copy_ptr, dc_copy_ptr)
    );

    thrust::sort_by_key(da, da + n_elements, zip1);

    gpuErrchk(cudaMemcpy(a, da, size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(b, db, size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost));

    std::printf("After Sort\n a = ");
    printArray(a, n_elements);
    std::printf("\n b = ");
    printArray(b, n_elements);
    std::printf("\n c = ");
    printArray(c, n_elements);

    auto result = thrust::reduce_by_key
    (
      da_ptr, da_ptr + n_elements, 
      zip1, 
      da_copy_ptr,
      zip2, 
      thrust::equal_to<int>(), 
      Min_op()
    );
    auto new_n_elements = thrust::distance(da_copy_ptr, result.first);
    auto new_size = new_n_elements * sizeof(a[0]);

    gpuErrchk(cudaMemcpy(a, da_copy, new_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(b, db_copy, new_size, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(c, dc_copy, new_size, cudaMemcpyDeviceToHost));

    std::printf("\n\nAfter Reduce\n a = ");
    printArray(a, new_n_elements);
    std::printf("\n b = ");
    printArray(b, new_n_elements);
    std::printf("\n c = ");
    printArray(b, new_n_elements);

    return 0;
}