Thrust - sorting member arrays of class object on gpu

Currently I am working on porting a molecular dynamics simulation program, which was written in plain cpu C++, to Cuda. In short, the program initialises a list of atoms, transfers the control to an object of class CCalc which calculates atomic forces, velocities and positions for 100 (or another number of) iterations, and finally returns to draw the atoms on the screen.

My goal is to have all compute-heavy functions in CCalc run on the gpu. To prevent having to copy all calculation constants in CCalc one by one, I decided to copy the whole class to device memory, pointed to by this__d. Since the drawing function is called from the cpu, the atom list needs to be copied between cpu and gpu every 100 iterations and as such is not a member of CCalc.

In function CCalc::refreshCellList(), I want to rearrange atoms__d (the atom list residing in device memory) such that all atoms in the same cell are grouped together. In other words, atoms__d needs to be sorted with cellId as keys.

As I don't want to waste time implementing my own sorting algorithm, I tried using thrust::sort_by_key(). And here's where I got stuck. The function thrust::sort_by_key() requires device_ptr objects as arguments; however I cannot access cellId since I can only cast this__d to device_ptr, which I can't dereference on the cpu.

Is there a way to do this without having to break down the "class on gpu" structure?

Here is (an excerpt of) my code:

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>

#define REFRESH_CELL_LISTS 20

struct Atom
{
    float pos[3];
    float vel[3];
    float force[3];
    // others
};

std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;

class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

class CCalc
{
private:
    CCalc *this__d;
public:
    const int nAtoms = noOfAtoms;
    int *cellId;
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    void relaxStructure(int numOfIterations)
    {
        cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
        cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    void refreshCellLists()
    {
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        // here is where I would like to use thrust::sort_by_key()
    }
};

__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        int X = (index * index) % C->nCellX;
        int Y = (index * index) % C->nCellY;

        C->cellId[index] = X + Y * C->nCellX;
    }
}

int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    while(true) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}

Thank you very much.

Solution

In other words, atoms__d needs to be sorted with cellId as keys.

It should be possible to do that, at your indicated point in the refreshCellLists method. For simplicity, I have chosen to use the raw device pointers directly (although we could easily wrap these raw device pointers in thrust::device_ptr also) combined with the thrust::device execution policy. Here is a worked example:

$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#define REFRESH_CELL_LISTS 20

struct Atom
{
    float pos[3];
    float vel[3];
    float force[3];
    // others
};

std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;

class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

class CCalc
{
private:
    CCalc *this__d;
public:
    const int nAtoms = noOfAtoms;
    int *cellId;
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));

        // some other stuff

        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    void relaxStructure(int numOfIterations)
    {
        cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
        cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    void refreshCellLists()
    {
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort atom__d array using cellId as keys;
        thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
    }
};

__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        int X = (index * index) % C->nCellX;
        int Y = (index * index) % C->nCellY;

        C->cellId[index] = X + Y * C->nCellX;
    }
}

int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);

    // initialise atom positions, velocities and forces

    CCalc calcObject;

    for (int i = 0; i < 100; i++) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

When building thrust codes, especially on windows, I usually make a set of recommendations as summarized here.