Currently I am working on porting a molecular dynamics simulation program, which was written in plain CPU C++, to CUDA. In short, the program initialises a list of atoms, transfers control to an object of class CCalc which calculates atomic forces, velocities and positions for 100 (or another number of) iterations, and finally returns to draw the atoms on the screen.
My goal is to have all compute-heavy functions in CCalc run on the GPU. To avoid copying the calculation constants in CCalc to the device one by one, I decided to copy the whole class to device memory, pointed to by this__d. Since the drawing function is called from the CPU, the atom list needs to be copied between CPU and GPU every 100 iterations, and as such it is not a member of CCalc.
In the function CCalc::refreshCellLists(), I want to rearrange atom__d (the atom list residing in device memory) so that all atoms in the same cell are grouped together. In other words, atom__d needs to be sorted with cellId as keys.
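To make the desired ordering concrete, here is a small made-up illustration with five atoms (the cell ids are invented for the example):

cellId (keys):           3  1  3  0  1
atom__d (values):        A0 A1 A2 A3 A4
after sorting by key:    A3 A1 A4 A0 A2   // cellId becomes 0 1 1 3 3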
As I don't want to waste time implementing my own sorting algorithm, I tried using thrust::sort_by_key(). And here is where I got stuck: as far as I can tell, thrust::sort_by_key() requires device_ptr objects as arguments; however, I cannot access cellId, since I can only cast this__d to a device_ptr, which I cannot dereference on the CPU.
Is there a way to do this without having to break down the "class on GPU" structure?
Here is (an excerpt of) my code:
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust\sort.h>
#include <thrust\device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
// here is where I would like to use thrust::sort_by_key()
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
while(true) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
Thank you very much.
In other words, atom__d needs to be sorted with cellId as keys.

It should be possible to do that at your indicated point in the refreshCellLists method. The key observation is that the host-side CCalc object already holds cellId as a raw pointer to device memory, so the host can pass that pointer (together with atom__d) directly to thrust without dereferencing this__d. For simplicity, I have chosen to use the raw device pointers directly, combined with the thrust::device execution policy (although we could just as easily wrap these raw pointers in thrust::device_ptr; see the sketch after the worked example). Here is a worked example:
$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#define REFRESH_CELL_LISTS 20

struct Atom
{
    float pos[3];
    float vel[3];
    float force[3];
    // others
};

std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;

class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);

class CCalc
{
private:
    CCalc *this__d;

public:
    const int nAtoms = noOfAtoms;
    int *cellId;
    const int nCellX = 4, nCellY = 3;
    // many force calculation constants

    CCalc()
    {
        cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
        // some other stuff
        cudaMalloc((void**)&this__d, sizeof(CCalc));
        cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
    }

    // destructor

    void relaxStructure(int numOfIterations)
    {
        cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
        cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);

        for(int iter = 0; iter < numOfIterations; iter++)
        {
            // stuff
            if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
            // calculate forces; update velocities and positions
        }

        cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
        cudaFree(atom__d);
    }

    // functions for force, velocity and position calculation

    void refreshCellLists()
    {
        makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
        cudaDeviceSynchronize();
        // sort the atom__d array using cellId as keys
        thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
    }
};

__global__ void makeCells(CCalc *C, Atom *A)
{
    int index = blockDim.x*blockIdx.x + threadIdx.x;
    if(index < C->nAtoms)
    {
        // determine cell x, y based on position
        // for now let's use an arbitrary mapping to obtain x, y
        int X = (index * index) % C->nCellX;
        int Y = (index * index) % C->nCellY;
        C->cellId[index] = X + Y * C->nCellX;
    }
}

int main()
{
    cudaSetDevice(0);

    noOfAtoms = 1000; // normally defined by input file
    atom.resize(noOfAtoms);
    // initialise atom positions, velocities and forces

    CCalc calcObject;

    for (int i = 0; i < 100; i++) // as long as we need
    {
        // draw atoms on screen
        calcObject.relaxStructure(100);
    }
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
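As mentioned above, instead of passing the thrust::device execution policy we could wrap the raw device pointers in thrust::device_ptr. A minimal sketch of that variant of the sort call, assuming the same cellId, atom__d and nAtoms as in the example above:

thrust::device_ptr<int>  keys(cellId);
thrust::device_ptr<Atom> vals(atom__d);
// dispatch to the device backend is inferred from the device_ptr arguments
thrust::sort_by_key(keys, keys + nAtoms, vals);

Either form performs the same device-side key-value sort; the execution-policy form simply avoids the extra wrapper objects.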
When building Thrust codes, especially on Windows, I usually make a set of recommendations, as summarized here.