I have the following problem (keep in mind that I am fairly new to programming with CUDA). I have a class called vec3f that is just like the float3 data type but with overloaded operators and other vector functions. These functions are prefixed with __device__ __host__. Then, in my kernel I do a nested for loop over block_x and block_y indices and do something like,
//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();
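(vec3f.h isn't shown here; for reference, a trimmed-down sketch of the kind of class I mean — the real header has more operators and functions:)

class vec3f {
public:
    float x, y, z;
    __host__ __device__ vec3f() : x(0), y(0), z(0) {}
    __host__ __device__ vec3f(float a, float b, float c) : x(a), y(b), z(c) {}
    __host__ __device__ vec3f operator*(float s) const { return vec3f(x*s, y*s, z*s); }
    __host__ __device__ vec3f operator+(const vec3f& o) const { return vec3f(x+o.x, y+o.y, z+o.z); }
};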
In the shared-memory code above, oldParticles is a pointer to a class called particle that is being passed to the kernel. oldParticles is actually the underlying pointer of a thrust::device_vector<particle> (I'm not sure if this has something to do with it). Everything compiles okay, but when I run I get the error
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
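(That abort is an uncaught exception thrown on the host; Thrust reports CUDA errors as exceptions derived from std::runtime_error, so wrapping the Thrust calls in a try/catch at least surfaces the message — a minimal sketch, using the same variable names as below:)

try {
    d_old_particles = h_particles;   // Thrust host-to-device copy can throw
    // ... kernel launch and device-to-host copy ...
} catch (const std::exception& e) {
    std::cerr << "caught exception: " << e.what() << std::endl;
}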
Thanks for the replies. I think the error had to do with me not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed this error,
particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );
The kernel call is then,
force_kernel<<< grid,block,sharedMemSize >>>(particle_ptrs[0],particle_ptrs[1],time_step);
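To make sure the launch itself succeeds, I also check for errors right after the call, something like this (using the standard CUDA error-query calls):

// right after the kernel call above
cudaError_t err = cudaGetLastError();        // reports launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();           // reports errors raised while the kernel ran
if (err != cudaSuccess)
    std::cerr << cudaGetErrorString(err) << std::endl;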
The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with me not being familiar with Thrust.
I'm doing a series of copies as follows,
//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified and so shouldnt d_new_particles?
//copy back
h_particles = d_new_particles;
So I guess my question is: can I modify a Thrust device_vector in a kernel (in this case particle_ptrs[0]), save the modification to another Thrust device_vector in the kernel (in this case particle_ptrs[1]), and then, once I exit from the kernel, copy that to a host_vector?
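(What I expect, in miniature, is a round trip like this — a toy sketch with an int vector rather than my particle class:)

__global__ void double_vals(int* p, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] *= 2;                    // modify device data through the raw pointer
}

// host side:
thrust::device_vector<int> d_vec(16, 1);     // allocated and initialized by Thrust
double_vals<<<1,16>>>(thrust::raw_pointer_cast(d_vec.data()), 16);
cudaDeviceSynchronize();
thrust::host_vector<int> h_vec = d_vec;      // h_vec[i] should now be 2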
I still can't get this to work. I made a shorter example where I am having the same problem,
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
    particle() :
        _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
    };
    particle(const vec3f& pos, const vec3f& vel) :
        _position(pos), _velocity(vel), _density(0.0) {
    };
    vec3f _velocity;
    vec3f _position;
    float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];
int main() {
    //load host vector
    for (int i = 0; i < max_particles; i++) {
        h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
    }
    particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
    particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
    cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
    cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
    //copy host particles to old device particles...
    old_parts = h_parts;
    //kernel block and grid dimensions
    dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
    dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
    kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
    //copy new device particles back to host particles
    h_parts = new_parts;
    for (int i = 0; i < max_particles; i++) {
        particle temp1 = h_parts[i];
        cout << temp1._position << endl;
    }
    //delete thrust device vectors
    old_parts.clear();
    old_parts.shrink_to_fit();
    new_parts.clear();
    new_parts.shrink_to_fit();
    return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    //get array position for 2d grid...
    unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
    new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
    new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
    new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}
So the host_vector has an initial position of (0.5,0.5,0.5) for all 64 particles. Then the kernel attempts to multiply that by 10 to give (5,5,5) as the position for all particles. But I don't see this when I cout the data; it is still just (0.5,0.5,0.5). Is there a problem with how I am allocating memory? Is there a problem with the lines:
//copy new device particles back to host particles
h_parts = new_parts;
What could be the issue? Thank you.
There are various problems with the code you have posted:

1. You have the block and grid variables reversed in your kernel invocation. grid comes first.

2. Your method of allocating storage using cudaMalloc on a pointer which has been raw-cast from an empty device vector is not sensible. The vector container has no knowledge that you did this "under the hood." Instead, you can directly allocate storage for the device vector when you instantiate it, like:
thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);
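(Note that copy-constructing or copy-assigning from a host vector, as with old_parts = h_parts; in your code, also sizes the device vector automatically, so only new_parts needs an explicit element count.)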
3. Your expected output of (5,5,5) overlooks the factor of dt in the kernel, which is 0.01, so I believe the correct output is (0.05, 0.05, 0.05).

4. Your grid computation based on max_particles either is not guaranteed to produce enough blocks (if casting a float to int truncates or rounds down) or will produce extra blocks (if it rounds up). The round-down case is bad; we should handle that by using a ceil function or another grid computation method. The round-up case (which is what ceil will do) is OK, but we need to handle the fact that the grid may launch extra blocks/threads. We do that with a thread check in the kernel. There were other problems with the grid computation as well: we want to take the square root of max_particles, then divide it by the block dimension in a particular direction, to get the grid dimension in that direction. For example, with max_particles = 64 and an 8x8 block, sqrt(64)/8 = 1, so the grid is 1x1.

Here's some code that I've modified with these changes in mind; it seems to produce the correct output (0.05, 0.05, 0.05). Note that I had to make some other changes because I don't have your "vec3f.h" header file handy, so I used float3 instead.
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector_functions.h>
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;
using namespace std;
//particle class
class particle {
public:
    particle() :
        _velocity(make_float3(0,0,0)), _position(make_float3(0,0,0)), _density(0.0)
    {
    };
    particle(const float3& pos, const float3& vel) :
        _position(pos), _velocity(vel), _density(0.0)
    {
    };
    float3 _velocity;
    float3 _position;
    float _density;
};
//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
int main() {
    //global thrust vectors
    thrust::host_vector<particle> h_parts;
    particle* particle_ptrs[2];
    //load host vector
    for (int i = 0; i < max_particles; i++) {
        h_parts.push_back(particle(make_float3(0.5,0.5,0.5),make_float3(10,10,10)));
    }
    //copy host particles to old device particles...
    thrust::device_vector<particle> old_parts = h_parts;
    thrust::device_vector<particle> new_parts(max_particles);
    particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
    particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
    //kernel block and grid dimensions
    dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
    dim3 grid((int)ceil(sqrt(float(max_particles)) / (float(block.x))), (int)ceil(sqrt(float(max_particles)) / (float(block.y))), 1);
    cout << "grid x: " << grid.x << " grid y: " << grid.y << endl;
    kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
    cudaDeviceSynchronize();
    //copy new device particles back to host particles
    h_parts = new_parts;
    for (int i = 0; i < max_particles; i++) {
        particle temp1 = h_parts[i];
        cout << temp1._position.x << "," << temp1._position.y << "," << temp1._position.z << endl;
    }
    //delete thrust device vectors
    old_parts.clear();
    old_parts.shrink_to_fit();
    new_parts.clear();
    new_parts.shrink_to_fit();
    return 0;
}
//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    //get array position for 2d grid...
    unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
    if (arr_pos < max_particles) {
        new_parts[arr_pos]._velocity.x = old_parts[arr_pos]._velocity.x * 10.0 * dt;
        new_parts[arr_pos]._velocity.y = old_parts[arr_pos]._velocity.y * 10.0 * dt;
        new_parts[arr_pos]._velocity.z = old_parts[arr_pos]._velocity.z * 10.0 * dt;
        new_parts[arr_pos]._position.x = old_parts[arr_pos]._position.x * 10.0 * dt;
        new_parts[arr_pos]._position.y = old_parts[arr_pos]._position.y * 10.0 * dt;
        new_parts[arr_pos]._position.z = old_parts[arr_pos]._position.z * 10.0 * dt;
        new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
    }
}
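As a side note, for a simple per-element update like this you don't strictly need raw pointers or a hand-written kernel at all; thrust::transform with a functor does the same job. A minimal sketch (the functor name is mine):

#include <thrust/transform.h>

struct scale_by {
    float f;
    scale_by(float _f) : f(_f) {}
    __host__ __device__ particle operator()(const particle& p) const {
        particle r;
        r._velocity = make_float3(p._velocity.x*f, p._velocity.y*f, p._velocity.z*f);
        r._position = make_float3(p._position.x*f, p._position.y*f, p._position.z*f);
        r._density  = p._density * f;
        return r;
    }
};

// replaces the kernel launch and the grid/block computation:
thrust::transform(old_parts.begin(), old_parts.end(), new_parts.begin(), scale_by(10.0f * dt));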