I have just started using Thrust, and one of the biggest issues I have run into so far is that there seems to be no documentation of how much memory each operation requires. As a result, I am not sure why the code below throws bad_alloc when it tries to sort: before the sort I still have more than 50% of GPU memory available, and I have 70GB of RAM available on the CPU. Can anyone shed some light on this?
#include <cstdint>
#include <iostream>
#include <new>

#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/sort.h>
// Overwrites every element of `data` with the value 10.
void initialize_data(thrust::device_vector<uint64_t>& data) {
    const int fill_value = 10;
    thrust::fill(data.begin(), data.end(), fill_value);
}
int main(void) {
size_t N = 120 * 1024 * 1024;
char line[256];
try {
std::cout << "device_vector" << std::endl;
typedef thrust::device_vector<uint64_t> vec64_t;
// Each buffer is 900MB
vec64_t c[3] = {vec64_t(N), vec64_t(N), vec64_t(N)};
initialize_data(c[0]);
initialize_data(c[1]);
initialize_data(c[2]);
std::cout << "initialize_data finished... Press enter";
std::cin.getline(line, 0);
// nvidia-smi reports 48% memory usage at this point (2959MB of
// 6143MB)
std::cout << "sort_by_key col 0" << std::endl;
// throws bad_alloc
thrust::sort_by_key(c[0].begin(), c[0].end(),
thrust::make_zip_iterator(thrust::make_tuple(c[1].begin(),
c[2].begin())));
std::cout << "sort_by_key col 1" << std::endl;
thrust::sort_by_key(c[1].begin(), c[1].end(),
thrust::make_zip_iterator(thrust::make_tuple(c[0].begin(),
c[2].begin())));
} catch(thrust::system_error &e) {
std::cerr << "Error: " << e.what() << std::endl;
exit(-1);
}
return 0;
}
This is how I compiled the code:
nvcc -o ./bad_alloc ./bad_alloc.cu
Taking Robert Crovella's comment into account, the version below works for me. It calls cudaMemGetInfo() and sizes the buffers so that together they use 39% of the GPU RAM. (This is on an NVIDIA Tesla card with ECC disabled; with ECC enabled the percentage would need to be lower.)
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/random.h>
// Sets all elements of the given vector to 10.
void initialize_data(thrust::device_vector<uint64_t>& data)
{
    thrust::device_vector<uint64_t>::iterator first = data.begin();
    thrust::device_vector<uint64_t>::iterator last = data.end();
    thrust::fill(first, last, 10);
}
// Number of device buffers used by main(): the memory budget is divided by
// this value and the buffer array is declared with this many elements.
#define BUFFERS 3
int main(void) {
size_t total_gpu_bytes;
cudaMemGetInfo(0, &total_gpu_bytes);
size_t N = (total_gpu_bytes * .39) / sizeof(uint64_t) / BUFFERS;
try {
std::cout << "device_vector " << (N/1024.0/1024.0) << std::endl;
typedef thrust::device_vector<uint64_t> vec64_t;
vec64_t c[BUFFERS] = {vec64_t(N), vec64_t(N), vec64_t(N)};
initialize_data(c[0]);
initialize_data(c[1]);
initialize_data(c[2]);
thrust::sort_by_key(c[0].begin(), c[0].end(),
thrust::make_zip_iterator(thrust::make_tuple(c[1].begin(),
c[2].begin())));
thrust::sort_by_key(c[1].begin(), c[1].end(),
thrust::make_zip_iterator(thrust::make_tuple(c[0].begin(),
c[2].begin())));
} catch(thrust::system_error &e) {
std::cerr << "Error: " << e.what() << std::endl;
exit(-1);
}
return 0;
}