I'm currently working on a GPU rendering algorithm in which I need to sort an array of this struct:
struct RadiosityData {
    vec4 emission;
    vec4 radiosity;
    float nPixLight;
    float nPixCam;
    float __padding[2];
};
I am using the following code to sort the array:
thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
thrust::sort(dev_ptr, dev_ptr + N);
where GPUpointer_ssbo is a device pointer obtained through CUDA-OpenGL interop and N is ~300k. The comparison is done with:
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs) { return (lhs.nPixCam > rhs.nPixCam); };
The sorting is very slow on my GTX 960M: without sorting, my application takes ~10 ms per frame; with sorting it takes around 35 ms, so the sort alone accounts for ~25 ms. I am measuring the execution time with NVIDIA Nsight Visual Studio Edition.
I am aware that this could be a GPU synchronization problem, since I am doing OpenGL operations before calling thrust. Nevertheless, I am not convinced by that argument: if I display the unsorted array with OpenGL, the frame still takes 10 ms total, which suggests there are no sync problems in the OpenGL code itself.
Is this performance expected for such a "small" array? Is there a better GPU sorting approach available for this kind of problem?
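For reference, GPUpointer_ssbo is obtained through the usual CUDA-OpenGL interop mapping, roughly like this (a sketch, not my exact code; ssbo and cudaRes are illustrative names):

#include <cuda_gl_interop.h>

// one-time registration of the GL buffer object with CUDA ('ssbo' is the GL buffer id)
cudaGraphicsResource_t cudaRes;
cudaGraphicsGLRegisterBuffer(&cudaRes, ssbo, cudaGraphicsRegisterFlagsNone);

// each frame, before calling thrust:
cudaGraphicsMapResources(1, &cudaRes, 0);
size_t numBytes;
cudaGraphicsResourceGetMappedPointer((void**)&GPUpointer_ssbo, &numBytes, cudaRes);

// ... thrust::sort on GPUpointer_ssbo ...

cudaGraphicsUnmapResources(1, &cudaRes, 0);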
------------EDIT: I'm compiling in Release with the default VS2019 CUDA commands, which are:
Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx):
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\10\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nvcc.exe" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -o x64\Release\sortBufferCUDA.cu.obj "C:\Users\Jose\Desktop\RealTimeDiffuseIlumination\OpenGL-avanzado\sortBufferCUDA.cu"
Runtime API (NVCC Compilation Type is hybrid object or .c file):
set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\10\"
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nvcc.exe" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -Xcompiler "/EHsc /nologo /Fd /FS /Zi " -o x64\Release\sortBufferCUDA.cu.obj "C:\Users\Jose\Desktop\RealTimeDiffuseIlumination\OpenGL-avanzado\sortBufferCUDA.cu"
--------------EDIT 2:
The following is a minimal working example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/device_vector.h>
struct RadiosityData {
    float emission[4];
    float radiosity[4];
    float nPixLight;
    float nPixCam;
    float __padding[2];
};
extern "C" void CUDAsort();
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs) { return (lhs.nPixCam > rhs.nPixCam); };
int pri = 1;
thrust::device_vector<RadiosityData> dev;
void CUDAsort() {
    if (pri == 1) {
        pri = 0;
        dev.resize(300000);
    }
    thrust::sort(dev.begin(), dev.end());
}
int main()
{
    float time;
    cudaEvent_t start, stop;
    // create the timing events once, outside the loop (they can be reused)
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    while (true) {
        cudaEventRecord(start, 0);
        CUDAsort();
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Time to generate: %3.1f ms \n", time);
    }
    return 0;
}
Asking thrust to move 48-byte structures around as it sorts is certainly possible, but probably not the most efficient approach: the struct is 12 floats (48 bytes), and a comparison sort moves each element many times.
What we could try instead is a "deconstructed" sort:

1. copy the sort key (nPixCam) out of the AoS into a plain float array, together with an integer index array (see copyKernel below)
2. sort the float keys with thrust::sort_by_key, carrying the indices along
3. use the sorted index pattern to do a single permuted copy of the full structures

It looks like a lot of work, but it's actually a little bit quicker according to my testing:
$ cat t30.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/execution_policy.h>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>

#define USECPSEC 1000000ULL

// host-side timer with microsecond resolution
long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

struct RadiosityData {
#ifdef USE_VEC
    float4 emission;
    float4 radiosity;
#else
    float emission[4];
    float radiosity[4];
#endif
    float nPixLight;
    float nPixCam;
    float __padding[2];
};

// extract the float sort key and an index for each element of the AoS
__global__ void copyKernel(RadiosityData *d, float *f, int *i, int n){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < n){
        f[idx] = d[idx].nPixCam;
        i[idx] = idx;
    }
}

__host__ __device__ bool operator<(const RadiosityData &lhs, const RadiosityData &rhs) { return (lhs.nPixCam > rhs.nPixCam); }

// alternative comparator (not used in the timed paths below)
struct my_sort_functor
{
    template <typename T1, typename T2>
    __host__ __device__ bool operator()(T1 lhs, T2 rhs) { return (lhs.nPixCam > rhs.nPixCam); }
};

const int N = 300000;

int main(){
    RadiosityData *GPUpointer_ssbo, *o;
    int sz = N*sizeof(RadiosityData);
    thrust::device_vector<RadiosityData> ii(N);
    GPUpointer_ssbo = thrust::raw_pointer_cast(ii.data());
    thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
    // method 1: ordinary thrust sort
    long long dt = dtime_usec(0);
    thrust::sort(dev_ptr, dev_ptr+N);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "ordinary sort time: " << dt/(float)USECPSEC << "s" << std::endl;
    // method 2: reduced sort-and-copy
    cudaMalloc(&o, sz);
    thrust::device_ptr<RadiosityData> dev_optr = thrust::device_pointer_cast(o);
    // initialize with random keys (host side, outside the timed region)
    for (int i = 0; i < N; i++) {RadiosityData q{0}; q.nPixCam = rand(); ii[i] = q;}
    float *d;
    int *k;
    cudaMalloc(&d, N*sizeof(float));
    cudaMalloc(&k, N*sizeof(int));
    thrust::device_ptr<int> dev_kptr = thrust::device_pointer_cast(k);
    cudaDeviceSynchronize();
    dt = dtime_usec(0);
    copyKernel<<<(N+511)/512, 512>>>(GPUpointer_ssbo, d, k, N);
    thrust::sort_by_key(thrust::device, d, d+N, k);
    thrust::copy(thrust::make_permutation_iterator(dev_ptr, dev_kptr), thrust::make_permutation_iterator(dev_ptr, dev_kptr+N), dev_optr);
    cudaMemcpy(GPUpointer_ssbo, o, sz, cudaMemcpyDeviceToDevice);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "sort+copy time: " << dt/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t30 t30.cu -arch=sm_52
$ ./t30
ordinary sort time: 0.009527s
sort+copy time: 0.003143s
$ nvcc -o t30 t30.cu -arch=sm_52 -DUSE_VEC
$ ./t30
ordinary sort time: 0.004409s
sort+copy time: 0.002859s
$
(CUDA 10.1.105, GTX 960, Fedora 29)
So we observe about a 50% or better speed-up with the improved method: during the sort itself only 4-byte keys move, and each 48-byte structure is touched just once, in the final permuted copy.
If you only want the top-M elements of the sort, this deconstructed-copy method allows a further improvement by reducing the size of the copy operations. The full sort is still done on the float keys, but when it comes to copying the AoS results, only the top-M values are copied:
$ cat t30.cu
#include <thrust/sort.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <thrust/execution_policy.h>
#include <time.h>
#include <sys/time.h>
#include <cstdlib>

#define USECPSEC 1000000ULL

// host-side timer with microsecond resolution
long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

struct RadiosityData {
#ifdef USE_VEC
    float4 emission;
    float4 radiosity;
#else
    float emission[4];
    float radiosity[4];
#endif
    float nPixLight;
    float nPixCam;
    float __padding[2];
};

// extract the float sort key and an index for each element of the AoS
__global__ void copyKernel(RadiosityData *d, float *f, int *i, int n){
    int idx = threadIdx.x+blockDim.x*blockIdx.x;
    if (idx < n){
        f[idx] = d[idx].nPixCam;
        i[idx] = idx;
    }
}

__host__ __device__ bool operator<(const RadiosityData &lhs, const RadiosityData &rhs) { return (lhs.nPixCam > rhs.nPixCam); }

// alternative comparator (not used in the timed paths below)
struct my_sort_functor
{
    template <typename T1, typename T2>
    __host__ __device__ bool operator()(T1 lhs, T2 rhs) { return (lhs.nPixCam > rhs.nPixCam); }
};

const int N = 300000;
const int M = 1000; // identifies top-M values to be returned by the sort

int main(){
    RadiosityData *GPUpointer_ssbo, *o;
    int sz = N*sizeof(RadiosityData);
    thrust::device_vector<RadiosityData> ii(N);
    GPUpointer_ssbo = thrust::raw_pointer_cast(ii.data());
    thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
    // method 1: ordinary thrust sort
    long long dt = dtime_usec(0);
    thrust::sort(dev_ptr, dev_ptr+N);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "ordinary sort time: " << dt/(float)USECPSEC << "s" << std::endl;
    // method 2: reduced sort-and-copy, returning only the top-M structures
    cudaMalloc(&o, sz);
    thrust::device_ptr<RadiosityData> dev_optr = thrust::device_pointer_cast(o);
    // initialize with random keys (host side, outside the timed region)
    for (int i = 0; i < N; i++) {RadiosityData q{0}; q.nPixCam = rand(); ii[i] = q;}
    float *d;
    int *k;
    cudaMalloc(&d, N*sizeof(float));
    cudaMalloc(&k, N*sizeof(int));
    thrust::device_ptr<int> dev_kptr = thrust::device_pointer_cast(k);
    cudaDeviceSynchronize();
    dt = dtime_usec(0);
    copyKernel<<<(N+511)/512, 512>>>(GPUpointer_ssbo, d, k, N);
    thrust::sort_by_key(thrust::device, d, d+N, k);
    thrust::copy_n(thrust::make_permutation_iterator(dev_ptr, dev_kptr), M, dev_optr);
    cudaMemcpy(GPUpointer_ssbo, o, M*sizeof(RadiosityData), cudaMemcpyDeviceToDevice);
    cudaDeviceSynchronize();
    dt = dtime_usec(dt);
    std::cout << "sort+copy time: " << dt/(float)USECPSEC << "s" << std::endl;
}
$ nvcc -o t30 t30.cu -arch=sm_52 -lineinfo -DUSE_VEC
$ ./t30
ordinary sort time: 0.004425s
sort+copy time: 0.001042s
$
A few other notes:
It's also observed that the AoS is more efficiently handled by thrust when the 4-float quantities are represented with a vector type (float4) instead of a 4-element array; I suspect this allows the compiler to identify a more efficient structure copy method (see the sketch after these notes).
Also note that, according to my testing, compiling for the correct GPU architecture (sm_52 in my case) seems to make a small improvement. YMMV.
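For reference, here is the float4 variant of the struct on its own (this is just the USE_VEC branch from the listings above):

struct RadiosityData {
    float4 emission;    // float4 is 16-byte aligned; the compiler may copy each
    float4 radiosity;   // member with a single 128-bit load/store instruction
    float nPixLight;
    float nPixCam;
    float __padding[2];
};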
The usual caveat: I don't claim this code is defect-free or suitable for any particular purpose; it is merely an attempt to address the questions in the original posting. Use it (or not) at your own risk.