Search code examples
cudamedian-of-medians

CUDA median calculation through reduction


I'm probably doing something incredibly stupid, but I can't seem to make this reduction work (there is probably a library that does this already, but this is for self-learning, so please bear with me). I'm trying to find the median of an array of integer entries by taking the median of medians approach, which I've coded below:

__global__ void gpuMedOdd(int *entries, int *med) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        sdata[tid] = entries[i];
        __syncthreads();

        for(int s = blockDim.x / 3; s > 0; s /= 3) {
                if(tid < s) {
                        int list[3];
                        list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
                        if(list[1] < list[0])
                                swapGpu(list[1], list[0]);
                        if(list[2] < list[0])
                                swapGpu(list[2], list[0]);
                        if(list[2] < list[1])
                                swapGpu(list[2], list[1]);

                        sdata[tid] = list[1];
                }

                __syncthreads();
        }

        *med = sdata[0];
}

I invoke this kernel function as:

gpuMedOdd<<<9, numEntries / 9>>>(d_entries, d_med);

I then copy the value in d_med over into med and print out that value. Unfortunately, this value is always 0, regardless of input. What am I doing wrong?

Edit: I forgot to mention, swapGpu(a, b) is defined as below:

__device__ inline void swapGpu(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

Edit2: As suggested below, here is the entirety of the code.

#include <iostream>
#include <fstream>
#include <cstdlib>

#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line) {
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : CUDA Runtime API error " << (int) err << ": " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

inline void __getLastCudaError(const char *errorMsg, const char *file, const int line) {
        cudaError_t err = cudaGetLastError();
        if(cudaSuccess != err) {
                std::cout << file << "(" << line << ") : getLastCudaError() CUDA error : " << errorMsg << " : (" << (int) err << ") " << cudaGetErrorString(err) << std::endl;
                exit(3);
        }
}

int cpuMin(int *entries, int numEntries) {
        int minVal = entries[0];

        for(int i = 1; i < numEntries; i++)
                if(entries[i] < minVal)
                        minVal = entries[i];

        return minVal;
}

int cpuMax(int *entries, int numEntries) {
        int maxVal = entries[0];

        for(int i = 1; i < numEntries; i++)
                if(entries[i] > maxVal)
                        maxVal = entries[i];

        return maxVal;
}

inline void swap(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

__device__ inline void swapGpu(int a, int b) {
        int dum = a;
        a = b;
        b = dum;
}

__global__ void gpuMedOdd(int *entries, int *med, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 3) + threadIdx.x;

        if(i + 2 * blockDim.x < numEntries) {
                int list[3];
                list[0] = entries[i], list[1] = entries[i + blockDim.x], list[2] = entries[i + 2 * blockDim.x];
                if(list[1] < list[0])
                        swapGpu(list[1], list[0]);
                if(list[2] < list[0])
                        swapGpu(list[2], list[0]);
                if(list[2] < list[1])
                        swapGpu(list[2], list[1]);

                sdata[tid] = list[1];
        }

        __syncthreads();

        for(int s = blockDim.x / 3; s > 0; s /= 3) {
                if(tid < s && tid + 2 * s < blockDim.x) {
                        int list[3];
                        list[0] = sdata[tid], list[1] = sdata[tid + s], list[2] = sdata[tid + 2 * s];
                        if(list[1] < list[0])
                                swapGpu(list[1], list[0]);
                        if(list[2] < list[0])
                                swapGpu(list[2], list[0]);
                        if(list[2] < list[1])
                                swapGpu(list[2], list[1]);

                        sdata[tid] = list[1];
                }

                __syncthreads();
        }

        *med = sdata[0];
}

__global__ void gpuMin(int *entries, int *min, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

        if(i + blockDim.x < numEntries)
                sdata[tid] = (entries[i] < entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
        __syncthreads();

        for(int s = blockDim.x / 2; s > 0; s >>= 1) {
                if(tid < s)
                        sdata[tid] = (sdata[tid] < sdata[tid + s]) ? sdata[tid] : sdata[tid + s];

                __syncthreads();
        }

        *min = sdata[0];
}

__global__ void gpuMax(int *entries, int *max, int numEntries) {
        extern __shared__ int sdata[];

        int tid = threadIdx.x;
        int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;

        if(i + blockDim.x < numEntries)
                sdata[tid] = (entries[i] > entries[i + blockDim.x]) ? entries[i] : entries[i + blockDim.x];
        __syncthreads();

        for(int s = blockDim.x / 2; s > 0; s >>= 1) {
                if(tid < s)
                        sdata[tid] = (sdata[tid] > sdata[tid + s]) ? sdata[tid] : sdata[tid + s];

            __syncthreads();
    }

    *max = sdata[0];
}

int partition(int *entries, int left, int right, int pivotIdx) {
        int i, storeIdx = left, pivot = entries[pivotIdx];

        swap(entries[pivotIdx], entries[right]);

        for(i = left; i < right; i++)
                if(entries[i] < pivot) {
                        swap(entries[i], entries[storeIdx]);
                        storeIdx++;
                }

        return storeIdx;
}

int cpuSelect(int *entries, int left, int right, int k) {
        if(left == right)
                return entries[left];

        int pivotIdx = ((left + right) >> 2) + 1, pivotNewIdx, pivotDist;

        pivotNewIdx = partition(entries, left, right, pivotIdx);

        pivotDist = pivotNewIdx - left + 1;

        if(pivotDist == k)
                return entries[pivotNewIdx];

        else if(k < pivotDist)
                return cpuSelect(entries, left, pivotNewIdx - 1, k);

        else
                return cpuSelect(entries, pivotNewIdx + 1, right, k - pivotDist);
}

int main(int argc, char *argv[]) {
        if(argc != 3) {
                std::cout << "ERROR: Incorrect number of input arguments" << std::endl;
                std::cout << "Proper usage: " << argv[0] << " fileName numEntries" << std::endl;
                exit(1);
        }

        std::ifstream inp(argv[1]);

        if(!inp.is_open()) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Could not find file " << argv[1] << std::endl;
                exit(2);
        }

        int numEntries = atoi(argv[2]), i = 0;

        int *entries = new int[numEntries];

        while(inp >> entries[i] && i < numEntries)
                i++;

        if(i < numEntries) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but only found " << i << " entries" << std::endl;
                exit(2);
        }

        if(inp >> i) {
                std::cout << "ERROR: File I/O error" << std::endl;
                std::cout << "Command-line input suggested " << numEntries << " entries, but file contains more entries" << std::endl;
                exit(2);
        }

        int min, max;
        int *d_entries, *d_min, *d_max;

        checkCudaErrors(cudaMalloc(&d_entries, sizeof(int) * numEntries));
        checkCudaErrors(cudaMalloc(&d_min, sizeof(int)));
        checkCudaErrors(cudaMalloc(&d_max, sizeof(int)));

        checkCudaErrors(cudaMemcpy(d_entries, entries, sizeof(int) * numEntries, cudaMemcpyHostToDevice));

        gpuMin<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_min, numEntries);
        getLastCudaError("kernel launch failure");
        gpuMax<<<16, numEntries / 16, numEntries / 16 * sizeof(int)>>>(d_entries, d_max, numEntries);
        getLastCudaError("kernel launch failure");

        checkCudaErrors(cudaMemcpy(&min, d_min, sizeof(int), cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost));

        std::cout << "The minimum value is: " << min << std::endl;
        std::cout << "The maximum value is: " << max << std::endl;

        if(numEntries % 2) {
                int med, *d_med;
                checkCudaErrors(cudaMalloc(&d_med, sizeof(int)));

                gpuMedOdd<<<16, numEntries / 16, 16 * sizeof(int)>>>(d_entries, d_med, numEntries);
                getLastCudaError("kernel launch failure");

                checkCudaErrors(cudaMemcpy(&med, d_med, sizeof(int), cudaMemcpyDeviceToHost));

                std::cout << "The median value is: " << med << std::endl;
        }
        else {
                int *d_med;
                cudaMalloc(&d_med, sizeof(int));
                gpuMedOdd<<<16, numEntries / 16>>>(d_entries, d_med, numEntries);
        }

        min = cpuMin(entries, numEntries);

        max = cpuMax(entries, numEntries);

        if(numEntries % 2) {
                int median = cpuSelect(entries, 0, numEntries - 1, (numEntries - 1) / 2 + 1);
                std::cout << "The median value is: " << median << std::endl;
        }

        else {
                int med2 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2);
                int med1 = cpuSelect(entries, 0, numEntries - 1, numEntries / 2 + 1);

                float median = 0.5 * (med1 + med2);
                std::cout << "The median value is: " << median << std::endl;
        }

        std::cout << "The minimum value is: " << min << std::endl;
        std::cout << "The maximum value is: " << max << std::endl;

        exit(0);
}

Solution

  • One thing that jumps out is that your shared memory size isn't set; that is, you declare your shared memory to be

    extern __shared__ int sdata[];
    

    but when you invoke your kernel your launch parameters are

    gpuMedOdd<<<9, numEntries / 9>>>(...)
    

    If you're setting your __shared__ memory to be extern, then it's expecting to get the number of bytes for shared memory as the 3rd kernel launch parameter. you should instead have

    gpuMedOdd<<<9, numEntries / 9, smem_in_bytes>>>(...)
    

    where smem_in_bytes is the size of shared memory for the kernel. If you don't specify a size, it'll default to 0. Hence in your current code, your __shared__ memory array sdata will be 0 bytes long.

    EDIT: here's the link to the relevant part of the CUDA Programming Guide: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration