Search code examples
Tags: cuda, gpgpu, convolution

CUDA constant memory provides no improvement compared to the global memory accesses


I am using 2D convolution and applying filter (3 x 3) to an image (2048 x 2048). I wrote two versions: one uses global memory accesses and another uses constant memory for the filter. When I benchmark the code (on my RTX 3090), I see no improvement with the use of constant memory.

Kernel using global memory

// 2D convolution with the filter read from global memory.
// One thread per output pixel; launch with a 2D grid covering n_rows x n_cols.
// Filter taps that fall outside the image are skipped (equivalent to
// zero-padding the input).
//   d_N_ptr : input image,  n_rows x n_cols, row-major
//   d_F_ptr : filter,       (2*FILTER_RADIUS+1)^2 elements, row-major
//   d_P_ptr : output image, n_rows x n_cols, row-major
// const __restrict__ on the read-only inputs lets the compiler route the
// loads through the read-only data cache and reorder them freely.
__global__ void gpu_conv2d_kernel(const float* __restrict__ d_N_ptr,
                                  const float* __restrict__ d_F_ptr,
                                  float* __restrict__ d_P_ptr,
                                  int n_rows, int n_cols)
{
    // Which output element this thread works on
    int out_col = blockIdx.x*blockDim.x + threadIdx.x;
    int out_row = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the grid tail: the grid may overshoot the image
    if (out_row < n_rows && out_col < n_cols)
    {
        // Result (in thread register)
        float p_val = 0.0f;

        // Trip count is a compile-time constant (2*FILTER_RADIUS+1),
        // so both loops fully unroll.
        #pragma unroll
        for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
        {
            #pragma unroll
            for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
            {
                // Input element covered by this filter tap
                int in_row = out_row + (f_row - FILTER_RADIUS);
                int in_col = out_col + (f_col - FILTER_RADIUS);

                // Boundary check: out-of-image taps contribute zero
                if (in_row >= 0 && in_row < n_rows && in_col >= 0 && in_col < n_cols)
                    p_val += d_F_ptr[f_row*(2*FILTER_RADIUS+1) + f_col] * d_N_ptr[in_row*n_cols + in_col];
            }
        }
        d_P_ptr[out_row*n_cols + out_col] = p_val;
    }
}

Kernel using constant memory

// 3x3 filter (radius 1). The filter lives in the 64 KB __constant__ space;
// `extern` because the defining declaration is in the main translation unit
// (filled from the host via cudaMemcpyToSymbol).
#define FILTER_RADIUS 1
extern __constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];

// 2D convolution with the filter held in __constant__ memory (symbol d_F,
// uploaded from the host with cudaMemcpyToSymbol before launch).
// One thread per output pixel; launch with a 2D grid covering n_rows x n_cols.
// Out-of-image filter taps are skipped (zero-padding semantics).
//   d_N_ptr : input image,  n_rows x n_cols, row-major
//   d_P_ptr : output image, n_rows x n_cols, row-major
// Constant memory broadcasts when all lanes of a warp read the same address,
// which is exactly the access pattern of the filter loop below.
__global__ void gpu_conv2d_constMem_kernel(const float* __restrict__ d_N_ptr,
                                           float* __restrict__ d_P_ptr,
                                           int n_rows, int n_cols)
{
    // Which output element this thread works on
    int out_col = blockIdx.x*blockDim.x + threadIdx.x;
    int out_row = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the grid tail: the grid may overshoot the image
    if (out_row < n_rows && out_col < n_cols)
    {
        // Result (in thread register)
        float p_val = 0.0f;

        // Trip count is a compile-time constant (2*FILTER_RADIUS+1),
        // so both loops fully unroll.
        #pragma unroll
        for (int f_row = 0; f_row < 2*FILTER_RADIUS+1; f_row++)
        {
            #pragma unroll
            for (int f_col = 0; f_col < 2*FILTER_RADIUS+1; f_col++)
            {
                // Input element covered by this filter tap
                int in_row = out_row + (f_row - FILTER_RADIUS);
                int in_col = out_col + (f_col - FILTER_RADIUS);

                // Boundary check: out-of-image taps contribute zero
                if (in_row >= 0 && in_row < n_rows && in_col >= 0 && in_col < n_cols)
                    p_val += d_F[f_row*(2*FILTER_RADIUS+1)+f_col] * d_N_ptr[in_row*n_cols + in_col];
            }
        }
        d_P_ptr[out_row*n_cols + out_col] = p_val;
    }
}

I've written a main function that performs benchmarking for various parts of the code separately (memory allocation, data transfer, kernel execution, etc.).

// All essential includes
// .
// .
// .

// 3x3 filter (radius 1) stored in the GPU's 64 KB constant memory.
// Filled from the host with cudaMemcpyToSymbol before each kernel run.
#define FILTER_RADIUS 1
__constant__ float d_F[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];

// CUDA Error Checking.
// Wrapped in do { ... } while (0) so the macro expands to a single
// statement: the original brace-only form breaks un-braced if/else at
// call sites ("if (x) cuda_check(err); else ..."). The argument is
// parenthesized so expressions like `a = b` evaluate as intended.
#define cuda_check(err) do { \
    if ((err) != cudaSuccess) { \
        std::cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << "\n"; \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// End-to-end benchmark driver for the constant-memory convolution kernel:
// allocates device buffers, uploads the image once, then loops asking the
// user for a 3x3 filter, uploads it to the __constant__ symbol d_F, runs
// the kernel, copies the result back, and prints per-stage timings.
// NOTE(review): relies on N, P and new_size being defined in the elided
// "Load image in memory" section above — presumably new_size is the square
// image dimension and N/P are host-side input/output buffers; confirm
// against the full file.
int main(int argc, char const *argv[])
{
    // Benchmarking variables
    // Each elapsed_time_* holds one stage's duration; converted from the
    // milliseconds cudaEventElapsedTime reports into seconds after each stage.
    float elapsed_time_mem_alloc, 
            elapsed_time_mem_t_in, elapsed_time_mem_t_f, elapsed_time_mem_t_out, 
            elapsed_time_kernel;
    // NOTE(review): these events are never destroyed (no cudaEventDestroy
    // before return) — harmless here since the process exits, but untidy.
    cudaEvent_t beg, end;
    cudaEventCreate(&beg);
    cudaEventCreate(&end);
    // ---------------------------------------------------------- //
    // ------------------ Load image in memory ------------------ //
    // ---------------------------------------------------------- //
    //.
    //.
    //.
    // ---------------------------------------------------------- //
    // ----------------- GPU memory allocation ------------------ //
    // ---------------------------------------------------------- //
    cudaError_t err;
    
    std::cout << "Allocating GPU memory... \n";
    cudaEventRecord(beg);
    
    // NOTE(review): if this is the first CUDA call in the process, lazy
    // context creation is billed to this timing — it can dominate the
    // "allocation" number on the first run.
    float* d_N;
    err = cudaMalloc((void**) &d_N, new_size*new_size*sizeof(float));
    cuda_check(err);

    float *d_P; 
    err = cudaMalloc((void**) &d_P, new_size*new_size*sizeof(float));
    cuda_check(err);

    cudaEventRecord(end);
    // Synchronizing beg is redundant (it was recorded before end) but harmless;
    // the end sync is what guarantees both timestamps are final.
    cudaEventSynchronize(beg);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed_time_mem_alloc, beg, end);
    // cudaEventElapsedTime reports milliseconds; convert to seconds.
    elapsed_time_mem_alloc /= 1000.;

    std::cout << "Time for GPU memory allocation (seconds): " << elapsed_time_mem_alloc << "\n";
    std::cout << "\n";

    // ---------------------------------------------------------- //
    // ------------------- Move input to GPU -------------------- //
    // ---------------------------------------------------------- //
    std::cout << "Moving input to GPU memory... \n";
    cudaEventRecord(beg);
    
    // Host image N (allocated in the elided loading section) -> device.
    err = cudaMemcpy(d_N, N, new_size*new_size*sizeof(float), cudaMemcpyHostToDevice);
    cuda_check(err);

    cudaEventRecord(end);
    cudaEventSynchronize(beg);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed_time_mem_t_in, beg, end);
    elapsed_time_mem_t_in /= 1000.;
    std::cout << "Time for input data transfer (seconds): " << elapsed_time_mem_t_in << "\n";
    std::cout << "\n";

    // ------------------------------------------------------------------------- //
    // ----------------------- Initialize filter ------------------------------- //
    // ------------------------------------------------------------------------- //
    std::string filter_type;
    // Host-side staging buffer for the 3x3 filter, row-major.
    float *F = new float[(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)];

    // iter distinguishes the first pass (one-time costs included in the
    // "total" report) from subsequent passes.
    int iter = 0;
    while (true)
    {
        // ------------------------------------------------------------------------- //
        // Which filter; Options: Sharpen, High-pass, Low-pass, Gaussian, d_Gaussian //
        // ------------------------------------------------------------------------- //
        std::cout << "Filter options: Sharpen, High-pass, Low-pass, Gaussian, d_Gaussian \n";
        std::cout << "Enter filter (press 'q' to exit): ";
        std::cin >> filter_type;


        // ---------------------------------------------------------- //
        // ---------------- Defining filter matrix ------------------ //
        // ---------------------------------------------------------- //
        if (filter_type == "Sharpen")
        {
            // Normalized sharpen: off-center taps -alpha/(9-9a), center (9-a)/(9-9a).
            // NOTE(review): alpha == 1 divides by zero — no input validation here.
            float alpha = 0.8f;
            std::cout << "Enter alpha between 0 and 1 (default: 0.8): ";
            std::cin >> alpha;
            std::cout << "\n";

            F[0] = -alpha/(9-9*alpha);
            F[1] = -alpha/(9-9*alpha);
            F[2] = -alpha/(9-9*alpha);
            F[3] = -alpha/(9-9*alpha);
            F[4] = (9-alpha)/(9-9*alpha);
            F[5] = -alpha/(9-9*alpha);
            F[6] = -alpha/(9-9*alpha);
            F[7] = -alpha/(9-9*alpha);
            F[8] = -alpha/(9-9*alpha);
            
        }
        else if (filter_type == "High-pass")
        {
            // Classic 3x3 edge-detect / high-pass: -1 everywhere, +8 center.
            std::cout << "\n";   
            F[0] = -1;
            F[1] = -1;
            F[2] = -1;
            F[3] = -1;
            F[4] = 8;
            F[5] = -1;
            F[6] = -1;
            F[7] = -1;
            F[8] = -1;
        }
        else if (filter_type == "Low-pass")
        {
            // Box blur: all nine taps 1/alpha (alpha == 9 gives a true mean filter).
            float alpha = 9.0f;
            std::cout << "Enter alpha (default: 9.0): ";
            std::cin >> alpha;
            std::cout << "\n";

            F[0] = 1/alpha;
            F[1] = 1/alpha;
            F[2] = 1/alpha;
            F[3] = 1/alpha;
            F[4] = 1/alpha;
            F[5] = 1/alpha;
            F[6] = 1/alpha;
            F[7] = 1/alpha;
            F[8] = 1/alpha;
        }
        else if (filter_type == "Gaussian")
        {
            // NOTE(review): the standard normalized 3x3 Gaussian is
            // 1 2 1 / 2 4 2 / 1 2 1 (over 16); the middle row here is
            // 2 3 4, which is asymmetric — looks like a typo, verify.
            float alpha = 16.0f;
            std::cout << "Enter alpha (default: 16.0): ";
            std::cin >> alpha;
            std::cout << "\n";

            F[0] = 1/alpha;
            F[1] = 2/alpha;
            F[2] = 1/alpha;
            F[3] = 2/alpha;
            F[4] = 3/alpha;
            F[5] = 4/alpha;
            F[6] = 1/alpha;
            F[7] = 2/alpha;
            F[8] = 1/alpha;
        }
        else if (filter_type == "d_Gaussian")
        {
            // Difference-of-Gaussian-style band-pass taps.
            std::cout << "\n";
            F[0] = -2;
            F[1] = 1;
            F[2] = -2;
            F[3] = 1;
            F[4] = 4;
            F[5] = 1;
            F[6] = -2;
            F[7] = 1;
            F[8] = -2;
        }
        else if (filter_type == "q")
        {
            // User requested exit; fall through to cleanup below.
            break;
        }
        else
        {
            // NOTE(review): std::terminate skips the delete[]/cudaFree cleanup
            // below; breaking out of the loop would exit more gracefully.
            std::cout << "Filter not supported!" << "\n";
            std::terminate();
        }

        
        // ---------------------------------------------------------- //
        // ------------------ Move filter to GPU -------------------- //
        // ---------------------------------------------------------- //
        std::cout << "Moving filter to GPU constant memory... \n";
        cudaEventRecord(beg);
        
        // Upload the 9 floats to the file-scope __constant__ symbol d_F.
        err = cudaMemcpyToSymbol(d_F, F, (2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1)*sizeof(float));
        cuda_check(err);
        cudaDeviceSynchronize();

        cudaEventRecord(end);
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&elapsed_time_mem_t_f, beg, end);
        elapsed_time_mem_t_f /= 1000.;
        std::cout << "Time for filter data transfer (seconds): " << elapsed_time_mem_t_f << "\n";
        std::cout << "\n";

        // ---------------------------------------------------------- //
        // --------------------- 2D Convolution --------------------- //
        // ---------------------------------------------------------- //

        // Applying filters frame by frame
        std::cout << "Applying filter... \n"; 

        // Kernel execution
        cudaEventRecord(beg);

        // 32x32 = 1024 threads per block; grid rounds up to cover new_size.
        dim3 dim_block(32, 32, 1);
        dim3 dim_grid(ceil(new_size/(float)(32)), ceil(new_size/(float)(32)), 1);
        // NOTE(review): no cudaGetLastError() after the launch — a bad launch
        // configuration would go unreported until a later API call.
        gpu_conv2d_constMem_kernel<<<dim_grid, dim_block>>>(d_N, d_P, new_size, new_size);
        // Block until the kernel finishes so the end event brackets it fully.
        cudaDeviceSynchronize();
        
        cudaEventRecord(end);
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&elapsed_time_kernel, beg, end);
        elapsed_time_kernel /= 1000.;
        std::cout << "Time for kernel execution (seconds): " << elapsed_time_kernel << "\n";
        std::cout << "\n";

        // ---------------------------------------------------------- //
        // ---------- Copying result back to host memory -------------//
        // ---------------------------------------------------------- //
        std::cout << "Moving result to CPU memory... \n";
        cudaEventRecord(beg);
        
        // Host output buffer P comes from the elided allocation section.
        err = cudaMemcpy(P, d_P, new_size*new_size*sizeof(float), cudaMemcpyDeviceToHost);
        cuda_check(err);
        
        cudaEventRecord(end);
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end);
        cudaEventElapsedTime(&elapsed_time_mem_t_out, beg, end);
        elapsed_time_mem_t_out /= 1000.;
        std::cout << "Time for output data transfer (seconds): " << elapsed_time_mem_t_out << "\n";
        std::cout << "\n";

        // ---------------------------------------------------------- //
        // --------------------- Benchmarking ------------------------//
        // ---------------------------------------------------------- //

        std::cout << "--------------------- \n";
        std::cout << "Benchmarking details: \n";
        std::cout << "--------------------- \n";
        // First iteration includes the one-time allocation and input-upload
        // costs; later iterations count only per-filter work.
        if (iter == 0)
        {
            std::cout << "Time (total): " << elapsed_time_kernel + elapsed_time_mem_alloc + 
                                                elapsed_time_mem_t_in + elapsed_time_mem_t_f + elapsed_time_mem_t_out << "\n";
            std::cout << "FPS (total): " << 1 / (elapsed_time_kernel + elapsed_time_mem_alloc + 
                                                elapsed_time_mem_t_in + elapsed_time_mem_t_f + elapsed_time_mem_t_out) << "\n";
            std::cout << "\n";
        }
        else
        {
            std::cout << "Time (total): " << elapsed_time_kernel +  elapsed_time_mem_t_f + elapsed_time_mem_t_out << "\n";
            std::cout << "FPS (total): " << 1 / (elapsed_time_kernel +  elapsed_time_mem_t_f+ elapsed_time_mem_t_out) << "\n";
            std::cout << "\n";
        }

        std::cout << "Time (kernel): " << elapsed_time_kernel << "\n";
        std::cout << "FPS (kernel): " << 1 / (elapsed_time_kernel) << "\n";
        // 2 FLOPs (multiply + add) per output pixel per filter tap.
        std::cout << "GFLOPS (kernel): " << 2*new_size*new_size*(2*FILTER_RADIUS+1)*(2*FILTER_RADIUS+1) * 1e-9 / elapsed_time_kernel << "\n";
        std::cout << "------------------------------------ \n";
        std::cout << "\n";

        // ----------------------------------------------------------------- //
        // -------------------- Saving output as jpg ----------------------- //
        // ----------------------------------------------------------------- //
        //.
        //.
        //.
        iter += 1;
    }

    // Host buffers: N and P are allocated in the elided sections above.
    delete[] N;
    delete[] F;
    delete[] P;

    cudaFree(d_N);
    cudaFree(d_P);

    return 0;
}

You can see the code repository here. I'm not sure if there is an issue with the implementation/benchmarking or maybe the problem is not complicated enough (although I tried large input image sizes).


Solution

  • There are 17 operations including bitwise, integer multiplication and addition, floating point multiply & add per access to a filter element. Total latency of all these may be greater than the access latency to a cached global memory buffer.

    You can reduce number of operations per filter access:

    • Remove 7 operations from "if" part: Use zero-padding around the image and only calculate inner part.
    • Remove 1 operation from inner loop: merge 2 loops into 1 and precalculate the delta values outside of loop.
    • Remove 1 operation: unroll the loop
    • Remove more: use a grid-stride loop (which adds 1 operation) if you can remove 2 or more operations per unit of work, such as the out_col and out_row computations (these alone would effectively remove ~0.4 operations per access when a large number of stride iterations are performed).

    Then, optimize the other global memory part (pixel values) by shared-memory tiling:

    • Read global memory coalesced, put the data into shared memory.
    • Access only shared memory (& constant memory) during filter loop.
    • Try reducing shared memory bank collisions by duplicating some values or padding if necessary. 34x34 region could be 37 x 34 for example.

    Then the difference of constant memory can be measured without being hidden behind others. Currently, for the float multiply+add part, you are measuring global + constant parameters vs global + global parameters. Do it shared + global vs shared + constant.

    Also 2k x 2k would take less than a millisecond. To measure better, a bigger image would help. Especially an image that does not fit into L2 cache at all should make the difference more visible for tiling.