
Gradient Descent Optimization in CUDA


I am about to code my first relatively big CUDA project: Gradient Descent optimization for machine learning purposes. I would like to benefit from the crowd's wisdom about useful native CUDA functions that could serve as shortcuts in the project. Any ideas/suggestions?


Solution

  • Gradient Descent (AKA steepest descent) aims at finding a local minimum of a multivariate function F(x) by taking steps proportional to the negative of the gradient of F(x) at the current point. The update rule is the following:

    x_{n+1} = x_n - gamma_n * grad F(x_n)

    where the step size gamma_n is allowed to change at every step and can be determined, for example, by line searches.
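
    For instance, a simple way to let gamma_n change across the iterations is a backtracking (Armijo) line search run on the host once the gradient is available. The sketch below is only illustrative and is not part of the listing that follows; it reuses the CostFunction declared in GradientDescentGPU.cuh, and gamma0, beta and c are assumed tuning parameters (e.g. 1.f, 0.5f and 1e-4f).

    float BacktrackingLineSearchCPU(const float * __restrict h_x, const float * __restrict h_g,
                                    float * __restrict h_xtry, const int M,
                                    const float gamma0, const float beta, const float c) {

        const float f0 = CostFunction(h_x, M);

        // --- Squared gradient norm (the search direction is the negative gradient)
        float gnorm2 = 0.f;
        for (int i = 0; i < M; i++) gnorm2 += h_g[i] * h_g[i];

        float gamma = gamma0;
        for (int k = 0; k < 50; k++) {
            // --- Trial point x - gamma * g
            for (int i = 0; i < M; i++) h_xtry[i] = h_x[i] - gamma * h_g[i];
            // --- Armijo sufficient decrease condition
            if (CostFunction(h_xtry, M) <= f0 - c * gamma * gnorm2) break;
            // --- Otherwise shrink the step and try again
            gamma = beta * gamma;
        }
        return gamma;
    }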

    Implementing the above update rule in CUDA is pretty easy. Below, I'm providing a full example using the Rosenbrock function as the cost functional to be optimized, exploiting the analytical gradient, and considering a constant step size through the iterations (namely, gamma_n = gamma). The Utilities.cu and Utilities.cuh files are maintained at OrangeOwlSolutions/CUDA_Utilities and omitted here; a sketch of the two helpers they provide is given right below. The example implements the CPU as well as the GPU approach.
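
    Since those two files are omitted, here is a minimal sketch (an assumption of what the omitted header provides, not the actual repository contents) of the only two helpers the listing relies on: gpuErrchk, the usual CUDA error-checking macro, and iDivUp, an integer ceiling division used to size the kernel grid.

    // --- Minimal sketch of the helpers assumed from the omitted Utilities.cuh
    #include <cstdio>
    #include <cstdlib>
    #include "cuda_runtime.h"

    // --- Standard CUDA error-checking macro
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    // --- Integer ceiling division, used to compute the number of blocks
    inline int iDivUp(int a, int b) { return (a + b - 1) / b; }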

    kernel.cu
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include "GradientDescentCPU.h"
    #include "GradientDescentGPU.cuh"
    
    #include "Utilities.cuh"
    
    /********/
    /* MAIN */
    /********/
    
    int main()
    {
        /********************/
        /* INPUT PARAMETERS */
        /********************/
    
        // --- Number of unknowns
        const int M = 5;
    
        // --- Starting point
        float *h_x0 = (float*)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_x0[i] = 1.2f;
    
        // --- Termination tolerance
        const float tol = 1.e-6f;
    
        // --- Maximum number of allowed iterations
        const int maxiter = 10000;
    
        // --- Step size
        const float alpha = 0.001f;
    
        // --- Derivative step
        const float h = 0.0001f;
    
        // --- Minimum allowed perturbations
        const float dxmin = 1e-5f;
    
        /*********************/
        /* OUTPUT PARAMETERS */
        /*********************/
    
        // --- Optimal point
        float* h_xopt = (float*)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_xopt[i] = 0.f;
    
        // --- Optimal functional
        float fopt = 0.f;
    
        // --- Number of performed iterations
        int niter = 0;
    
        // --- Gradient norm at optimal point
        float gnorm = 0.f;
    
        // --- Distance between last and penultimate solutions found
        float dx = 0.f;
    
        /***************************/
        /* OPTIMIZATION - CPU CASE */
        /***************************/
    
        GradientDescentCPU(h_x0, tol, maxiter, alpha, h, dxmin, M, h_xopt, &fopt, &niter, &gnorm, &dx);
    
        printf("Solution found - CPU case:\n");
        printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
        printf("\n\n");
    
    #ifdef VERBOSE
        printf("Found minimum - CPU case:\n");
        for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
        printf("\n\n");
    #endif
    
        /***************************/
        /* OPTIMIZATION - GPU CASE */
        /***************************/
    
        // --- Starting point
        float *d_x0;    gpuErrchk(cudaMalloc((void**)&d_x0,     M * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_x0, h_x0, M * sizeof(float), cudaMemcpyHostToDevice));
        // --- Optimal point
        float *d_xopt;  gpuErrchk(cudaMalloc((void**)&d_xopt,   M * sizeof(float)));
    
        GradientDescentGPU(d_x0, tol, maxiter, alpha, h, dxmin, M, d_xopt, &fopt, &niter, &gnorm, &dx);
    
        printf("Solution found - GPU case:\n");
        printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
        printf("\n\n");
    
    #ifdef VERBOSE
        gpuErrchk(cudaMemcpy(h_xopt, d_xopt, M * sizeof(float), cudaMemcpyDeviceToHost));
        printf("Found minimum - GPU case:\n");
        for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
        printf("\n\n");
    #endif
        // --- Clean up
        free(h_x0); free(h_xopt);
        gpuErrchk(cudaFree(d_x0)); gpuErrchk(cudaFree(d_xopt));

        return 0;
    }
    

    GradientDescentCPU.cu

    #include <stdlib.h>
    #include <math.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include "GradientDescentGPU.cuh"
    
    /*******************************/
    /* GRADIENT DESCENT - CPU CASE */
    /*******************************/
    // --- Version using finite differences
    //void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
    //
    //  for (int i=0; i<M; i++) {
    //      h_x[i] = h_x[i] + h / 2.f;
    //      h_g[i] = CostFunction(h_x, M);
    //      h_x[i] = h_x[i] - h;
    //      h_g[i] = (h_g[i] - CostFunction(h_x, M)) / (2.f * h);
    //      h_x[i] = h_x[i] + h / 2.f;
    //  }
    //}
    
    // --- Version using analytical gradient (Rosenbrock function)
    void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
    
        h_g[0] = -400.f * (h_x[1] - h_x[0] * h_x[0]) * h_x[0] + 2.f * (h_x[0] - 1.f);
        for (int i=1; i<M-1; i++) {
            h_g[i]  = -400.f * h_x[i] * (h_x[i+1] - h_x[i] * h_x[i]) + 2.f * (h_x[i] - 1.f) + 200.f * (h_x[i] - h_x[i-1] * h_x[i-1]);
        }
        h_g[M-1] = 200.f * (h_x[M-1] - h_x[M-2] * h_x[M-2]);
    }
    
    /********/
    /* NORM */
    /********/
    
    float normCPU(const float * __restrict h_x, const int M) {
    
        float sum = 0.f;
        for(int i=0; i<M; i++) sum = sum + h_x[i] * h_x[i];
    
        return sqrt(sum);
    
    }
    
    /****************************************/
    /* GRADIENT DESCENT FUNCTION - CPU CASE */
    /****************************************/
    
    // x0      - Starting point
    // tol     - Termination tolerance
    // maxiter - Maximum number of allowed iterations
    // alpha   - Step size
    // dxmin   - Minimum allowed perturbations
    
    void GradientDescentCPU(const float * __restrict h_x0, const float tol, const int maxiter, const float alpha, const float h, const float dxmin, const int M,
                               float * __restrict h_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
    
        // --- Initialize gradient norm, optimization vector, iteration counter, perturbation
    
        *gnorm = FLT_MAX; 
    
        float *h_x = (float *)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_x[i] = h_x0[i];
    
        *niter = 0;
    
        *dx = FLT_MAX;
    
        // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
        float *h_g      = (float *)malloc(M * sizeof(float));
        float *h_xnew   = (float *)malloc(M * sizeof(float));
        float *h_xdiff  = (float *)malloc(M * sizeof(float));
    
        // --- Gradient Descent iterations
        while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
    
            // --- Calculate gradient
            CostFunctionGradientCPU(h_x, h_g, h, M);
            *gnorm = normCPU(h_g, M);
    
            // --- Take step:
            for (int i=0; i<M; i++) h_xnew[i] = h_x[i] - alpha * h_g[i];
    
            // --- Update termination metrics
            *niter = *niter + 1;
            for (int i=0; i<M; i++) h_xdiff[i] = h_xnew[i] - h_x[i];
            *dx = normCPU(h_xdiff, M);
            for (int i=0; i<M; i++) h_x[i] = h_xnew[i];
        }
    
        for (int i=0; i<M; i++) h_xopt[i] = h_x[i];
        *fopt = CostFunction(h_xopt, M);
        *niter = *niter - 1;

        // --- Free the work buffers
        free(h_x); free(h_g); free(h_xnew); free(h_xdiff);
    }
    

    GradientDescentCPU.h

    #ifndef GRADIENT_DESCENT_CPU
    #define GRADIENT_DESCENT_CPU
    
    void GradientDescentCPU(const float * __restrict, const float, const int, const float, const float, const float, const int,
                                  float * __restrict, float *, int *, float *, float *);
    
    #endif
    

    GradientDescentGPU.cu

    #include <math.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    
    #include <thrust/device_ptr.h>
    #include <thrust/inner_product.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <thrust/functional.h>
    
    #include "Utilities.cuh"
    
    #define BLOCK_SIZE 256
    
    //#define VERBOSE
    //#define DEBUG
    
    /***********************************/
    /* COST FUNCTION - CPU & GPU CASES */
    /***********************************/
    __host__ __device__ float CostFunction(const float * __restrict h_x, const int M) {
    
        // --- Rosenbrock function
        float sum = 0.f;
        for (int i=0; i<M-1; i++) {
            float temp1 = (h_x[i+1] - h_x[i] * h_x[i]);
            float temp2 = (h_x[i] - 1.f);
            sum = sum + 100.f * temp1 * temp1 + temp2 * temp2;
        }
        return sum;
    }
    
    /*******************************/
    /* GRADIENT DESCENT - GPU CASE */
    /*******************************/
    
    // --- Version using finite differences
    //__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
    //
    //  int test1, test2;
    //  float h_test1_plus, h_test1_minus, h_test2_plus, h_test2_minus, temp1_plus, temp1_minus, temp2_plus, temp2_minus;
    //  
    //  // --- Rosenbrock function
    //  float sum_plus = 0.f, sum_minus = 0.f;
    //  for (int i=0; i<M-1; i++) {
    //      h_test1_plus    = d_x[i] + (h / 2.f) * (tid == i);
    //      h_test1_minus   = d_x[i] - (h / 2.f) * (tid == i);
    //      h_test2_plus    = d_x[i + 1] + (h / 2.f) * (tid == (i + 1));
    //      h_test2_minus   = d_x[i + 1] - (h / 2.f) * (tid == (i + 1));
    //      temp1_plus      = (h_test2_plus - h_test1_plus * h_test1_plus);
    //      temp2_plus      = (h_test1_plus - 1.f);
    //      temp1_minus     = (h_test2_minus - h_test1_minus * h_test1_minus);
    //      temp2_minus     = (h_test1_minus - 1.f);
    //      sum_plus        = sum_plus  + 100.f * temp1_plus  * temp1_plus  + temp2_plus  * temp2_plus;
    //      sum_minus       = sum_minus + 100.f * temp1_minus * temp1_minus + temp2_minus * temp2_minus;
    //  }
    //  d_g[tid] = (sum_plus - sum_minus) / (2.f * h);
    //}
    
    // --- Version using analytical gradient (Rosenbrock function)
    __device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
    
        if (tid == 0) d_g[0] = -400.f * (d_x[1] - d_x[0] * d_x[0]) * d_x[0] + 2.f * (d_x[0] - 1.f);
        else if (tid == M-1) d_g[M-1] = 200.f * (d_x[M-1] - d_x[M-2] * d_x[M-2]);
        else {
            // --- Interior components: each thread computes only its own gradient entry
            d_g[tid] = -400.f * d_x[tid] * (d_x[tid+1] - d_x[tid] * d_x[tid]) + 2.f * (d_x[tid] - 1.f) + 200.f * (d_x[tid] - d_x[tid-1] * d_x[tid-1]);
        }
    }
    
    /*******************/
    /* STEP - GPU CASE */
    /*******************/
    __global__ void StepGPU(float * __restrict d_x, float * __restrict d_xnew, float * __restrict d_xdiff, float * __restrict d_g, const float alpha, const float h, const int M) {
    
        const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < M) {
    
            // --- Calculate gradient
            CostFunctionGradientGPU(d_x, d_g, h, tid, M);
    
            // --- Take step
            d_xnew[tid] = d_x[tid] - alpha * d_g[tid];
    
            // --- Update termination metrics
            d_xdiff[tid] = d_xnew[tid] - d_x[tid];
    
            // --- Update current solution
            d_x[tid] = d_xnew[tid];
        }
    
    }
    
    /***********************************/
    /* COST FUNCTION STRUCT - GPU CASE */
    /***********************************/
    
    // --- Rosenbrock function struct for thrust reduction
    struct CostFunctionStructGPU {
        template <typename Tuple>
        __host__ __device__ float operator()(Tuple a) {
    
            float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
            float temp2 = (thrust::get<0>(a) - 1.f);
    
            return 100.f * temp1 * temp1 + temp2 * temp2;
        }
    };
    
    
    /****************************************/
    /* GRADIENT DESCENT FUNCTION - GPU CASE */
    /****************************************/
    
    // x0      - Starting point
    // tol     - Termination tolerance
    // maxiter - Maximum number of allowed iterations
    // alpha   - Step size
    // dxmin   - Minimum allowed perturbations
    
    void GradientDescentGPU(const float * __restrict__ d_x0, const float tol, const int maxiter, const float alpha, const float h, 
                            const float dxmin, const int M, float * __restrict__ d_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
    
        thrust::device_ptr<float> dev_ptr_xopt      = thrust::device_pointer_cast(d_xopt);  
    
        // --- Initialize gradient norm, optimization vector, iteration counter, perturbation
        *gnorm = FLT_MAX; 
    
        float *d_x;         gpuErrchk(cudaMalloc((void**)&d_x, M * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_x, d_x0, M * sizeof(float), cudaMemcpyDeviceToDevice));
    
        *niter = 0;
    
        *dx = FLT_MAX;
    
        // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
        float *d_g;         gpuErrchk(cudaMalloc((void**)&d_g, M * sizeof(float)));         thrust::device_ptr<float> dev_ptr_g     = thrust::device_pointer_cast(d_g);
        float *d_xnew;      gpuErrchk(cudaMalloc((void**)&d_xnew, M * sizeof(float)));      
        float *d_xdiff;     gpuErrchk(cudaMalloc((void**)&d_xdiff, M * sizeof(float)));     thrust::device_ptr<float> dev_ptr_xdiff = thrust::device_pointer_cast(d_xdiff);
    
        // --- Gradient Descent iterations
        while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
    
            // --- Iteration step
            StepGPU<<<iDivUp(M, BLOCK_SIZE), BLOCK_SIZE>>>(d_x, d_xnew, d_xdiff, d_g, alpha, h, M);
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif
    
            *gnorm  = sqrt(thrust::inner_product(dev_ptr_g,     dev_ptr_g + M,      dev_ptr_g,      0.0f));
            *dx     = sqrt(thrust::inner_product(dev_ptr_xdiff, dev_ptr_xdiff + M,  dev_ptr_xdiff,  0.0f));
            *niter  = *niter + 1;
    
        }
    
        gpuErrchk(cudaMemcpy(d_xopt, d_x, M * sizeof(float), cudaMemcpyDeviceToDevice));
    
        // --- Functional calculation
        *fopt = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt, dev_ptr_xopt + 1)), thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt + M - 1, dev_ptr_xopt + M)), CostFunctionStructGPU(), 0.0f, thrust::plus<float>());
    
        *niter = *niter - 1;

        // --- Free the device work buffers
        gpuErrchk(cudaFree(d_x)); gpuErrchk(cudaFree(d_g)); gpuErrchk(cudaFree(d_xnew)); gpuErrchk(cudaFree(d_xdiff));
    }
    

    GradientDescentGPU.cuh

    #ifndef GRADIENT_DESCENT_GPU
    #define GRADIENT_DESCENT_GPU
    
    void GradientDescentGPU(const float * __restrict__, const float, const int, const float, const float, const float, const int, 
                                  float * __restrict__, float *, int *, float *, float *);
    
    __host__ __device__ float CostFunction(const float * __restrict, const int);
    
    #endif
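
    As a quick sanity check (not part of the original code): the Rosenbrock function has its global minimum at x_i = 1, where the functional vanishes, so with the starting point and step size above both versions should converge close to that point. Something along these lines at the end of main() verifies the cost functional itself:

    // --- Hypothetical check: the Rosenbrock global minimum is at x_i = 1, where F = 0
    float h_ones[5];
    for (int i = 0; i < 5; i++) h_ones[i] = 1.f;
    printf("F at the known minimum = %f\n", CostFunction(h_ones, 5));   // --- should print 0.000000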