
Gradient Descent Optimization in CUDA


I am about to code my first relatively big CUDA project: Gradient Descent optimization for machine learning purposes. I would like to benefit from the crowd's wisdom about useful native CUDA functions that could serve as shortcuts in the project. Any ideas/suggestions?


Solution

  • Gradient Descent (AKA steepest descent) aims at finding a local minimum of a multivariate function F(x) by taking steps proportional to the negative of the gradient of F(x) at the current point. The update rule is the following:

    x_{n+1} = x_n - gamma_n * grad F(x_n)

    where the step size gamma_n is allowed to change at every step and can be determined, for example, by line searches.
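
    For instance, a simple way to let gamma_n change across the iterations is a backtracking (Armijo) line search run on the host once the gradient is available. The sketch below is only illustrative and is not part of the listing that follows; it reuses the CostFunction declared in GradientDescentGPU.cuh, and gamma0, beta and c are assumed tuning parameters (e.g. 1.f, 0.5f and 1e-4f).

    float BacktrackingLineSearchCPU(const float * __restrict h_x, const float * __restrict h_g,
                                    float * __restrict h_xtry, const int M,
                                    const float gamma0, const float beta, const float c) {

        const float f0 = CostFunction(h_x, M);

        // --- Squared gradient norm (the search direction is the negative gradient)
        float gnorm2 = 0.f;
        for (int i = 0; i < M; i++) gnorm2 += h_g[i] * h_g[i];

        float gamma = gamma0;
        for (int k = 0; k < 50; k++) {
            // --- Trial point x - gamma * g
            for (int i = 0; i < M; i++) h_xtry[i] = h_x[i] - gamma * h_g[i];
            // --- Armijo sufficient decrease condition
            if (CostFunction(h_xtry, M) <= f0 - c * gamma * gnorm2) break;
            // --- Otherwise shrink the step and try again
            gamma = beta * gamma;
        }
        return gamma;
    }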

    Implementing the above update rule in CUDA is pretty easy. Below, I'm providing a full example using the Rosenbrock function as the cost functional to be optimized, exploiting the analytical gradient, and considering a constant step size through the iterations (namely, gamma_n = gamma). The Utilities.cu and Utilities.cuh files are maintained at OrangeOwlSolutions/CUDA_Utilities and omitted here; a sketch of the two helpers they provide is given right below. The example implements the CPU as well as the GPU approach.
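
    Since those two files are omitted, here is a minimal sketch (an assumption of what the omitted header provides, not the actual repository contents) of the only two helpers the listing relies on: gpuErrchk, the usual CUDA error-checking macro, and iDivUp, an integer ceiling division used to size the kernel grid.

    // --- Minimal sketch of the helpers assumed from the omitted Utilities.cuh
    #include <cstdio>
    #include <cstdlib>
    #include "cuda_runtime.h"

    // --- Standard CUDA error-checking macro
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
        if (code != cudaSuccess) {
            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }

    // --- Integer ceiling division, used to compute the number of blocks
    inline int iDivUp(int a, int b) { return (a + b - 1) / b; }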

    kernel.cu
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include "GradientDescentCPU.h"
    #include "GradientDescentGPU.cuh"
    
    #include "Utilities.cuh"
    
    /********/
    /* MAIN */
    /********/
    
    int main()
    {
        /********************/
        /* INPUT PARAMETERS */
        /********************/
    
        // --- Number of unknowns
        const int M = 5;
    
        // --- Starting point
        float *h_x0 = (float*)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_x0[i] = 1.2f;
    
        // --- Termination tolerance
        const float tol = 1.e-6f;
    
        // --- Maximum number of allowed iterations
        const int maxiter = 10000;
    
        // --- Step size
        const float alpha = 0.001f;
    
        // --- Derivative step
        const float h = 0.0001f;
    
        // --- Minimum allowed perturbations
        const float dxmin = 1e-5f;
    
        /*********************/
        /* OUTPUT PARAMETERS */
        /*********************/
    
        // --- Optimal point
        float* h_xopt = (float*)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_xopt[i] = 0.f;
    
        // --- Optimal functional
        float fopt = 0.f;
    
        // --- Number of performed iterations
        int niter = 0;
    
        // --- Gradient norm at optimal point
        float gnorm = 0.f;
    
        // --- Distance between last and penultimate solutions found
        float dx = 0.f;
    
        /***************************/
        /* OPTIMIZATION - CPU CASE */
        /***************************/
    
        GradientDescentCPU(h_x0, tol, maxiter, alpha, h, dxmin, M, h_xopt, &fopt, &niter, &gnorm, &dx);
    
        printf("Solution found - CPU case:\n");
        printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
        printf("\n\n");
    
    #ifdef VERBOSE
        printf("Found minimum - CPU case:\n");
        for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
        printf("\n\n");
    #endif
    
        /***************************/
        /* OPTIMIZATION - GPU CASE */
        /***************************/
    
        // --- Starting point
        float *d_x0;    gpuErrchk(cudaMalloc((void**)&d_x0,     M * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_x0, h_x0, M * sizeof(float), cudaMemcpyHostToDevice));
        // --- Optimal point
        float *d_xopt;  gpuErrchk(cudaMalloc((void**)&d_xopt,   M * sizeof(float)));
    
        GradientDescentGPU(d_x0, tol, maxiter, alpha, h, dxmin, M, d_xopt, &fopt, &niter, &gnorm, &dx);
    
        printf("Solution found - GPU case:\n");
        printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
        printf("\n\n");
    
    #ifdef VERBOSE
        gpuErrchk(cudaMemcpy(h_xopt, d_xopt, M * sizeof(float), cudaMemcpyDeviceToHost));
        printf("Found minimum - GPU case:\n");
        for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
        printf("\n\n");
    #endif
        // --- Clean up
        free(h_x0); free(h_xopt);
        gpuErrchk(cudaFree(d_x0)); gpuErrchk(cudaFree(d_xopt));

        return 0;
    }
    

    GradientDescentCPU.cu

    #include <stdlib.h>
    #include <math.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include "GradientDescentGPU.cuh"
    
    /*******************************/
    /* GRADIENT DESCENT - CPU CASE */
    /*******************************/
    // --- Version using finite differences
    //void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
    //
    //  for (int i=0; i<M; i++) {
    //      h_x[i] = h_x[i] + h / 2.f;
    //      h_g[i] = CostFunction(h_x, M);
    //      h_x[i] = h_x[i] - h;
    //      h_g[i] = (h_g[i] - CostFunction(h_x, M)) / (2.f * h);
    //      h_x[i] = h_x[i] + h / 2.f;
    //  }
    //}
    
    // --- Version using analytical gradient (Rosenbrock function)
    void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
    
        h_g[0] = -400.f * (h_x[1] - h_x[0] * h_x[0]) * h_x[0] + 2.f * (h_x[0] - 1.f);
        for (int i=1; i<M-1; i++) {
            h_g[i]  = -400.f * h_x[i] * (h_x[i+1] - h_x[i] * h_x[i]) + 2.f * (h_x[i] - 1.f) + 200.f * (h_x[i] - h_x[i-1] * h_x[i-1]);
        }
        h_g[M-1] = 200.f * (h_x[M-1] - h_x[M-2] * h_x[M-2]);
    }
    
    /********/
    /* NORM */
    /********/
    
    float normCPU(const float * __restrict h_x, const int M) {
    
        float sum = 0.f;
        for(int i=0; i<M; i++) sum = sum + h_x[i] * h_x[i];
    
        return sqrt(sum);
    
    }
    
    /****************************************/
    /* GRADIENT DESCENT FUNCTION - CPU CASE */
    /****************************************/
    
    // x0      - Starting point
    // tol     - Termination tolerance
    // maxiter - Maximum number of allowed iterations
    // alpha   - Step size
    // dxmin   - Minimum allowed perturbations
    
    void GradientDescentCPU(const float * __restrict h_x0, const float tol, const int maxiter, const float alpha, const float h, const float dxmin, const int M,
                               float * __restrict h_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
    
        // --- Initialize gradient norm, optimization vector, iteration counter, perturbation
    
        *gnorm = FLT_MAX; 
    
        float *h_x = (float *)malloc(M * sizeof(float));
        for (int i=0; i<M; i++) h_x[i] = h_x0[i];
    
        *niter = 0;
    
        *dx = FLT_MAX;
    
        // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
        float *h_g      = (float *)malloc(M * sizeof(float));
        float *h_xnew   = (float *)malloc(M * sizeof(float));
        float *h_xdiff  = (float *)malloc(M * sizeof(float));
    
        // --- Gradient Descent iterations
        while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
    
            // --- Calculate gradient
            CostFunctionGradientCPU(h_x, h_g, h, M);
            *gnorm = normCPU(h_g, M);
    
            // --- Take step:
            for (int i=0; i<M; i++) h_xnew[i] = h_x[i] - alpha * h_g[i];
    
            // --- Update termination metrics
            *niter = *niter + 1;
            for (int i=0; i<M; i++) h_xdiff[i] = h_xnew[i] - h_x[i];
            *dx = normCPU(h_xdiff, M);
            for (int i=0; i<M; i++) h_x[i] = h_xnew[i];
        }
    
        for (int i=0; i<M; i++) h_xopt[i] = h_x[i];
        *fopt = CostFunction(h_xopt, M);
        *niter = *niter - 1;

        // --- Free the work buffers
        free(h_x); free(h_g); free(h_xnew); free(h_xdiff);
    }
    

    GradientDescentCPU.h

    #ifndef GRADIENT_DESCENT_CPU
    #define GRADIENT_DESCENT_CPU
    
    void GradientDescentCPU(const float * __restrict, const float, const int, const float, const float, const float, const int,
                                  float * __restrict, float *, int *, float *, float *);
    
    #endif
    

    GradientDescentGPU.cu

    #include <math.h>
    #include <float.h>
    
    #include "cuda_runtime.h"
    
    #include <thrust/device_ptr.h>
    #include <thrust/inner_product.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/tuple.h>
    #include <thrust/functional.h>
    
    #include "Utilities.cuh"
    
    #define BLOCK_SIZE 256
    
    //#define VERBOSE
    //#define DEBUG
    
    /***********************************/
    /* COST FUNCTION - CPU & GPU CASES */
    /***********************************/
    __host__ __device__ float CostFunction(const float * __restrict h_x, const int M) {
    
        // --- Rosenbrock function
        float sum = 0.f;
        for (int i=0; i<M-1; i++) {
            float temp1 = (h_x[i+1] - h_x[i] * h_x[i]);
            float temp2 = (h_x[i] - 1.f);
            sum = sum + 100.f * temp1 * temp1 + temp2 * temp2;
        }
        return sum;
    }
    
    /*******************************/
    /* GRADIENT DESCENT - GPU CASE */
    /*******************************/
    
    // --- Version using finite differences
    //__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
    //
    //  int test1, test2;
    //  float h_test1_plus, h_test1_minus, h_test2_plus, h_test2_minus, temp1_plus, temp1_minus, temp2_plus, temp2_minus;
    //  
    //  // --- Rosenbrock function
    //  float sum_plus = 0.f, sum_minus = 0.f;
    //  for (int i=0; i<M-1; i++) {
    //      h_test1_plus    = d_x[i] + (h / 2.f) * (tid == i);
    //      h_test1_minus   = d_x[i] - (h / 2.f) * (tid == i);
    //      h_test2_plus    = d_x[i + 1] + (h / 2.f) * (tid == (i + 1));
    //      h_test2_minus   = d_x[i + 1] - (h / 2.f) * (tid == (i + 1));
    //      temp1_plus      = (h_test2_plus - h_test1_plus * h_test1_plus);
    //      temp2_plus      = (h_test1_plus - 1.f);
    //      temp1_minus     = (h_test2_minus - h_test1_minus * h_test1_minus);
    //      temp2_minus     = (h_test1_minus - 1.f);
    //      sum_plus        = sum_plus  + 100.f * temp1_plus  * temp1_plus  + temp2_plus  * temp2_plus;
    //      sum_minus       = sum_minus + 100.f * temp1_minus * temp1_minus + temp2_minus * temp2_minus;
    //  }
    //  d_g[tid] = (sum_plus - sum_minus) / (2.f * h);
    //}
    
    // --- Version using analytical gradient (Rosenbrock function)
    __device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
    
        if (tid == 0) d_g[0] = -400.f * (d_x[1] - d_x[0] * d_x[0]) * d_x[0] + 2.f * (d_x[0] - 1.f);
        else if (tid == M-1) d_g[M-1] = 200.f * (d_x[M-1] - d_x[M-2] * d_x[M-2]);
        else {
            // --- Interior components: each thread computes only its own gradient entry
            d_g[tid] = -400.f * d_x[tid] * (d_x[tid+1] - d_x[tid] * d_x[tid]) + 2.f * (d_x[tid] - 1.f) + 200.f * (d_x[tid] - d_x[tid-1] * d_x[tid-1]);
        }
    }
    
    /*******************/
    /* STEP - GPU CASE */
    /*******************/
    __global__ void StepGPU(float * __restrict d_x, float * __restrict d_xnew, float * __restrict d_xdiff, float * __restrict d_g, const float alpha, const float h, const int M) {
    
        const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < M) {
    
            // --- Calculate gradient
            CostFunctionGradientGPU(d_x, d_g, h, tid, M);
    
            // --- Take step
            d_xnew[tid] = d_x[tid] - alpha * d_g[tid];
    
            // --- Update termination metrics
            d_xdiff[tid] = d_xnew[tid] - d_x[tid];
    
            // --- Update current solution
            d_x[tid] = d_xnew[tid];
        }
    
    }
    
    /***********************************/
    /* COST FUNCTION STRUCT - GPU CASE */
    /***********************************/
    
    // --- Rosenbrock function struct for thrust reduction
    struct CostFunctionStructGPU {
        template <typename Tuple>
        __host__ __device__ float operator()(Tuple a) {
    
            float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
            float temp2 = (thrust::get<0>(a) - 1.f);
    
            return 100.f * temp1 * temp1 + temp2 * temp2;
        }
    };
    
    
    /****************************************/
    /* GRADIENT DESCENT FUNCTION - GPU CASE */
    /****************************************/
    
    // x0      - Starting point
    // tol     - Termination tolerance
    // maxiter - Maximum number of allowed iterations
    // alpha   - Step size
    // dxmin   - Minimum allowed perturbations
    
    void GradientDescentGPU(const float * __restrict__ d_x0, const float tol, const int maxiter, const float alpha, const float h, 
                            const float dxmin, const int M, float * __restrict__ d_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
    
        thrust::device_ptr<float> dev_ptr_xopt      = thrust::device_pointer_cast(d_xopt);  
    
        // --- Initialize gradient norm, optimization vector, iteration counter, perturbation
        *gnorm = FLT_MAX; 
    
        float *d_x;         gpuErrchk(cudaMalloc((void**)&d_x, M * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_x, d_x0, M * sizeof(float), cudaMemcpyDeviceToDevice));
    
        *niter = 0;
    
        *dx = FLT_MAX;
    
        // --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
        float *d_g;         gpuErrchk(cudaMalloc((void**)&d_g, M * sizeof(float)));         thrust::device_ptr<float> dev_ptr_g     = thrust::device_pointer_cast(d_g);
        float *d_xnew;      gpuErrchk(cudaMalloc((void**)&d_xnew, M * sizeof(float)));      
        float *d_xdiff;     gpuErrchk(cudaMalloc((void**)&d_xdiff, M * sizeof(float)));     thrust::device_ptr<float> dev_ptr_xdiff = thrust::device_pointer_cast(d_xdiff);
    
        // --- Gradient Descent iterations
        while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
    
            // --- Iteration step
            StepGPU<<<iDivUp(M, BLOCK_SIZE), BLOCK_SIZE>>>(d_x, d_xnew, d_xdiff, d_g, alpha, h, M);
    #ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
    #endif
    
            *gnorm  = sqrt(thrust::inner_product(dev_ptr_g,     dev_ptr_g + M,      dev_ptr_g,      0.0f));
            *dx     = sqrt(thrust::inner_product(dev_ptr_xdiff, dev_ptr_xdiff + M,  dev_ptr_xdiff,  0.0f));
            *niter  = *niter + 1;
    
        }
    
        gpuErrchk(cudaMemcpy(d_xopt, d_x, M * sizeof(float), cudaMemcpyDeviceToDevice));
    
        // --- Functional calculation
        *fopt = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt, dev_ptr_xopt + 1)), thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt + M - 1, dev_ptr_xopt + M)), CostFunctionStructGPU(), 0.0f, thrust::plus<float>());
    
        *niter = *niter - 1;

        // --- Free the device work buffers
        gpuErrchk(cudaFree(d_x)); gpuErrchk(cudaFree(d_g)); gpuErrchk(cudaFree(d_xnew)); gpuErrchk(cudaFree(d_xdiff));
    }
    

    GradientDescentGPU.cuh

    #ifndef GRADIENT_DESCENT_GPU
    #define GRADIENT_DESCENT_GPU
    
    void GradientDescentGPU(const float * __restrict__, const float, const int, const float, const float, const float, const int, 
                                  float * __restrict__, float *, int *, float *, float *);
    
    __host__ __device__ float CostFunction(const float * __restrict, const int);
    
    #endif
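
    As a quick sanity check (not part of the original code): the Rosenbrock function has its global minimum at x_i = 1, where the functional vanishes, so with the starting point and step size above both versions should converge close to that point. Something along these lines at the end of main() verifies the cost functional itself:

    // --- Hypothetical check: the Rosenbrock global minimum is at x_i = 1, where F = 0
    float h_ones[5];
    for (int i = 0; i < 5; i++) h_ones[i] = 1.f;
    printf("F at the known minimum = %f\n", CostFunction(h_ones, 5));   // --- should print 0.000000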