Search code examples
cudagpucublas

Is there a function in cuBLAS that can apply the sigmoid function to a vector?


As the title says, I want to apply a function element-wise to a vector. Is there any function in the cuBLAS library that can do that?


Solution

  • I am not aware of a suitable CUBLAS function that can assist in the task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function for that matter, element-wise to a vector. Note that such code would be memory-bound rather than compute-bound in most circumstances. See the CUDA program below for a worked example, in particular sigmoid_kernel(). The output of the program should look something like this:

    source[0]= 0.0000000000000000e+000  source[99999]= 9.9999000000000005e-001
    result[0]= 5.0000000000000000e-001  result[99999]= 7.3105661250612963e-001
    

    .

    #include <stdlib.h>
    #include <stdio.h>
    #include <math.h>
    
    // Number of elements in the example vectors (main() uses this as len).
    #define DEFAULT_LEN   100000
    
    // Macro to catch CUDA errors in CUDA runtime calls.
    // Evaluates `call` exactly once. If the result is not cudaSuccess, the
    // file name, line number and cudaGetErrorString() text are written to
    // stderr and the process exits with EXIT_FAILURE.
    // The argument is parenthesized, and the status is kept in a local with a
    // macro-specific name, so an argument expression that mentions a caller
    // variable called `err` still refers to the caller's variable.
    #define CUDA_SAFE_CALL(call)                                          \
    do {                                                                  \
        cudaError_t cuda_safe_call_err_ = (call);                         \
        if (cudaSuccess != cuda_safe_call_err_) {                         \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                     __FILE__, __LINE__,                                  \
                     cudaGetErrorString(cuda_safe_call_err_) );           \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
    
    // Macro to catch CUDA errors in kernel launches.
    // Intended to be placed right after a <<<...>>> launch. It first reads
    // cudaGetLastError() to pick up errors reported at launch time, then
    // blocks on cudaDeviceSynchronize() so that errors raised while the kernel
    // runs are reported as well. On any error the file name, line number and
    // error string are written to stderr and the process exits with
    // EXIT_FAILURE.
    // Note: the synchronization makes every checked launch blocking.
    #define CHECK_LAUNCH_ERROR()                                          \
    do {                                                                  \
        /* Check synchronous errors, i.e. pre-launch */                   \
        cudaError_t err = cudaGetLastError();                             \
        if (cudaSuccess != err) {                                         \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                     __FILE__, __LINE__, cudaGetErrorString(err) );       \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
        /* Check asynchronous errors, i.e. kernel failed (ULF). */        \
        /* cudaDeviceSynchronize() supersedes the deprecated */           \
        /* cudaThreadSynchronize(). */                                    \
        err = cudaDeviceSynchronize();                                    \
        if (cudaSuccess != err) {                                         \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                     __FILE__, __LINE__, cudaGetErrorString( err) );      \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
    
    // Logistic sigmoid of a, i.e. 1 / (1 + e^(-a)), evaluated in double
    // precision. Device-only helper.
    __device__ __forceinline__ double sigmoid (double a)
    {
        const double decay = exp (-a);       // e^(-a)
        const double denom = 1.0 + decay;
        return 1.0 / denom;
    }
    
    // Applies sigmoid() element-wise: dst[i] = sigmoid(src[i]) for 0 <= i < len.
    //
    // Launch layout: 1-D grid of 1-D blocks (only the .x dimensions are read).
    // Each thread starts at its global thread index and advances by the total
    // number of launched threads (gridDim.x * blockDim.x), so the vector is
    // fully covered even when the grid has fewer threads than elements.
    //
    // src and dst are __restrict__-qualified and therefore must not overlap.
    // A len <= 0 results in no work.
    //
    // Index and stride are held in 64-bit integers: with 32-bit int, the
    // final `i += stride` can overflow when len is close to INT_MAX, and
    // blockIdx.x * blockDim.x can wrap on very large grids.
    __global__ void sigmoid_kernel (const double * __restrict__ src, 
                                    double * __restrict__ dst, int len)
    {
        const long long stride = (long long)gridDim.x * blockDim.x;
        const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
        for (long long i = first; i < len; i += stride) {
            dst[i] = sigmoid (src[i]);
        }
    }
    
    // Example driver: fills a host vector of DEFAULT_LEN doubles with
    // i * 1e-5, copies it to the device, runs sigmoid_kernel over it, copies
    // the result back and prints the first and last element of both the input
    // and the output.
    // Returns EXIT_SUCCESS on success. Returns EXIT_FAILURE if a host
    // allocation fails; CUDA failures terminate the process from inside
    // CUDA_SAFE_CALL / CHECK_LAUNCH_ERROR.
    int main (void)
    {
        double *source, *result;
        double *d_a = 0, *d_b = 0;
    
        int len = DEFAULT_LEN;
    
        /* Allocate memory on host */
        source = (double *)malloc (len * sizeof (source[0]));
        if (!source) {
            fprintf (stderr, "Host allocation of source vector failed.\n");
            return EXIT_FAILURE;
        }
        result = (double *)malloc (len * sizeof (result[0]));
        if (!result) {
            fprintf (stderr, "Host allocation of result vector failed.\n");
            free (source);  /* do not leak the buffer allocated above */
            return EXIT_FAILURE;
        }
    
        /* create source data */
        for (int i = 0; i < len; i++) source [i] = i * 1e-5;
    
        /* spot check of source data */
        printf ("source[0]=% 23.16e  source[%d]=% 23.16e\n", 
                source[0], len-1, source[len-1]);
    
        /* Allocate memory on device */
        CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
        CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));
    
        /* Push source data to device */
        CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
                                    cudaMemcpyHostToDevice));
    
        /* Compute execution configuration: one thread per element, rounded up
           to whole blocks of 256 threads. */
        dim3 dimBlock(256);
        int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
        /* Cap the grid size; the kernel's strided loop picks up any elements
           beyond the launched thread count.
           NOTE(review): 65520 presumably stays under the 65535 grid-dimension
           limit of older GPUs -- confirm. */
        if (threadBlocks > 65520) threadBlocks = 65520;
        dim3 dimGrid(threadBlocks);
    
        sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
        CHECK_LAUNCH_ERROR();
    
        /* retrieve results from device */
        CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
                                    cudaMemcpyDeviceToHost));
    
        /* spot check of  results */
        printf ("result[0]=% 23.16e  result[%d]=% 23.16e\n", 
                result[0], len-1, result[len-1]);
    
        /* free memory on host and device */
        CUDA_SAFE_CALL (cudaFree(d_a));
        CUDA_SAFE_CALL (cudaFree(d_b));
        free (result);
        free (source);
    
        return EXIT_SUCCESS;
    }