Search code examples
cudafftfftwcufft

1D batched FFTs of real arrays


I have heard/read that we can use the batch mode of cuFFT if we have some n FFTs to perform of some m vectors each. So to test it, I made a sample program and ran it. The data I used was a file with some 1024 floating-point numbers as the same 1024 numbers repeated 10 times. While I should get the same result for 1024 point FFT, I am not getting that. Please correct me if I am conceptually wrong somewhere and below is the code, if you can rectify some error I've made.

Note: I am working with 1D FFT only.

Here is the code snippet:

#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <math.h>

#define NX 1024
#define DATASIZE 1024
#define BATCH 10

int main (int argc, char* argv[])
{
        cufftHandle plan;
        cufftComplex *deviceOutputData, *hostOutputData;
        cufftReal *hostInputData, *deviceInputData;
        int i,j;

        FILE *in; // *out, *fp;

        cudaMalloc ((void**)&deviceInputData, NX*BATCH*sizeof(cufftReal));
        hostInputData = (cufftReal*) malloc (NX*BATCH*sizeof(cufftReal));

        cudaMalloc ((void**)&deviceOutputData, NX*BATCH*sizeof(cufftComplex));
        hostOutputData = (cufftComplex*) malloc (NX*BATCH*sizeof(cufftComplex));

        in = fopen ("InFile.txt", "r");

        if (in==NULL)
        {       fprintf (stderr, "Input file has some issues. Please check."); exit(1);}

        float data;
        //Allocate data

 for (i=0; i<BATCH; i++){
                for (j=0; j<DATASIZE;j++)
                {
                        fscanf(in, "%f", &data);
                        hostInputData [j + i*DATASIZE] = data;
                }
        }
        fclose (in);
        cudaMemcpy (deviceInputData, hostInputData, DATASIZE*BATCH*sizeof(cufftReal), cudaMemcpyHostToDevice);
        cufftPlan1d (&plan, NX, CUFFT_R2C, BATCH);
        cufftExecR2C (plan,  deviceInputData, deviceOutputData);
        cudaThreadSynchronize();
        cudaMemcpy (hostOutputData, deviceOutputData, DATASIZE*BATCH*sizeof(cufftComplex), cudaMemcpyDeviceToHost);
        cufftDestroy (plan);
        cudaFree (deviceOutputData);
        cudaFree (deviceInputData);

        #define a hostOutputData[j+i*NX].x
        #define b hostOutputData[j+i*NX].y
        float result[NX];
        for (i=0; i<BATCH; i++){
                printf ("\n*New Batch*\n");
                for (j=0; j<=NX/2;j++){
                        result[j] = sqrt ((a*a)+(b*b));
                        printf ("%f\n", result[j]);
                }

                for (j=1; j<NX/2; j++){
                        result[j+(NX/2)] = result [(NX/2)-j];
                        printf ("%f\n", result[j+(NX/2)]);
                }
        }

Solution

  • As mentioned by Robert Crovella, and as reported in the cuFFT User Guide - CUDA 6.5,

    Batch sizes other than 1 for cufftPlan1d() have been deprecated. Use cufftPlanMany() for multiple batch execution.

    Below, I'm reporting a fully worked example correcting your code and using cufftPlanMany() instead of cufftPlan1d(). As you will see,

    int rank = 1;                           // --- 1D FFTs
    int n[] = { DATASIZE };                 // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                      // --- Number of batched executions
    cufftPlanMany(&handle, rank, n, 
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_R2C, batch);
    

    is totally equivalent to the "old fashioned"

    cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    

    Be warned that your example does not account for the fact that the 1D FFT of a cufftReal array of length DATASIZE is a cufftComplex array of DATASIZE/2 + 1 elements.

    Here is the full example:

    #include <cuda.h>
    #include <cufft.h>
    #include <stdio.h>
    #include <math.h>
    
    #define DATASIZE 8
    #define BATCH 2
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
       if (code != cudaSuccess) 
       {
          fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
          if (abort) exit(code);
       }
    }
    
    /********/
    /* MAIN */
    /********/
    int main ()
    {
        // --- Host side input data allocation and initialization
        cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
        for (int i=0; i<BATCH; i++)
            for (int j=0; j<DATASIZE; j++) hostInputData[i*DATASIZE + j] = (cufftReal)(i + 1);
    
        // --- Device side input data allocation and initialization
        cufftReal *deviceInputData; gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
        cudaMemcpy(deviceInputData, hostInputData, DATASIZE * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice);
    
        // --- Host side output data allocation
        cufftComplex *hostOutputData = (cufftComplex*)malloc((DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex));
    
        // --- Device side output data allocation
        cufftComplex *deviceOutputData; gpuErrchk(cudaMalloc((void**)&deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));
    
        // --- Batched 1D FFTs
        cufftHandle handle;
        int rank = 1;                           // --- 1D FFTs
        int n[] = { DATASIZE };                 // --- Size of the Fourier transform
        int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
        int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
        int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
        int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
        int batch = BATCH;                      // --- Number of batched executions
        cufftPlanMany(&handle, rank, n, 
                      inembed, istride, idist,
                      onembed, ostride, odist, CUFFT_R2C, batch);
    
        //cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
        cufftExecR2C(handle,  deviceInputData, deviceOutputData);
    
        // --- Device->Host copy of the results
        gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
    
        for (int i=0; i<BATCH; i++)
            for (int j=0; j<(DATASIZE / 2 + 1); j++)
                printf("%i %i %f %f\n", i, j, hostOutputData[i*(DATASIZE / 2 + 1) + j].x, hostOutputData[i*(DATASIZE / 2 + 1) + j].y);
    
        cufftDestroy(handle);
        gpuErrchk(cudaFree(deviceOutputData));
        gpuErrchk(cudaFree(deviceInputData));
    
    }
    

    Please, add your own cuFFT error checking according to CUFFT error handling.