Search code examples
c++cudafftcufft

Incorrect output when transforming from complex to real number using cuda cuFFT


I am using cuda version 7.5 cufft to perform some FFT and inverse FFT. I have a problem when performing inverse FFT using cufftExecC2R(.,.) function.

Actually, when I use a batch_size = 1 in the cufftPlan1d(,) I get correct result. However, when I increase the batch size the results is incorrect.

I am pasting a sample minimal code to illustrate this. Please ignore the dirtiness of the code as I just quickly created this.

  #include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctime>
#include <iostream>

typedef float2 Complex;

void iTest(int argc, char** argv);

#define SIGNAL_SIZE  9
#define BATCH_SIZE 2

int main(int argc, char** argv) {

    iTest(argc, argv);
    return 0;

}

void iProcess(Complex *x, double *y, size_t n) {

    cufftComplex *deviceData;
    cudaMalloc(reinterpret_cast<void**>(&deviceData),
               SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyHostToDevice);

    cufftResult cufftStatus;
    cufftHandle handle;
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }

    cufftComplex *d_complex;
    cudaMalloc(reinterpret_cast<void**>(&d_complex),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);

    cufftStatus = cufftExecC2C(handle,  deviceData, d_complex, CUFFT_FORWARD);
    if (cufftStatus != cudaSuccess) {
      printf("cufftExecR2C failed!");
    }

    cufftComplex *hostOutputData = (cufftComplex*)malloc(
       (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));

    cudaMemcpy(hostOutputData, d_complex,
               SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting COMPLEX"  << "\n";
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
       printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);


    //! convert complex to real

    cufftHandle c2r_handle;
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }

    cufftReal *d_odata;
    cudaMalloc(reinterpret_cast<void**>(&d_odata),
               sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2R(c2r_handle,  d_complex, d_odata);

    cufftReal odata[SIGNAL_SIZE * BATCH_SIZE];
    cudaMemcpy(odata, d_odata, sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting REAL"  << "\n";
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
       std::cout << i << " \t" << odata[i]/(SIGNAL_SIZE)  << "\n";
    }


    cufftDestroy(handle);
    cudaFree(deviceData);
}

void iTest(int argc, char** argv) {

    Complex* h_signal = reinterpret_cast<Complex*>(
       malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));

    std::cout << "\nPrinting INPUT"  << "\n";
    for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
       h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
       h_signal[i].y = 0;

       std::cout << i << "\t" << h_signal[i].x  << "\n";
    }
    std::cout  << "\n";

    double y[SIGNAL_SIZE * BATCH_SIZE];
    iProcess(h_signal, y, 1);

}

I cannot find out where is the bug in my code and what information I am missing.

Sample output when using BATCH_SIZE = 1

Image 1

Sample output when using BATCH_SIZE = 2 image 2


Solution

  • The information that you are missing is that you don't understand that there are data format differences for the input data expected for a C2C transform vs. C2R (or R2C).

    You should start by reading this section and this section of the CUFFT documentation.

    Note that it says:

    Each of those functions demands different input data layout

    But you are passing input data that was correct for a C2C transform directly to a C2R transform. That won't work.

    The most direct solution IMO is to convert all of your work to C2C transform types. The C2C transform can support both forward (e.g. "real-to-complex") and inverse (e.g. "complex-to-real"). The C2R transform type you are using can also support "complex-to-real", but the data arrangement you would use for C2R differs from the data arrangement you would use for C2C with the inverse path specified, for what is otherwise the same transform. You have not accounted for this.

    Here is a worked example showing a modified version of your code that uses C2C for both the forward and inverse paths, and correctly reproduces the input for a batch size of 2:

    $ cat t19.cu
    #include <cufft.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <ctime>
    #include <iostream>
    
    typedef float2 Complex;
    
    void iTest(int argc, char** argv);
    
    #define SIGNAL_SIZE  9
    #define BATCH_SIZE 2
    
    int main(int argc, char** argv) {
    
        iTest(argc, argv);
        return 0;
    
    }
    
    void iProcess(Complex *x, double *y, size_t n) {
    
        cufftComplex *deviceData;
        cudaMalloc(reinterpret_cast<void**>(&deviceData),
                   SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
        cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
                   cudaMemcpyHostToDevice);
    
        cufftResult cufftStatus;
        cufftHandle handle;
        cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
        if (cufftStatus != cudaSuccess) {
           printf("cufftPlan1d failed!");
        }
    
        cufftComplex *d_complex;
        cudaMalloc(reinterpret_cast<void**>(&d_complex),
                   sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
    
        cufftStatus = cufftExecC2C(handle,  deviceData, d_complex, CUFFT_FORWARD);
        if (cufftStatus != cudaSuccess) {
          printf("cufftExecR2C failed!");
        }
    
        cufftComplex *hostOutputData = (cufftComplex*)malloc(
           (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));
    
        cudaMemcpy(hostOutputData, d_complex,
                   SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
                   cudaMemcpyDeviceToHost);
    
        std::cout << "\nPrinting COMPLEX"  << "\n";
        for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
           printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);
    
    
        //! convert complex to real
    
    /*    cufftHandle c2r_handle;
        cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
        if (cufftStatus != cudaSuccess) {
           printf("cufftPlan1d failed!");
        }
    */
        cufftComplex *d_odata;
        cudaMalloc(reinterpret_cast<void**>(&d_odata),
                   sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
        cufftStatus = cufftExecC2C(handle,  d_complex, d_odata, CUFFT_INVERSE);
    
        cufftComplex odata[SIGNAL_SIZE * BATCH_SIZE];
        cudaMemcpy(odata, d_odata, sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE,
                   cudaMemcpyDeviceToHost);
    
        std::cout << "\nPrinting REAL"  << "\n";
        for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
           std::cout << i << " \t" << odata[i].x/(SIGNAL_SIZE)  << "\n";
        }
    
    
        cufftDestroy(handle);
        cudaFree(deviceData);
    }
    
    void iTest(int argc, char** argv) {
    
        Complex* h_signal = reinterpret_cast<Complex*>(
           malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));
    
        std::cout << "\nPrinting INPUT"  << "\n";
        for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
           h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
           h_signal[i].y = 0;
    
           std::cout << i << "\t" << h_signal[i].x  << "\n";
        }
        std::cout  << "\n";
    
        double y[SIGNAL_SIZE * BATCH_SIZE];
        iProcess(h_signal, y, 1);
    
    }
    $ nvcc -arch=sm_61 -o t19 t19.cu -lcufft
    t19.cu: In function ‘void iProcess(Complex*, double*, size_t)’:
    t19.cu:34:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
         if (cufftStatus != cudaSuccess) {
                                    ^
    t19.cu:43:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
         if (cufftStatus != cudaSuccess) {
                                    ^
    $ cuda-memcheck ./t19
    ========= CUDA-MEMCHECK
    
    Printing INPUT
    0       0.840188
    1       0.394383
    2       0.783099
    3       0.79844
    4       0.911647
    5       0.197551
    6       0.335223
    7       0.76823
    8       0.277775
    9       0.55397
    10      0.477397
    11      0.628871
    12      0.364784
    13      0.513401
    14      0.95223
    15      0.916195
    16      0.635712
    17      0.717297
    
    
    Printing COMPLEX
    0        5.306536        0.000000
    1        0.015338        -0.734991
    2        -0.218001       0.740248
    3        0.307508        -0.706533
    4        1.022732        0.271765
    5        1.022732        -0.271765
    6        0.307508        0.706533
    7        -0.218001       -0.740248
    8        0.015338        0.734991
    9        5.759857        0.000000
    10       -0.328981       0.788566
    11       0.055356        -0.521014
    12       -0.127504       0.581872
    13       0.014066        0.123027
    14       0.014066        -0.123027
    15       -0.127504       -0.581872
    16       0.055356        0.521014
    17       -0.328981       -0.788566
    
    Printing REAL
    0       0.840188
    1       0.394383
    2       0.783099
    3       0.79844
    4       0.911647
    5       0.197551
    6       0.335223
    7       0.76823
    8       0.277775
    9       0.55397
    10      0.477397
    11      0.628871
    12      0.364784
    13      0.513401
    14      0.95223
    15      0.916195
    16      0.635712
    17      0.717297
    ========= ERROR SUMMARY: 0 errors
    $