Incorrect output when transforming from complex to real number using cuda cuFFT

I am using cuda version 7.5 cufft to perform some FFT and inverse FFT. I have a problem when performing inverse FFT using cufftExecC2R(.,.) function.

Actually, when I use a batch_size = 1 in the cufftPlan1d(,) I get correct result. However, when I increase the batch size the results is incorrect.

I am pasting a sample minimal code to illustrate this. Please ignore the dirtiness of the code as I just quickly created this.

  #include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctime>
#include <iostream>

typedef float2 Complex;

void iTest(int argc, char** argv);

#define SIGNAL_SIZE  9
#define BATCH_SIZE 2

int main(int argc, char** argv) {

    iTest(argc, argv);
    return 0;

}

void iProcess(Complex *x, double *y, size_t n) {

    cufftComplex *deviceData;
    cudaMalloc(reinterpret_cast<void**>(&deviceData),
               SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyHostToDevice);

    cufftResult cufftStatus;
    cufftHandle handle;
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }

    cufftComplex *d_complex;
    cudaMalloc(reinterpret_cast<void**>(&d_complex),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);

    cufftStatus = cufftExecC2C(handle,  deviceData, d_complex, CUFFT_FORWARD);
    if (cufftStatus != cudaSuccess) {
      printf("cufftExecR2C failed!");
    }

    cufftComplex *hostOutputData = (cufftComplex*)malloc(
       (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));

    cudaMemcpy(hostOutputData, d_complex,
               SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting COMPLEX"  << "\n";
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
       printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);


    //! convert complex to real

    cufftHandle c2r_handle;
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }

    cufftReal *d_odata;
    cudaMalloc(reinterpret_cast<void**>(&d_odata),
               sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2R(c2r_handle,  d_complex, d_odata);

    cufftReal odata[SIGNAL_SIZE * BATCH_SIZE];
    cudaMemcpy(odata, d_odata, sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting REAL"  << "\n";
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
       std::cout << i << " \t" << odata[i]/(SIGNAL_SIZE)  << "\n";
    }


    cufftDestroy(handle);
    cudaFree(deviceData);
}

void iTest(int argc, char** argv) {

    Complex* h_signal = reinterpret_cast<Complex*>(
       malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));

    std::cout << "\nPrinting INPUT"  << "\n";
    for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
       h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
       h_signal[i].y = 0;

       std::cout << i << "\t" << h_signal[i].x  << "\n";
    }
    std::cout  << "\n";

    double y[SIGNAL_SIZE * BATCH_SIZE];
    iProcess(h_signal, y, 1);

}

I cannot find out where is the bug in my code and what information I am missing.

Sample output when using BATCH_SIZE = 1

Sample output when using BATCH_SIZE = 2

Solution

The information that you are missing is that you don't understand that there are data format differences for the input data expected for a C2C transform vs. C2R (or R2C).

You should start by reading this section and this section of the CUFFT documentation.

Note that it says:

Each of those functions demands different input data layout

But you are passing input data that was correct for a C2C transform directly to a C2R transform. That won't work.

The most direct solution IMO is to convert all of your work to C2C transform types. The C2C transform can support both forward (e.g. "real-to-complex") and inverse (e.g. "complex-to-real"). The C2R transform type you are using can also support "complex-to-real", but the data arrangement you would use for C2R differs from the data arrangement you would use for C2C with the inverse path specified, for what is otherwise the same transform. You have not accounted for this.

Here is a worked example showing a modified version of your code that uses C2C for both the forward and inverse paths, and correctly reproduces the input for a batch size of 2:

$ cat t19.cu
#include <cufft.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <ctime>
#include <iostream>

typedef float2 Complex;

void iTest(int argc, char** argv);

#define SIGNAL_SIZE  9
#define BATCH_SIZE 2

int main(int argc, char** argv) {

    iTest(argc, argv);
    return 0;

}

void iProcess(Complex *x, double *y, size_t n) {

    cufftComplex *deviceData;
    cudaMalloc(reinterpret_cast<void**>(&deviceData),
               SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex));
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyHostToDevice);

    cufftResult cufftStatus;
    cufftHandle handle;
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }

    cufftComplex *d_complex;
    cudaMalloc(reinterpret_cast<void**>(&d_complex),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);

    cufftStatus = cufftExecC2C(handle,  deviceData, d_complex, CUFFT_FORWARD);
    if (cufftStatus != cudaSuccess) {
      printf("cufftExecR2C failed!");
    }

    cufftComplex *hostOutputData = (cufftComplex*)malloc(
       (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex));

    cudaMemcpy(hostOutputData, d_complex,
               SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting COMPLEX"  << "\n";
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++)
       printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y);


    //! convert complex to real

/*    cufftHandle c2r_handle;
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE);
    if (cufftStatus != cudaSuccess) {
       printf("cufftPlan1d failed!");
    }
*/
    cufftComplex *d_odata;
    cudaMalloc(reinterpret_cast<void**>(&d_odata),
               sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE);
    cufftStatus = cufftExecC2C(handle,  d_complex, d_odata, CUFFT_INVERSE);

    cufftComplex odata[SIGNAL_SIZE * BATCH_SIZE];
    cudaMemcpy(odata, d_odata, sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE,
               cudaMemcpyDeviceToHost);

    std::cout << "\nPrinting REAL"  << "\n";
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) {
       std::cout << i << " \t" << odata[i].x/(SIGNAL_SIZE)  << "\n";
    }


    cufftDestroy(handle);
    cudaFree(deviceData);
}

void iTest(int argc, char** argv) {

    Complex* h_signal = reinterpret_cast<Complex*>(
       malloc(sizeof(Complex) * SIGNAL_SIZE * BATCH_SIZE));

    std::cout << "\nPrinting INPUT"  << "\n";
    for (unsigned int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; ++i) {
       h_signal[i].x = rand() / static_cast<float>(RAND_MAX);
       h_signal[i].y = 0;

       std::cout << i << "\t" << h_signal[i].x  << "\n";
    }
    std::cout  << "\n";

    double y[SIGNAL_SIZE * BATCH_SIZE];
    iProcess(h_signal, y, 1);

}
$ nvcc -arch=sm_61 -o t19 t19.cu -lcufft
t19.cu: In function ‘void iProcess(Complex*, double*, size_t)’:
t19.cu:34:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
     if (cufftStatus != cudaSuccess) {
                                ^
t19.cu:43:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare]
     if (cufftStatus != cudaSuccess) {
                                ^
$ cuda-memcheck ./t19
========= CUDA-MEMCHECK

Printing INPUT
0       0.840188
1       0.394383
2       0.783099
3       0.79844
4       0.911647
5       0.197551
6       0.335223
7       0.76823
8       0.277775
9       0.55397
10      0.477397
11      0.628871
12      0.364784
13      0.513401
14      0.95223
15      0.916195
16      0.635712
17      0.717297


Printing COMPLEX
0        5.306536        0.000000
1        0.015338        -0.734991
2        -0.218001       0.740248
3        0.307508        -0.706533
4        1.022732        0.271765
5        1.022732        -0.271765
6        0.307508        0.706533
7        -0.218001       -0.740248
8        0.015338        0.734991
9        5.759857        0.000000
10       -0.328981       0.788566
11       0.055356        -0.521014
12       -0.127504       0.581872
13       0.014066        0.123027
14       0.014066        -0.123027
15       -0.127504       -0.581872
16       0.055356        0.521014
17       -0.328981       -0.788566

Printing REAL
0       0.840188
1       0.394383
2       0.783099
3       0.79844
4       0.911647
5       0.197551
6       0.335223
7       0.76823
8       0.277775
9       0.55397
10      0.477397
11      0.628871
12      0.364784
13      0.513401
14      0.95223
15      0.916195
16      0.635712
17      0.717297
========= ERROR SUMMARY: 0 errors
$