Search code examples
cudacufft

cuFFT output not correct


I have a problem with this program:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cufft.h>
#include <cuComplex.h>

#define SIGNAL_SIZE        1024

int main(int argc, char **argv) {
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   // Allocate host memory for the signal
   cuDoubleComplex *h_signal = (cuDoubleComplex *) malloc(sizeof(cuDoubleComplex) * SIGNAL_SIZE);

   // Initalize the memory for the signal
   for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
      if((double)i/SIGNAL_SIZE>=0 && (double)i/SIGNAL_SIZE<0.5)  h_signal[i].x = (double)i/SIGNAL_SIZE;
      else if((double)i/SIGNAL_SIZE>=0.5 && (double)i/SIGNAL_SIZE<1)  h_signal[i].x = (double)i/SIGNAL_SIZE-1;
      h_signal[i].y = 0;
   }

// Allocate device memory for signal
   cuDoubleComplex *d_signal;

   cudaMalloc((void **) &d_signal, SIGNAL_SIZE*sizeof(cuDoubleComplex));
   // Copy host memory to device
   cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);


cudaEventRecord(start, 0);
   cufftHandle plan;
   cufftPlan1d(&plan, SIGNAL_SIZE , CUFFT_C2C, 1);

   // FFT computation
   cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal,
         CUFFT_FORWARD);

    cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_INVERSE);

   cuDoubleComplex *h_signal_inv =(cuDoubleComplex *) malloc(sizeof(cuDoubleComplex) * SIGNAL_SIZE);
   cudaMemcpy(h_signal_inv, d_signal, sizeof(cuDoubleComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost);


   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);

   float elapsedTime;
   cudaEventElapsedTime(&elapsedTime, start, stop);
   printf("Elapsed Time:  %3.1f ms\n", elapsedTime);


    for(int i=0;i<SIGNAL_SIZE;i++) printf("\n%f %f", h_signal[i].x, h_signal_inv[i].x);

    cufftDestroy(plan);

   free(h_signal);
   free(h_signal_inv);

   cudaFree(d_signal);

   cudaDeviceReset();
   return 0;
}

I'd like to transform a signal and then come back with the inverse, but the output is wrong in the first half.

Can you help me to find the errors?

Thank you very much!


Solution

  • You are getting your datatypes confused.

    cufftDoubleComplex is not the same as cufftComplex. When using cufftDoubleComplex, your transform type should be Z2Z, not C2C.

    Also, in order to see data parity when doing a forward transform followed by an inverse transform using CUFFT, it's necessary to divide the result by the signal size:

    cuFFT performs un-normalized FFTs; that is, performing a forward FFT on an input data set followed by an inverse FFT on the resulting set yields data that is equal to the input, scaled by the number of elements. Scaling either transform by the reciprocal of the size of the data set is left for the user to perform as seen fit.

    The following code has the above issues addressed and should give you better results:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <math.h>
    #include <cufft.h>
    #include <cuComplex.h>
    
    #define SIGNAL_SIZE        1024
    
    int main(int argc, char **argv) {
       cudaEvent_t start, stop;
       cudaEventCreate(&start);
       cudaEventCreate(&stop);
    
       // Allocate host memory for the signal
       cuDoubleComplex *h_signal = (cuDoubleComplex *) malloc(sizeof(cuDoubleComplex) * SIGNAL_SIZE);
    
       // Initalize the memory for the signal
       for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
          if((double)i/SIGNAL_SIZE>=0 && (double)i/SIGNAL_SIZE<0.5)  h_signal[i].x = (double)i/SIGNAL_SIZE;
          else if((double)i/SIGNAL_SIZE>=0.5 && (double)i/SIGNAL_SIZE<1)  h_signal[i].x = (double)i/SIGNAL_SIZE-1;
          h_signal[i].y = 0;
       }
    
    // Allocate device memory for signal
       cuDoubleComplex *d_signal;
    
       cudaMalloc((void **) &d_signal, SIGNAL_SIZE*sizeof(cuDoubleComplex));
       // Copy host memory to device
       cudaMemcpy(d_signal, h_signal, SIGNAL_SIZE*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    
    
    cudaEventRecord(start, 0);
       cufftHandle plan;
       cufftPlan1d(&plan, SIGNAL_SIZE , CUFFT_Z2Z, 1);
    
       // FFT computation
       cufftExecZ2Z(plan, d_signal, d_signal, CUFFT_FORWARD);
    
        cufftExecZ2Z(plan, d_signal, d_signal, CUFFT_INVERSE);
    
       cuDoubleComplex *h_signal_inv =(cuDoubleComplex *) malloc(sizeof(cuDoubleComplex) * SIGNAL_SIZE);
       cudaMemcpy(h_signal_inv, d_signal, sizeof(cuDoubleComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost);
    
    
       cudaEventRecord(stop, 0);
       cudaEventSynchronize(stop);
    
       float elapsedTime;
       cudaEventElapsedTime(&elapsedTime, start, stop);
       printf("Elapsed Time:  %3.1f ms\n", elapsedTime);
    
    
        for(int i=0;i<SIGNAL_SIZE;i++) printf("\n%f %f", h_signal[i].x, h_signal_inv[i].x/SIGNAL_SIZE);
    
        cufftDestroy(plan);
    
       free(h_signal);
       free(h_signal_inv);
    
       cudaFree(d_signal);
    
       cudaDeviceReset();
       return 0;
    }