How to: CUDA IFFT

In Matlab when, I enter a one dimensional array of complex numbers, I have an output of arrays with real numbers of same size and same dimension. Trying to repeat this in CUDA C, but have different output. Can you please help? In Matlab, when I enter ifft(array)

My arrayOfComplexNmbers:

[4.6500 + 0.0000i   0.5964 - 1.4325i   0.4905 - 0.5637i   0.4286 - 0.2976i   0.4345 - 0.1512i   0.4500 + 0.0000i   0.4345 + 0.1512i  0.4286 + 0.2976i   0.4905 + 0.5637i   0.5964 + 1.4325i]

My arrayOfRealNumbers:

[ 0.9000    0.8000    0.7000    0.6000    0.5000    0.4000    0.3000    0.2000    0.1500    0.1000]

When I enter ifft(arrayOfComplexNmbers) in Matlab, my output is arrayOfRealNumbers. Thank you! This is my CUDA code for this:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define NX 256
#define NY 128
#define NRANK 2
#define BATCH 1
#define SIGNAL_SIZE 10

typedef float2 Complex;
__global__ void printCUDAVariables_1(cufftComplex *cudaSignal){
int index = threadIdx.x + blockIdx.x*blockDim.x;    
printf("COMPLEX CUDA %d %f %f \n", index, cudaSignal[index].x, cudaSignal[index].y);
}

__global__ void printCUDAVariables_2(cufftReal *cudaSignal){
int index = threadIdx.x + blockIdx.x*blockDim.x;
printf("REAL CUDA %d %f \n", index, cudaSignal);
}


int main() {
cufftHandle plan;
//int n[NRANK] = { NX, NY };
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE);
float *r_signal = 0;
if (r_signal != 0){
    r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float));
}
else{
    r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float));
}
int mem_size = sizeof(Complex)* SIGNAL_SIZE * 2;

h_signal[0].x = (float)4.65;
h_signal[0].y = (float)0;

h_signal[1].x = (float)0.5964;
h_signal[1].y = (float)0;

h_signal[2].x = (float)4.65;
h_signal[2].y = (float)-1.4325;

h_signal[3].x = (float)0.4905;
h_signal[3].y = (float)0.5637;

h_signal[4].x = (float)0.4286;
h_signal[4].y = (float)-0.2976;

h_signal[5].x = (float)0.4345;
h_signal[5].y = (float)-0.1512;

h_signal[6].x = (float)0.45;
h_signal[6].y = (float)0;

h_signal[7].x = (float)0.4345;
h_signal[7].y = (float)-0.1512;

h_signal[8].x = (float)0.4286;
h_signal[8].y = (float)0.2976;

h_signal[9].x = (float)0.4905;
h_signal[9].y = (float)-0.5637;

h_signal[10].x = (float)0.5964;
h_signal[10].y = (float)1.4325;
//for (int i = 0; i < SIGNAL_SIZE; i++){
//  printf("RAW %f %f\n", h_signal[i].x, h_signal[i].y);
//}
//allocate device memory for signal
cufftComplex *d_signal, *d_signal_out;
cudaMalloc(&d_signal, mem_size);    
cudaMalloc(&d_signal_out, mem_size);
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);
printCUDAVariables_1 << <10, 1 >> >(d_signal);
//cufftReal *odata;
//cudaMalloc((void **)&odata, sizeof(cufftReal)*NX*(NY / 2 + 1));

//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);    
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE);
//cufftExecC2R(plan, d_signal, odata);
cudaDeviceSynchronize();
printCUDAVariables_1 << <10, 1 >> >(d_signal_out);
//printCUDAVariables_2 << <10, 1 >> >(odata);
//cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*2*sizeof(float), cudaMemcpyDeviceToHost);

cufftDestroy(plan);
cudaFree(d_signal);
cudaFree(d_signal_out);

return 0;
}

Solution

When computing ifft with MATLAB, the default behavior is as follows:

No zero padding of the input signal
No scaling of the output signal

Your CUFFT code is correct in flow, but a bit different parameters compared to MATLAB are causing the current output.

The NX constant to be specific is causing the input signal to be zero-padded to a length of 256. To achieve MATLAB's behavior, keep NX equal to SIGNAL_SIZE.
CUFFT multiples the output signal values with the length of the input signal. You have to divide the output values with SIGNAL_SIZE to get the actual values.
Another important issue is that you are performing out-of-bound access at the time of initializing the input signal. The signal length is 10, but you are initializing the value at 10th index which is out of bound. I assume that this may be due to confusion caused by 1 based indexing of MATLAB. The input signal must be initialized from 0 to SIGNAL_SIZE-1 indices only.
It is not recommended to visualize the signal using a CUDA kernel as the printing may be out of order. You should copy the results back to host and print them serially.

Here is the fixed code which is providing the same output as MATLAB.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "device_functions.h"

#define NX 10
#define NY 1
#define NRANK 1
#define BATCH 1
#define SIGNAL_SIZE 10

typedef float2 Complex;  

int main() 
{
cufftHandle plan;
//int n[NRANK] = { NX, NY };
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE);
float *r_signal = 0;
if (r_signal != 0)
{
    r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float));
}
else
{
    r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float));
}
int mem_size = sizeof(Complex)* SIGNAL_SIZE;

h_signal[0].x = (float)4.65;
h_signal[0].y = (float)0;

h_signal[1].x = (float)0.5964;
h_signal[1].y = (float)-1.4325;

h_signal[2].x = (float)0.4905;
h_signal[2].y = (float)-0.5637;

h_signal[3].x = (float)0.4286;
h_signal[3].y = (float)-0.2976;

h_signal[4].x = (float)0.4345;
h_signal[4].y = (float)-0.1512;

h_signal[5].x = (float)0.45;
h_signal[5].y = (float)0.0;

h_signal[6].x = (float)0.4345;
h_signal[6].y = (float)0.1512;

h_signal[7].x = (float)0.4286;
h_signal[7].y = (float)0.2976;

h_signal[8].x = (float)0.4905;
h_signal[8].y = (float)0.5637;

h_signal[9].x = (float)0.5964;
h_signal[9].y = (float)1.4325;

printf("\nInput:\n");
for(int i=0; i<SIGNAL_SIZE; i++)
{
    char op = h_signal[i].y < 0 ? '-' : '+';
    printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE ) );
}

//allocate device memory for signal
cufftComplex *d_signal, *d_signal_out;
cudaMalloc(&d_signal, mem_size);    
cudaMalloc(&d_signal_out, mem_size);
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);


//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);    
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);

cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE);

cudaDeviceSynchronize();

cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);


printf("\n\n-------------------------------\n\n");
printf("Output:\n");
for(int i=0; i<SIGNAL_SIZE; i++)
{
    char op = h_signal[i].y < 0 ? '-' : '+';
    printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE ) );
}

cufftDestroy(plan);
cudaFree(d_signal);
cudaFree(d_signal_out);

return 0;
}

The output is still in complex form but the imaginary components are close to zero. Also, the difference in precision of real component is because MATLAB uses double precision by default while this code is based on single precision values.

Compiled and tested with following command on Ubuntu 14.04, CUDA 8.0:

nvcc -o ifft ifft.cu -arch=sm_61 -lcufft

Compared output with MATLAB 2017a.

Program Output:

Input:
0.465000 + 0.000000i
0.059640 - 0.143250i
0.049050 - 0.056370i
0.042860 - 0.029760i
0.043450 - 0.015120i
0.045000 + 0.000000i
0.043450 + 0.015120i
0.042860 + 0.029760i
0.049050 + 0.056370i
0.059640 + 0.143250i


-------------------------------

Output:
0.900000 - 0.000000i
0.800026 - 0.000000i
0.699999 - 0.000000i
0.599964 - 0.000000i
0.500011 + 0.000000i
0.400000 + 0.000000i
0.299990 + 0.000000i
0.199993 + 0.000000i
0.150000 + 0.000000i
0.100018 - 0.000000i