In Matlab when, I enter a one dimensional array of complex numbers, I have an output of arrays with real numbers of same size and same dimension. Trying to repeat this in CUDA C, but have different output. Can you please help? In Matlab, when I enter ifft(array)
My arrayOfComplexNmbers:
[4.6500 + 0.0000i 0.5964 - 1.4325i 0.4905 - 0.5637i 0.4286 - 0.2976i 0.4345 - 0.1512i 0.4500 + 0.0000i 0.4345 + 0.1512i 0.4286 + 0.2976i 0.4905 + 0.5637i 0.5964 + 1.4325i]
My arrayOfRealNumbers:
[ 0.9000 0.8000 0.7000 0.6000 0.5000 0.4000 0.3000 0.2000 0.1500 0.1000]
When I enter ifft(arrayOfComplexNmbers)
in Matlab, my output is arrayOfRealNumbers
.
Thank you!
This is my CUDA code for this:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "device_functions.h"
#define NX 256
#define NY 128
#define NRANK 2
#define BATCH 1
#define SIGNAL_SIZE 10
typedef float2 Complex;
__global__ void printCUDAVariables_1(cufftComplex *cudaSignal){
int index = threadIdx.x + blockIdx.x*blockDim.x;
printf("COMPLEX CUDA %d %f %f \n", index, cudaSignal[index].x, cudaSignal[index].y);
}
__global__ void printCUDAVariables_2(cufftReal *cudaSignal){
int index = threadIdx.x + blockIdx.x*blockDim.x;
printf("REAL CUDA %d %f \n", index, cudaSignal);
}
int main() {
cufftHandle plan;
//int n[NRANK] = { NX, NY };
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE);
float *r_signal = 0;
if (r_signal != 0){
r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float));
}
else{
r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float));
}
int mem_size = sizeof(Complex)* SIGNAL_SIZE * 2;
h_signal[0].x = (float)4.65;
h_signal[0].y = (float)0;
h_signal[1].x = (float)0.5964;
h_signal[1].y = (float)0;
h_signal[2].x = (float)4.65;
h_signal[2].y = (float)-1.4325;
h_signal[3].x = (float)0.4905;
h_signal[3].y = (float)0.5637;
h_signal[4].x = (float)0.4286;
h_signal[4].y = (float)-0.2976;
h_signal[5].x = (float)0.4345;
h_signal[5].y = (float)-0.1512;
h_signal[6].x = (float)0.45;
h_signal[6].y = (float)0;
h_signal[7].x = (float)0.4345;
h_signal[7].y = (float)-0.1512;
h_signal[8].x = (float)0.4286;
h_signal[8].y = (float)0.2976;
h_signal[9].x = (float)0.4905;
h_signal[9].y = (float)-0.5637;
h_signal[10].x = (float)0.5964;
h_signal[10].y = (float)1.4325;
//for (int i = 0; i < SIGNAL_SIZE; i++){
// printf("RAW %f %f\n", h_signal[i].x, h_signal[i].y);
//}
//allocate device memory for signal
cufftComplex *d_signal, *d_signal_out;
cudaMalloc(&d_signal, mem_size);
cudaMalloc(&d_signal_out, mem_size);
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);
printCUDAVariables_1 << <10, 1 >> >(d_signal);
//cufftReal *odata;
//cudaMalloc((void **)&odata, sizeof(cufftReal)*NX*(NY / 2 + 1));
//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE);
//cufftExecC2R(plan, d_signal, odata);
cudaDeviceSynchronize();
printCUDAVariables_1 << <10, 1 >> >(d_signal_out);
//printCUDAVariables_2 << <10, 1 >> >(odata);
//cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*2*sizeof(float), cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(d_signal);
cudaFree(d_signal_out);
return 0;
}
When computing ifft
with MATLAB, the default behavior is as follows:
Your CUFFT code is correct in flow, but a bit different parameters compared to MATLAB are causing the current output.
NX
constant to be specific is causing the input signal to be
zero-padded to a length of 256. To achieve MATLAB's behavior, keep NX
equal to SIGNAL_SIZE
.SIGNAL_SIZE
to get the actual values.0
to SIGNAL_SIZE-1
indices only.Here is the fixed code which is providing the same output as MATLAB.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "device_functions.h"
#define NX 10
#define NY 1
#define NRANK 1
#define BATCH 1
#define SIGNAL_SIZE 10
typedef float2 Complex;
int main()
{
cufftHandle plan;
//int n[NRANK] = { NX, NY };
Complex *h_signal = (Complex *)malloc(sizeof(Complex)* SIGNAL_SIZE);
float *r_signal = 0;
if (r_signal != 0)
{
r_signal = (float*)realloc(r_signal, SIGNAL_SIZE * sizeof(float));
}
else
{
r_signal = (float*)malloc(SIGNAL_SIZE * sizeof(float));
}
int mem_size = sizeof(Complex)* SIGNAL_SIZE;
h_signal[0].x = (float)4.65;
h_signal[0].y = (float)0;
h_signal[1].x = (float)0.5964;
h_signal[1].y = (float)-1.4325;
h_signal[2].x = (float)0.4905;
h_signal[2].y = (float)-0.5637;
h_signal[3].x = (float)0.4286;
h_signal[3].y = (float)-0.2976;
h_signal[4].x = (float)0.4345;
h_signal[4].y = (float)-0.1512;
h_signal[5].x = (float)0.45;
h_signal[5].y = (float)0.0;
h_signal[6].x = (float)0.4345;
h_signal[6].y = (float)0.1512;
h_signal[7].x = (float)0.4286;
h_signal[7].y = (float)0.2976;
h_signal[8].x = (float)0.4905;
h_signal[8].y = (float)0.5637;
h_signal[9].x = (float)0.5964;
h_signal[9].y = (float)1.4325;
printf("\nInput:\n");
for(int i=0; i<SIGNAL_SIZE; i++)
{
char op = h_signal[i].y < 0 ? '-' : '+';
printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE ) );
}
//allocate device memory for signal
cufftComplex *d_signal, *d_signal_out;
cudaMalloc(&d_signal, mem_size);
cudaMalloc(&d_signal_out, mem_size);
cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice);
//cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2R, BATCH);
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
cufftExecC2C(plan, d_signal, d_signal_out, CUFFT_INVERSE);
cudaDeviceSynchronize();
cudaMemcpy(h_signal, d_signal_out, SIGNAL_SIZE*sizeof(Complex), cudaMemcpyDeviceToHost);
printf("\n\n-------------------------------\n\n");
printf("Output:\n");
for(int i=0; i<SIGNAL_SIZE; i++)
{
char op = h_signal[i].y < 0 ? '-' : '+';
printf("%f %c %fi\n", h_signal[i].x/SIGNAL_SIZE, op, fabsf(h_signal[i].y/SIGNAL_SIZE ) );
}
cufftDestroy(plan);
cudaFree(d_signal);
cudaFree(d_signal_out);
return 0;
}
The output is still in complex form but the imaginary components are close to zero. Also, the difference in precision of real component is because MATLAB uses double precision by default while this code is based on single precision values.
Compiled and tested with following command on Ubuntu 14.04, CUDA 8.0:
nvcc -o ifft ifft.cu -arch=sm_61 -lcufft
Compared output with MATLAB 2017a.
Input:
0.465000 + 0.000000i
0.059640 - 0.143250i
0.049050 - 0.056370i
0.042860 - 0.029760i
0.043450 - 0.015120i
0.045000 + 0.000000i
0.043450 + 0.015120i
0.042860 + 0.029760i
0.049050 + 0.056370i
0.059640 + 0.143250i
-------------------------------
Output:
0.900000 - 0.000000i
0.800026 - 0.000000i
0.699999 - 0.000000i
0.599964 - 0.000000i
0.500011 + 0.000000i
0.400000 + 0.000000i
0.299990 + 0.000000i
0.199993 + 0.000000i
0.150000 + 0.000000i
0.100018 - 0.000000i