I'm following a tutorial on using the cuFFT
library here: http://gpgpu.org/static/sc2007/SC07_CUDA_3_Libraries.pdf
After following line by line of its code, I'm getting really strange results.
I have input data that is an NxN
array of float
s. The program does a FFT
forward transform, solves Poisson's equation, and then does an inverse FFT
. The input data (and output data) is referred to as a square image with sidelength N
. When I comment out solve_poisson <<<dimGrid, dimBlock>>> (r_complex_d, kx_d, ky_d, N);
, it correctly forward transforms the data and then performs an inverse transform, which causes the output data to be the same as the input data. This is supposed to happen.
Here is the output without calling the solve_poisson
method.
0 r_initial: 0.00125126 r: 0.00125132
1 r_initial: 0.563585 r: 0.563585
2 r_initial: 0.193304 r: 0.193304
3 r_initial: 0.80874 r: 0.80874
4 r_initial: 0.585009 r: 0.585009
5 r_initial: 0.479873 r: 0.479873
6 r_initial: 0.350291 r: 0.350291
7 r_initial: 0.895962 r: 0.895962
8 r_initial: 0.82284 r: 0.82284
9 r_initial: 0.746605 r: 0.746605
10 r_initial: 0.174108 r: 0.174108
11 r_initial: 0.858943 r: 0.858943
12 r_initial: 0.710501 r: 0.710502
13 r_initial: 0.513535 r: 0.513535
14 r_initial: 0.303995 r: 0.303995
15 r_initial: 0.0149846 r: 0.0149846
Press any key to continue . . .
However, when I uncomment out the solve_poisson
method, the output data is inf
or nan
, which leads me to believe that the scale variable was somehow close to zero in the solve_poisson
method.
So I changed float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]);
to float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f
. This change is not in the original tutorial. The results computed here are not supposed to have extreme positive or negative values.
0 r_initial: 0.00125126 r: -11448.1
1 r_initial: 0.563585 r: 11449.3
2 r_initial: 0.193304 r: -11448.3
3 r_initial: 0.80874 r: 11449.2
4 r_initial: 0.585009 r: 11449.4
5 r_initial: 0.479873 r: -11448.4
6 r_initial: 0.350291 r: 11449.5
7 r_initial: 0.895962 r: -11448.6
8 r_initial: 0.82284 r: -11448.5
9 r_initial: 0.746605 r: 11449.4
10 r_initial: 0.174108 r: -11448.3
11 r_initial: 0.858943 r: 11449.3
12 r_initial: 0.710501 r: 11449.2
13 r_initial: 0.513535 r: -11448.4
14 r_initial: 0.303995 r: 11449.3
15 r_initial: 0.0149846 r: -11448.1
Press any key to continue . . .
In the tutorial, a sample calculation on slide 43
on page 22
is computed=0.975879 reference=0.975882
, yet my results are completely different and really large.
The following code is what I used.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <stdlib.h>
#include <iostream>
#define N 4 //4 X 4 // N is the sidelength of the image -> 16 pixels in entire image
#define block_size_x 2
#define block_size_y 2
__global__ void real2complex(cufftComplex *c, float *a, int n);
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n);
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n);
int main()
{
float *kx, *ky, *r;
kx = (float *)malloc(sizeof(float) * N);
ky = (float *)malloc(sizeof(float) * N);
r = (float *)malloc(sizeof(float) * N * N);
float *kx_d, *ky_d, *r_d;
cufftComplex *r_complex_d;
cudaMalloc((void **)&kx_d, sizeof(float) * N);
cudaMalloc((void **)&ky_d, sizeof(float) * N);
cudaMalloc((void **)&r_d, sizeof(float) * N * N);
cudaMalloc((void **)&r_complex_d, sizeof(cufftComplex) * N * N);
for (int y = 0; y < N; y++)
for (int x = 0; x < N; x++)
r[x + y * N] = rand() / (float)RAND_MAX;
//r[x + y * N] = sin(exp(-((x - N / 2.0f) * (x - N / 2.0f) + (N / 2.0f - y) * (N / 2.0f - y)) / (20 * 20))) * 255 / sin(1); //Here is sample data that will high values at the center of the image and low values as you go farther and farther away from the center.
float* r_inital = (float *)malloc(sizeof(float) * N * N);
for (int i = 0; i < N * N; i++)
r_inital[i] = r[i];
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
cudaMemcpy(kx_d, kx, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ky_d, ky, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(r_d, r, sizeof(float) * N * N, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_C2C);
/* Compute the execution configuration, block_size_x*block_size_y = number of threads */
dim3 dimBlock(block_size_x, block_size_y);
dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
/* Handle N not multiple of block_size_x or block_size_y */
if (N % block_size_x != 0) dimGrid.x += 1;
if (N % block_size_y != 0) dimGrid.y += 1;
real2complex << < dimGrid, dimBlock >> > (r_complex_d, r_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
solve_poisson << <dimGrid, dimBlock >> > (r_complex_d, kx_d, ky_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);
float scale = 1.0f / (N * N);
complex2real_scaled << <dimGrid, dimBlock >> > (r_d, r_complex_d, scale, N);
cudaMemcpy(r, r_d, sizeof(float) * N * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N * N; i++)
std::cout << i << "\tr_initial: " << r_inital[i] << "\tr: " << r[i] << std::endl;
system("pause");
/* Destroy plan and clean up memory on device*/
free(kx);
free(ky);
free(r);
free(r_inital);
cufftDestroy(plan);
cudaFree(r_complex_d);
cudaFree(kx_d);
}
__global__ void real2complex(cufftComplex *c, float *a, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
c[index].x = a[index];
c[index].y = 0.0f;
}
}
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
a[index] = scale * c[index].x;
}
}
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f;
if (idx == 0 && idy == 0) scale = 1.0f;
scale = 1.0f / scale;
c[index].x *= scale;
c[index].y *= scale;
}
}
Is there anything I messed up on? I would really appreciate if anyone could help me out.
As it shows in the tutorial, the Matlab implementation on slide 33 on page 17 shows that the Poisson calculations are based on the top left corner of the screen as the origin. The x and y data values are then x = (0:(N-1))*h;
and y = (0:(N-1))*h;
, which is why the meshgrid created from these x and y values both start from 0 and increase, as shown on the graph's x and y axes on slide 31 on page 16. In this case, where the image length was 1 (I refer to the input data of the NxN float array or the meshgrid as an image), the center of the image is actually (0.5, 0.5). I wanted to translate these points, so the center point would instead be (0,0) and followed a typical representation of the Cartesian Plane.
So in my code, instead of the Matlab code of
x = (0:(N-1))*h;
y = (0:(N-1))*h;
which could be implemented as
for (int i = 0; i < N; i++)
{
kx[i] = i;
ky[i] = i;
}
I replaced it with
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
However, I had forgot to change the Poisson calculation so it recognizes the center of the image as the origin instead of the top right corner as the origin. So as Mr. Robert Crovella said, I would have to
change this line:
if (idx == 0 && idy == 0) scale = 1.0f;
to this:if (idx == 2 && idy == 2) scale = 1.0f;
for the case where the image length, or N, is 4.
To generalize this for any image length, this line of code could be then changed to if (idx == n/2 && idy == n/2) scale = 1.0f;