Search code examples
c++ cuda gpu nvcc

FFT calculation using GPU: unable to compile program with recursion


I am trying to learn GPU programming. My system environment is as follows:

OS: Windows 10 Pro
GPU: NVIDIA GTX 1080 Ti (the display does not run on this; there is another GPU for that)
CUDA toolkit: v9.1

I wrote this simple program using CUDA to calculate an FFT from scratch on a GPU. The algorithm follows the Wikipedia example of the Cooley-Tukey algorithm. The code uses recursive functions to calculate the FFT of an array of complex values.

#include <iostream>
#include <string>
#include "conio.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust\complex.h>

#include <cstdio>
#include <fstream>
using namespace std;
#define winSize 2048
#define winShift 1024

#define M_PI 3.14159265358979323846

// Reorders a[0..n) in place so that the even-indexed elements occupy the
// lower half and the odd-indexed elements occupy the upper half, keeping
// the relative order inside each half.
//
//   a : array of at least n elements, modified in place
//   n : number of elements to reorder (only the first 2*(n/2) are touched)
//
// Uses a temporary buffer of n/2 elements obtained with device-side new[].
// NOTE(review): device-side new[] may yield a null pointer when the device
// heap is exhausted - confirm and size the heap (cudaLimitMallocHeapSize)
// for large n.
__device__ void separate(thrust::complex<double>* a, int n)
{
    const int half = n / 2;

    thrust::complex<double>* b = new thrust::complex<double>[half];  // temp heap storage
    for (int i = 0; i < half; i++)    // stash all odd elements in the temp buffer
        b[i] = a[i * 2 + 1];
    for (int i = 0; i < half; i++)    // compact all even elements into the lower half of a[]
        a[i] = a[i * 2];
    for (int i = 0; i < half; i++)    // place the stashed odd elements in the upper half of a[]
        a[i + half] = b[i];

    // Memory obtained with new[] must be released with delete[]; cudaFree
    // is only valid for pointers returned by cudaMalloc.
    delete[] b;
}

// Recursive radix-2 Cooley-Tukey FFT executed sequentially by the calling
// thread. Private helper of the fft2 kernel.
//
// The recursion is done with ordinary device function calls instead of
// nested kernel launches: a child kernel launch is asynchronous, so the
// combine loop below would otherwise read X[] before the two half-size
// transforms had finished.
// NOTE(review): recursion depth is log2(N); for large N the per-thread
// stack (cudaLimitStackSize) may need to be raised - confirm.
__device__ void fft2Device(thrust::complex<double>* X, int N)
{
    if (N < 2)
    {
        // bottom of recursion: a 1-point FFT is the sample itself
        return;
    }

    const int half = N / 2;

    separate(X, N);               // all evens to lower half, all odds to upper half
    fft2Device(X, half);          // transform the even items
    fft2Device(X + half, half);   // transform the odd items

    // combine results of the two half-size transforms
    for (int k = 0; k < half; k++)
    {
        const thrust::complex<double> e = X[k];          // even
        const thrust::complex<double> o = X[k + half];   // odd
        // w is the "twiddle-factor"
        const thrust::complex<double> w =
            thrust::exp(thrust::complex<double>(0, -2.*M_PI*k / N));
        X[k] = e + w * o;
        X[k + half] = e - w * o;
    }
}

// In-place FFT of N complex samples.
//
//   X : device pointer to N samples; on return holds the N FFT outputs
//   N : number of samples. N must be a power-of-2, or bad things will
//       happen. Currently no check for this condition.
//
// For real-valued input, only the first N/2 results carry information
// (the upper half of X[] is a reflection of the lower half).
//
// Launch layout: the algorithm is purely sequential, so launch with
// <<<1, 1>>>. Every thread of a larger launch would run the whole
// transform on the same X[] and race with the others.
__global__ void fft2(thrust::complex<double>* X, int N)
{
    fft2Device(X, N);
}



// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds a test signal from a few sinusoids, runs the fft2 kernel on it,
// prints the samples and FFT magnitudes, and writes the same data as CSV
// to example_cuda.txt. Returns 0 on success, 1 if any CUDA call fails.
int main()
{
    const int nSamples = 64;
    double nSeconds = 0.02;                      // total time for sampling
    double sampleRate = nSamples / nSeconds;    // n Hz = n / second
    double freqResolution = sampleRate / nSamples; // freq step in FFT result
    thrust::complex<double> x[nSamples];                // storage for sample data
    thrust::complex<double> X[nSamples];                // storage for FFT answer
    thrust::complex<double> *d_arr1 = nullptr;          // device copy of X[]
    const int nFreqs = 5;
    double freq[nFreqs] = { 2,4,8,32,72 }; // known freqs for testing

    // Size the buffer with the element type that is actually stored.
    const size_t n_byte = nSamples * sizeof(thrust::complex<double>);

    // generate samples for testing
    for (int i = 0; i < nSamples; i++)
    {
        x[i] = thrust::complex<double>(0., 0.);
        // Sum several known sinusoids into x[]. The phase must be divided
        // by nSamples: without it every term is sin(2*pi*integer), i.e.
        // numerically zero, and the test signal is empty.
        for (int j = 0; j < nFreqs; j++)
            x[i] += sin(2 * M_PI*freq[j] * i / nSamples);
        X[i] = x[i];        // copy into X[] for FFT work & result
    }

    // compute fft for this data on the device
    bool ok = cudaOk(cudaMalloc((void**)&d_arr1, n_byte), "cudaMalloc");
    ok = ok && cudaOk(cudaMemcpy(d_arr1, X, n_byte, cudaMemcpyHostToDevice),
                      "copy of samples to device");
    if (ok)
    {
        fft2 << <1, 1 >> > (d_arr1, nSamples);
        // A launch returns no status itself: query the launch error, then
        // synchronize to surface any fault raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "fft2 launch")
          && cudaOk(cudaDeviceSynchronize(), "fft2 execution");
    }
    ok = ok && cudaOk(cudaMemcpy(X, d_arr1, n_byte, cudaMemcpyDeviceToHost),
                      "copy of result to host");

    // Release the device buffer on every path; cudaFree accepts a null pointer.
    cudaFree(d_arr1);
    if (!ok)
        return 1;

    printf("  n\tx[]\tX[]\tf\n");       // header line
    // loop to print values
    for (int i = 0; i < nSamples; i++)
    {
        printf("% 3d\t%+.3f\t%+.3f\t%g\n",
            i, x[i].real(), thrust::abs(X[i]), i*freqResolution);
    }

    ofstream myfile("example_cuda.txt");
    printf("I am trying to write to file\n");
    if (myfile.is_open())
    {
        for (int count = 0; count < nSamples; count++)
        {
            myfile << x[count].real() << "," << thrust::abs(X[count]) << "," << count*freqResolution << "\n";
        }
        myfile.close();
    }
    else
    {
        fprintf(stderr, "Could not open example_cuda.txt for writing\n");
    }
    return 0;
}

I used the following command to compile the code using VS2015 command prompt:

nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu

The compilation itself doesn't show any errors or warnings, but the executable does not run. When I try to run

fft_Wiki2.exe

it simply says the version of this executable is incompatible with the 64 bit Windows version and so cannot execute. But I am using the --machine 64 option to force the executable version.

How do I get this program to execute?


Solution

  • How do I get this program to execute ?

    It isn't a program you are trying to run, it is an object file.

    In your compilation command you pass -c:

    nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
    

    which means only compilation and no linking. What you would need to do is something like this:

    nvcc -o fft_Wiki2.obj -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
    
    nvcc -o fft_Wiki2.exe -arch=compute_35 --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.obj
    

    [Note I don't have access to a Windows development platform to check the accuracy of the commands]

    The first command compiles and emits an object file. The second performs both host and device code linking and emits an executable, which you should be able to run.