Search code examples
matlabcudafftcufft

CUFFT: trying to implement row by row fft of a matrix


I am trying to replicate matlab fft functionality, where it does a row by row (or column by column) fft of a matrix. Each row would be one of the batches in the cufft plan.

I can get it working using cufftExecC2C (the commented out part in the code below works), but not cufftExecR2C. My code is using cufftPlan1d, but ideally I want to implement it using cufftPlanMany.

I am wondering what I'm doing wrong, and if there is a better way of doing this. Thank you.

// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>

#include <iostream>

#define NX 6
#define NY 5

void printArray(float *my_array);
void printComplexArray(float2 *my_array);

int main(){

/************************************************************ C2C ************************************************************/
/*  
    float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++){
            initial_array[NY * h + w].x = 0;
            initial_array[NY * h + w].y = 0;
        }
    }
    initial_array[NY*3 + 0].x = 1;
    initial_array[NY*5 + 0].x = 1;

    printComplexArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);

    cufftComplex *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
    cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_C2C, NX);

    cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);

    cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/

/************************************************************ R2C ************************************************************/

    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
    }

    initial_array[NY*3 + 0] = 1;

    printArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);

    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
    cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_R2C, NX);

    //                       ***** cufftPlanMany *****
    //int n[2] = {NX, NY};
    //cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);

    cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);

    cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    printComplexArray(transformed_array);

/************************************************************ R2C ************************************************************/

    cufftDestroy(plan);
    free(initial_array);
    free(transformed_array);
    cudaFree(gpu_initial_array);
    cudaFree(gpu_transformed_array);

    std::system("pause");
    return 0;
}

void printArray(float *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            std::cout << my_array[NY * h + w] << " | ";
        std::cout << std::endl; 
    }
    std::cout << std::endl;     
}

void printComplexArray(float2 *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            std::cout << my_array[NY * h + w].x << " + " << my_array[NY * h + w].y << " | ";
        std::cout << std::endl;
    }
    std::cout << std::endl; 
}

Solution

  • It seems that your issue resides in the way you print out the result. You cannot use the same routine to print for the two cases of CUFFT_R2C and CUFFT_C2C. In the former case, you have a (NY/2+1)*NX sized output, while the the latter case you have a NY*NX sized output. The fixed code below should work.

    Also, it would be also good to add proper CUDA error check and CUFFT error check, which I have also added to the code below.

    #include <stdio.h>
    #include <stdlib.h>
    #include <cufft.h>
    #include <cuda_runtime.h>
    #include <assert.h>
    
    #include <iostream>
    
    #define NX 6
    #define NY 5
    
    void printArray(float *my_array);
    void printComplexSymmetricArray(float2 *my_array);
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    /*********************/
    /* CUFFT ERROR CHECK */
    /*********************/
    static const char *_cudaGetErrorEnum(cufftResult error)
    {
        switch (error)
        {
            case CUFFT_SUCCESS:
                return "CUFFT_SUCCESS";
    
            case CUFFT_INVALID_PLAN:
                return "CUFFT_INVALID_PLAN";
    
            case CUFFT_ALLOC_FAILED:
                return "CUFFT_ALLOC_FAILED";
    
            case CUFFT_INVALID_TYPE:
                return "CUFFT_INVALID_TYPE";
    
            case CUFFT_INVALID_VALUE:
                return "CUFFT_INVALID_VALUE";
    
            case CUFFT_INTERNAL_ERROR:
                return "CUFFT_INTERNAL_ERROR";
    
            case CUFFT_EXEC_FAILED:
                return "CUFFT_EXEC_FAILED";
    
            case CUFFT_SETUP_FAILED:
                return "CUFFT_SETUP_FAILED";
    
            case CUFFT_INVALID_SIZE:
                return "CUFFT_INVALID_SIZE";
    
            case CUFFT_UNALIGNED_DATA:
                return "CUFFT_UNALIGNED_DATA";
        }
    
        return "<unknown>";
    }
    
    #define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)
    
    inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
    {
        if( CUFFT_SUCCESS != err) {
            fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                                _cudaGetErrorEnum(err)); \
            cudaDeviceReset(); assert(0); \
        }
    }
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
        for (int h = 0; h < NX; h++){
            for (int w = 0; w < NY; w++)
                initial_array[NY * h + w] = 0;
            }
    
        initial_array[NY*3 + 0] = 1;
    
        printArray(initial_array);
    
        float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);
    
        cufftReal *gpu_initial_array;
        cufftComplex *gpu_transformed_array;
    
        gpuErrchk(cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal)));
        gpuErrchk(cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex)));
    
        gpuErrchk(cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice));
    
        cufftHandle plan;
        cufftSafeCall(cufftPlan1d(&plan, NY, CUFFT_R2C, NX));
    
        cufftSafeCall(cufftExecR2C(plan, (cufftReal*)gpu_initial_array, (cufftComplex*)gpu_transformed_array));
    
        gpuErrchk(cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost));
    
        printComplexSymmetricArray(transformed_array);
    
        cufftSafeCall(cufftDestroy(plan));
        free(initial_array);
        free(transformed_array);
        gpuErrchk(cudaFree(gpu_initial_array));
        gpuErrchk(cudaFree(gpu_transformed_array));
    
        std::system("pause");
        return 0;
    }
    
    /***********************/
    /* PRINTOUT REAL ARRAY */
    /***********************/
    void printArray(float *my_array){
        for (int h = 0; h < NX; h++){
            for (int w = 0; w < NY; w++)
                std::cout << my_array[NY * h + w] << " | ";
                std::cout << std::endl; 
            }
        std::cout << std::endl;     
    }
    
    /************************************/
    /* PRINTOUT COMPLEX SYMMETRIC ARRAY */
    /************************************/
    void printComplexSymmetricArray(float2 *my_array){
        for (int h = 0; h < NX; h++){
            for (int w = 0; w < NY/2+1; w++)
                std::cout << my_array[(NY/2+1) * h + w].x << " + " << my_array[(NY/2+1) * h + w].y << " | ";
                std::cout << std::endl;
        }
        std::cout << std::endl; 
    }