CUFFT: trying to implement row by row fft of a matrix

I am trying to replicate matlab fft functionality, where it does a row by row (or column by column) fft of a matrix. Each row would be one of the batches in the cufft plan.

I can get it working using cufftExecC2C (the commented out part in the code below works), but not cufftExecR2C. My code is using cufftPlan1d, but ideally I want to implement it using cufftPlanMany.

I am wondering what I'm doing wrong, and if there is a better way of doing this. Thank you.

// linker -> input -> additional dependencies -> add 'cufft.lib'
// VC++ Directories -> include directories - > add 'C:\ProgramData\NVIDIA Corporation\CUDA Samples\v6.0\common\inc'

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>

#include <iostream>

#define NX 6
#define NY 5

void printArray(float *my_array);
void printComplexArray(float2 *my_array);

int main(){

/************************************************************ C2C ************************************************************/
/*  
    float2 *initial_array = (float2 *)malloc(sizeof(float2) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++){
            initial_array[NY * h + w].x = 0;
            initial_array[NY * h + w].y = 0;
        }
    }
    initial_array[NY*3 + 0].x = 1;
    initial_array[NY*5 + 0].x = 1;

    printComplexArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * NX * NY);

    cufftComplex *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftComplex));
    cudaMalloc((void **)&gpu_transformed_array, NX*NY*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float2), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_C2C, NX);

    cufftExecC2C(plan, gpu_initial_array, gpu_transformed_array, CUFFT_FORWARD);

    cudaMemcpy(transformed_array, gpu_transformed_array, NX*NY*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    printComplexArray(transformed_array);
*/
/************************************************************ C2C ************************************************************/

/************************************************************ R2C ************************************************************/

    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
    }

    initial_array[NY*3 + 0] = 1;

    printArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);

    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal));
    cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex));

    cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftPlan1d(&plan, NY, CUFFT_R2C, NX);

    //                       ***** cufftPlanMany *****
    //int n[2] = {NX, NY};
    //cufftPlanMany(&plan,1,n,NULL,1,0,NULL,1,0,CUFFT_R2C,NX);

    cufftExecR2C(plan, gpu_initial_array, gpu_transformed_array);

    cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost);

    printComplexArray(transformed_array);

/************************************************************ R2C ************************************************************/

    cufftDestroy(plan);
    free(initial_array);
    free(transformed_array);
    cudaFree(gpu_initial_array);
    cudaFree(gpu_transformed_array);

    std::system("pause");
    return 0;
}

void printArray(float *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            std::cout << my_array[NY * h + w] << " | ";
        std::cout << std::endl; 
    }
    std::cout << std::endl;     
}

void printComplexArray(float2 *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            std::cout << my_array[NY * h + w].x << " + " << my_array[NY * h + w].y << " | ";
        std::cout << std::endl;
    }
    std::cout << std::endl; 
}

Solution

It seems that your issue resides in the way you print out the result. You cannot use the same routine to print for the two cases of CUFFT_R2C and CUFFT_C2C. In the former case, you have a (NY/2+1)*NX sized output, while the the latter case you have a NY*NX sized output. The fixed code below should work.

Also, it would be also good to add proper CUDA error check and CUFFT error check, which I have also added to the code below.

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <cuda_runtime.h>
#include <assert.h>

#include <iostream>

#define NX 6
#define NY 5

void printArray(float *my_array);
void printComplexSymmetricArray(float2 *my_array);

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";

        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";

        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";

        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";

        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";

        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";

        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";

        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";

        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";

        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
    }

    return "<unknown>";
}

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)

inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if( CUFFT_SUCCESS != err) {
        fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
                            _cudaGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

/********/
/* MAIN */
/********/
int main(){

    float *initial_array = (float *)malloc(sizeof(float) * NX * NY);
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            initial_array[NY * h + w] = 0;
        }

    initial_array[NY*3 + 0] = 1;

    printArray(initial_array);

    float2 *transformed_array= (float2 *)malloc(sizeof(float2) * (NY/2+1) * NX);

    cufftReal *gpu_initial_array;
    cufftComplex *gpu_transformed_array;

    gpuErrchk(cudaMalloc((void **)&gpu_initial_array, NX*NY*sizeof(cufftReal)));
    gpuErrchk(cudaMalloc((void **)&gpu_transformed_array, (NY/2+1)*NX*sizeof(cufftComplex)));

    gpuErrchk(cudaMemcpy(gpu_initial_array, initial_array, NX*NY*sizeof(float), cudaMemcpyHostToDevice));

    cufftHandle plan;
    cufftSafeCall(cufftPlan1d(&plan, NY, CUFFT_R2C, NX));

    cufftSafeCall(cufftExecR2C(plan, (cufftReal*)gpu_initial_array, (cufftComplex*)gpu_transformed_array));

    gpuErrchk(cudaMemcpy(transformed_array, gpu_transformed_array, NX*(NY/2+1)*sizeof(cufftComplex), cudaMemcpyDeviceToHost));

    printComplexSymmetricArray(transformed_array);

    cufftSafeCall(cufftDestroy(plan));
    free(initial_array);
    free(transformed_array);
    gpuErrchk(cudaFree(gpu_initial_array));
    gpuErrchk(cudaFree(gpu_transformed_array));

    std::system("pause");
    return 0;
}

/***********************/
/* PRINTOUT REAL ARRAY */
/***********************/
void printArray(float *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY; w++)
            std::cout << my_array[NY * h + w] << " | ";
            std::cout << std::endl; 
        }
    std::cout << std::endl;     
}

/************************************/
/* PRINTOUT COMPLEX SYMMETRIC ARRAY */
/************************************/
void printComplexSymmetricArray(float2 *my_array){
    for (int h = 0; h < NX; h++){
        for (int w = 0; w < NY/2+1; w++)
            std::cout << my_array[(NY/2+1) * h + w].x << " + " << my_array[(NY/2+1) * h + w].y << " | ";
            std::cout << std::endl;
    }
    std::cout << std::endl; 
}