Search code examples
c#cudamanaged-cuda

1D FFT plus kernel calculation with managedCUDA


I am trying to make FFT plus kernel calculation. FFT : managedCUDA library kernel calc : own kernel

C# code

public void cuFFTreconstruct() {
                CudaContext ctx = new CudaContext(0);
                CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");

                float[] fData = new float[Resolution * Resolution * 2];
                float[] result = new float[Resolution * Resolution * 2];
                CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
                CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);

                int i, j;
                Random rnd = new Random();
                double avrg = 0.0;

                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        fData[(i * Resolution + j) * 2] = i + j * 2;
                        fData[(i * Resolution + j) * 2 + 1] = 0.0f;
                    }
                }

                devData.CopyToDevice(fData);

                CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
                plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

                cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
                cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);

                cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

                devData.CopyToHost(result);

                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
                        ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
                    }
                }   
                ctx.FreeMemory(devData.DevicePointer);
                ctx.FreeMemory(copy_devData.DevicePointer);
            }

kernel code

    //Includes for IntelliSense 
    #define _SIZE_T_DEFINED
    #ifndef __CUDACC__
    #define __CUDACC__
    #endif
    #ifndef __cplusplus
    #define __cplusplus
    #endif


    #include <cuda.h>
    #include <device_launch_parameters.h>
    #include <texture_fetch_functions.h>
    #include "float.h"
    #include <builtin_types.h>
    #include <vector_functions.h>

    // Texture reference
    texture<float2, 2> texref;

    extern "C"
    {
        __global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
        {
            int image_x = blockIdx.x * blockDim.x + threadIdx.x;
            int image_y = blockIdx.y;

            data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
            data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
        }
    }

However this program does not work well. Following error was occurred:

ErrorLaunchFailed: An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.


Solution

  • Thank you for the message.

    host code

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    using System.Drawing.Imaging;
    using ManagedCuda;
    using ManagedCuda.CudaFFT;
    using ManagedCuda.VectorTypes;
    
    
    namespace WFA_CUDA_FFT
    {
        public partial class CuFFTMain : Form
        {
            float[, ,] FFTData2D;
            int Resolution;
    
            const int cuda_blockNum = 256;
    
            public CuFFTMain()
            {
                InitializeComponent();
                Resolution = 1024;
            }
    
            private void button1_Click(object sender, EventArgs e)
            {
                cuFFTreconstruct();
            }
            public void cuFFTreconstruct()
            {
                CudaContext ctx = new CudaContext(0);
                ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
                CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);
                float2[] fData = new float2[Resolution * Resolution];
                float2[] result = new float2[Resolution * Resolution];
                FFTData2D = new float[Resolution, Resolution, 2];
                CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
                CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
    
                int i, j;
                Random rnd = new Random();
                double avrg = 0.0;
    
                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        fData[i * Resolution + j].x = i + j * 2;
                        avrg += fData[i * Resolution + j].x;
                        fData[i * Resolution + j].y = 0.0f;
                    }
                }
    
                avrg = avrg / (double)(Resolution * Resolution);
    
                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
                    }
                }
    
                devData.CopyToDevice(fData);
    
                CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
                plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
    
                cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
                cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
    
                cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
    
                copy_devData.CopyToHost(result);
    
                for (i = 0; i < Resolution; i++)
                {
                    for (j = 0; j < Resolution; j++)
                    {
                        FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                        FFTData2D[i, j, 1] = result[i * Resolution + j].y;
                    }
                }
    
                //Clean up
                devData.Dispose();
                copy_devData.Dispose();
                plan1D.Dispose();
                CudaContext.ProfilerStop();
                ctx.Dispose();
            }
        }
    }
    

    kernel code

    //Includes for IntelliSense 
    #define _SIZE_T_DEFINED
    #ifndef __CUDACC__
    #define __CUDACC__
    #endif
    #ifndef __cplusplus
    #define __cplusplus
    #endif
    
    
    #include <cuda.h>
    #include <device_launch_parameters.h>
    #include <texture_fetch_functions.h>
    #include "float.h"
    #include <builtin_types.h>
    #include <vector_functions.h>
    #include <vector>
    
    // Texture reference
    texture<float2, 2> texref;
    
    extern "C"
    {
        // Device code
    
        __global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
        {
            int image_x = blockIdx.x * blockDim.x + threadIdx.x;
            int image_y = blockIdx.y;
    
            data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x;
            data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y;
        }
    }
    

    First I compiled with .Net4.5. This program did not work, and error (System.BadImageFormatException) was showed. However when the FFT function is comment out, the kernel program run.

    Second I chaneged from .Net 4.5 to .Net 4.0. The FFT function works, but kernel does not run and shows errors.

    My PC is windows 8.1 pro and I use visual studio 2013.