I am trying to make FFT plus kernel calculation. FFT : managedCUDA library kernel calc : own kernel
C# code
public void cuFFTreconstruct() {
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");
float[] fData = new float[Resolution * Resolution * 2];
float[] result = new float[Resolution * Resolution * 2];
CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
for (j = 0; j < Resolution; j++)
fData[(i * Resolution + j) * 2] = i + j * 2;
fData[(i * Resolution + j) * 2 + 1] = 0.0f;
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
for (i = 0; i < Resolution; i++)
for (j = 0; j < Resolution; j++)
ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
kernel code
//Includes for IntelliSense
#ifndef __CUDACC__
#define __CUDACC__
#ifndef __cplusplus
#define __cplusplus
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
// Texture reference
texture<float2, 2> texref;
extern "C"
__global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
However this program does not work well. Following error was occurred:
ErrorLaunchFailed: An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA.
Thank you for the message.
host code
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Drawing.Imaging;
using ManagedCuda;
using ManagedCuda.CudaFFT;
using ManagedCuda.VectorTypes;
namespace WFA_CUDA_FFT
public partial class CuFFTMain : Form
float[, ,] FFTData2D;
int Resolution;
const int cuda_blockNum = 256;
public CuFFTMain()
Resolution = 1024;
private void button1_Click(object sender, EventArgs e)
public void cuFFTreconstruct()
CudaContext ctx = new CudaContext(0);
ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
FFTData2D = new float[Resolution, Resolution, 2];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
for (j = 0; j < Resolution; j++)
fData[i * Resolution + j].x = i + j * 2;
avrg += fData[i * Resolution + j].x;
fData[i * Resolution + j].y = 0.0f;
avrg = avrg / (double)(Resolution * Resolution);
for (i = 0; i < Resolution; i++)
for (j = 0; j < Resolution; j++)
fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
for (i = 0; i < Resolution; i++)
for (j = 0; j < Resolution; j++)
FFTData2D[i, j, 0] = result[i * Resolution + j].x;
FFTData2D[i, j, 1] = result[i * Resolution + j].y;
//Clean up
kernel code
//Includes for IntelliSense
#ifndef __CUDACC__
#define __CUDACC__
#ifndef __cplusplus
#define __cplusplus
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector>
// Texture reference
texture<float2, 2> texref;
extern "C"
// Device code
__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x;
data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y;
First I compiled with .Net4.5. This program did not work, and error (System.BadImageFormatException) was showed. However when the FFT function is comment out, the kernel program run.
Second I chaneged from .Net 4.5 to .Net 4.0. The FFT function works, but kernel does not run and shows errors.
My PC is windows 8.1 pro and I use visual studio 2013.