Tags: python, cuda, gpu, gpgpu, pycuda

How to use shared memory in PyCuda, LogicError: cuModuleLoadDataEx failed: an illegal memory access was encountered


I'm trying to understand how to work with shared memory using PyCuda. I'm running this code to flip an input vector:

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

n = 20

input = np.random.randint(10, size=(n))
output = np.zeros_like(input)

input = input.astype(np.int32)
output = output.astype(np.int32)

mod = SourceModule(
'''
  __global__ void flipVectorSM(int* in, int* out, int n) {
    extern __shared__ int sData[];
    int inOffSet = blockDim.x * blockIdx.x;
    int index = inOffSet + threadIdx.x;
    if (index < n) {
        sData[blockDim.x - 1 - threadIdx.x] = in[index];
        __syncthreads();
    }
    int outOffSet = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int outIndex = outOffSet + threadIdx.x;
    out[outIndex] = sData[threadIdx.x];
  }
'''
)

flip = mod.get_function('flipVectorSM')
flip(drv.In(input), drv.InOut(output), np.int32(n), block=(4, 1, 1), grid=(1, 1), shared=4)

I get this error:

---------------------------------------------------------------------------
LogicError                                Traceback (most recent call last)
<ipython-input-114-5b681ffa31fc> in <cell line: 15>()
     13 output = output.astype(np.int32)
     14 
---> 15 mod = SourceModule(
     16 '''
     17   __global__ void flipVectorSM(int* in, int* out, int n) {

/usr/local/lib/python3.10/dist-packages/pycuda/compiler.py in __init__(self, source, nvcc, options, keep, no_extern_c, arch, code, cache_dir, include_dirs)
    367         from pycuda.driver import module_from_buffer
    368 
--> 369         self.module = module_from_buffer(cubin)
    370 
    371         self._bind_module()

LogicError: cuModuleLoadDataEx failed: an illegal memory access was encountered - 

The same kernel code (the global and shared memory handling) works when I compile and run it with CUDA C. How can I solve this?


Solution

  • I changed the thread block size to match the dimension of the input vector and allocated enough dynamic shared memory for all of its elements (note that the shared launch argument is a size in bytes, not an element count). With this configuration it works, thanks.

    import pycuda.driver as drv
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule
    import numpy as np
    
    n = 20
    
    input = np.random.randint(10, size=(n))
    output = np.zeros_like(input)
    
    input = input.astype(np.int32)
    output = output.astype(np.int32)
    
    mod = SourceModule(
    '''
      __global__ void flipVectorSM(int* in, int* out, int n) {
        // dynamically sized shared memory buffer; size is set at launch time
        extern __shared__ int sData[];
        int inOffSet = blockDim.x * blockIdx.x;
        int index = inOffSet + threadIdx.x;
        if (index < n) {
            // store this block's elements into shared memory in reversed order
            sData[blockDim.x - 1 - threadIdx.x] = in[index];
            __syncthreads();
        }
        // write the reversed block to the mirrored block position in the output
        int outOffSet = blockDim.x * (gridDim.x - 1 - blockIdx.x);
        int outIndex = outOffSet + threadIdx.x;
        out[outIndex] = sData[threadIdx.x];
      }
    '''
    )
    
    flip = mod.get_function('flipVectorSM')
    flip(drv.In(input), drv.InOut(output), np.int32(n), block=(20, 1, 1), grid=(1, 1), shared=4 * n)  # shared memory size in bytes: n int32 values
    
    print("Input vector:")
    print(input)
    print("\nOutput vector:")
    print(output)
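
    As a quick check, the flipped output should equal the input reversed with NumPy. The snippet below is a minimal sketch meant to run right after the code above; it also shows how the same kernel can be launched with several blocks. It assumes n is a multiple of the chosen block size, and block_size = 4 is picked only for illustration. The key point is that shared is a byte count per block, not an element count.

    # the output should be the input in reverse order
    assert np.array_equal(output, input[::-1])

    # multi-block launch: each block needs block_size int32 values of shared memory
    block_size = 4                                     # illustrative choice; n must be a multiple of it
    grid_size = (n + block_size - 1) // block_size
    flip(drv.In(input), drv.InOut(output), np.int32(n),
         block=(block_size, 1, 1), grid=(grid_size, 1),
         shared=block_size * input.dtype.itemsize)     # bytes of dynamic shared memory per block

    assert np.array_equal(output, input[::-1])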