
PyCUDA using struct properly


I am trying to implement a struct in my PyCUDA code, but I am getting out-of-bounds errors. I tried following this tutorial but am unable to get it working for my case.

The problem is most probably due to improper use of pointers: the tutorial, for example, allocates memory for the struct of pointers itself rather than for the data those pointers reference (see my sketch after the sample code below). Hopefully someone here can give me some insight...

Sample code:

#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit

from mako.template import Template
from pycuda.compiler import SourceModule

src_template = Template(
"""
    struct Dist {
        %for s in xrange(ns):
        float *dist${s};
        %endfor
    };

    // return linear index based on x,y coordinate
    __device__ int get_index(int xcoord, int ycoord)
    {
        return ycoord + xcoord * ${ny};
    };

    __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
    {
        int idx;
        float dens, velx, vely, vv, ev;

        for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                 y < ${ny}; 
                 y += blockDim.x * gridDim.x)   
        {   
            for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                 x < ${nx}; 
                 x += blockDim.y * gridDim.y) 
            {
                if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                {
                    idx = get_index(x,y);
                    dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                    vv = velx*velx + vely*vely;

                    %for s in xrange(ns):
                    // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                    ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                    ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                    %endfor
                }
            }
        }
    }
"""
)

class channelFlow:
    # initialize channelFlow
    def __init__(self, nx, ny):
        self.nx, self.ny = nx, ny

        max_threads_per_block = tools.DeviceData().max_threads
        self.blocksize = (ny if ny<32 else 32, nx if nx<32 else 32, 1)  # threads per block
        self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1) # blocks per grid

        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])

        self.ctx = { 'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                     'w': self.w, 'ex': self.ex, 'ey': self.ey
                     }

        dtype = np.float32
        self.ftmp = np.zeros([self.nx,self.ny,self.ns]).astype(dtype)
        self.rho  = np.zeros([self.nx,self.ny]).astype(dtype)
        self.ux   = np.zeros([self.nx,self.ny]).astype(dtype)
        self.uy   = np.zeros([self.nx,self.ny]).astype(dtype)

        self.ftmp_gpu = cuda.mem_alloc(self.ftmp.nbytes)
        self.rho_gpu  = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu   = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu   = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        src = src_template.render(**self.ctx)
        code = SourceModule(src)
        initialize  = code.get_function('initialize')

        self.rho[:,:] = 1.
        self.ux[:,:] = 0.
        self.uy[:,:] = 0.

        cuda.memcpy_htod(self.rho_gpu, self.rho)
        cuda.memcpy_htod(self.ux_gpu, self.ux)
        cuda.memcpy_htod(self.uy_gpu, self.uy)

        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu, 
            block=self.blocksize, grid=self.gridsize
            )

if __name__ == "__main__":
    sim = channelFlow(64,64); sim.run()
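
For what it's worth, this is roughly how I understand the tutorial's allocation step applied to the code above (untested, and the names are my own):

# Sketch of my understanding: allocate one device buffer per dist array, then a
# small buffer holding only the nine device pointers; that small buffer is what
# the kernel would receive as Dist*.
dist_gpu = [cuda.mem_alloc(self.ftmp[:, :, s].nbytes) for s in range(self.ns)]
ptrs = np.array([int(p) for p in dist_gpu], dtype=np.uintp)  # nine device addresses
struct_gpu = cuda.mem_alloc(ptrs.nbytes)                     # room for the pointers only
cuda.memcpy_htod(struct_gpu, ptrs)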

Solution

  • I was able to properly implement a structure of arrays in PyCUDA by using the GPUStruct Python module available here and by fixing the improper use of pointers, such that:

    ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
    

    was changed to:

    float *ftmp${s}_ptr = ftmp->dist${s};
    ftmp${s}_ptr[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
    

    The revised code, which shows the details of the GPUStruct implementation (a rough sketch of what GPUStruct does under the hood follows the listing):

    #!/usr/bin/env python
    #-*- coding:utf-8 -*-
    
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.tools as tools
    import pycuda.autoinit
    
    from gpu_struct import GPUStruct
    from mako.template import Template
    from pycuda.compiler import SourceModule
    
    src_template = Template(
    """
        struct Dist {
            %for s in xrange(ns):
            float *dist${s};
            %endfor
        };
    
        // return linear index based on x,y coordinate
        __device__ int get_index(int xcoord, int ycoord)
        {
            return ycoord + xcoord * ${ny};
        };
    
        __global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
        {
            int idx;
            float dens, velx, vely, vv, ev;
    
            for (int y = threadIdx.x + blockIdx.x * blockDim.x; 
                     y < ${ny}; 
                     y += blockDim.x * gridDim.x)   
            {   
                for (int x = threadIdx.y + blockIdx.y * blockDim.y; 
                     x < ${nx}; 
                     x += blockDim.y * gridDim.y) 
                {
                    if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
                    {
                        idx = get_index(x,y);
                        dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                        vv = velx*velx + vely*vely;
    
                        %for s in xrange(ns):
                        // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                        float *ftmp${s}_ptr = ftmp->dist${s};
                        ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                        ftmp${s}_ptr[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                        %endfor
                    }
                }
            }
        }
    """
    )
    
    class channelFlow:
        # initialize channelFlow
        def __init__(self, nx, ny):
            self.nx, self.ny = nx, ny
    
            max_threads_per_block = tools.DeviceData().max_threads
            self.blocksize = (ny if ny<32 else 32, nx if nx<32 else 32, 1)  # threads per block
            self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1) # blocks per grid
    
            self.ns = 9
            self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
            self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
            self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])
    
            self.ctx = { 'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                         'w': self.w, 'ex': self.ex, 'ey': self.ey
                         }
    
            dtype = np.float32
            self.ftmp = np.zeros([self.nx,self.ny,self.ns]).astype(dtype)
            self.rho  = np.zeros([self.nx,self.ny]).astype(dtype)
            self.ux   = np.zeros([self.nx,self.ny]).astype(dtype)
            self.uy   = np.zeros([self.nx,self.ny]).astype(dtype)
    
            self.ftmp_gpu = GPUStruct([
                (np.float32,'*dist0', self.ftmp[:,:,0]),
                (np.float32,'*dist1', self.ftmp[:,:,1]),
                (np.float32,'*dist2', self.ftmp[:,:,2]),
                (np.float32,'*dist3', self.ftmp[:,:,3]),
                (np.float32,'*dist4', self.ftmp[:,:,4]),
                (np.float32,'*dist5', self.ftmp[:,:,5]),
                (np.float32,'*dist6', self.ftmp[:,:,6]),
                (np.float32,'*dist7', self.ftmp[:,:,7]),
                (np.float32,'*dist8', self.ftmp[:,:,8])
                ])
            self.rho_gpu  = cuda.mem_alloc(self.rho.nbytes)
            self.ux_gpu   = cuda.mem_alloc(self.ux.nbytes)
            self.uy_gpu   = cuda.mem_alloc(self.uy.nbytes)
    
        def run(self):
            src = src_template.render(**self.ctx)
            code = SourceModule(src)
            initialize  = code.get_function('initialize')
    
            self.rho[:,:] = 1.
            self.ux[:,:] = 0.
            self.uy[:,:] = 0.
    
            self.ftmp_gpu.copy_to_gpu()
            cuda.memcpy_htod(self.rho_gpu, self.rho)
            cuda.memcpy_htod(self.ux_gpu, self.ux)
            cuda.memcpy_htod(self.uy_gpu, self.uy)
    
            initialize(
                self.rho_gpu, self.ux_gpu, self.uy_gpu,
                self.ftmp_gpu.get_ptr(), 
                block=self.blocksize, grid=self.gridsize
                )
    
            self.dens = np.zeros_like(self.rho)
            cuda.memcpy_dtoh(self.dens, self.rho_gpu)
            print self.dens
    
    if __name__ == "__main__":
        sim = channelFlow(64,64); sim.run()
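
    For anyone curious what GPUStruct is doing behind the scenes, here is a rough sketch of my understanding (not the module's actual source, and make_dist_struct is just an illustrative name): each field gets its own device allocation, the nine device addresses are packed into a small buffer laid out like the C struct, and get_ptr() returns the address of that buffer, which is what the kernel sees as Dist*.

    # Hypothetical sketch of the GPUStruct mechanism, not its real implementation
    import numpy as np
    import pycuda.driver as cuda

    def make_dist_struct(ftmp):
        # one device allocation per direction, filled from the host slice
        fields = []
        for s in range(ftmp.shape[2]):
            buf = cuda.mem_alloc(ftmp[:, :, s].nbytes)
            cuda.memcpy_htod(buf, np.ascontiguousarray(ftmp[:, :, s]))
            fields.append(buf)
        # nine device addresses laid out back to back, matching struct Dist
        ptrs = np.array([int(b) for b in fields], dtype=np.uintp)
        struct_buf = cuda.mem_alloc(ptrs.nbytes)
        cuda.memcpy_htod(struct_buf, ptrs)
        return struct_buf, fields  # keep fields alive so the arrays are not freed

    # usage (hypothetical): struct_ptr, dist_bufs = make_dist_struct(self.ftmp)
    #                       initialize(..., struct_ptr, block=..., grid=...)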