I am trying to implement a struct in my PyCUDA code, but I am getting out-of-bounds errors. I tried following this tutorial but am unable to get it working for my case.
The problem is most likely improper use of pointers; for example, the tutorial shows that memory must be allocated for the size of the struct's pointer members rather than for the size of the data they point to. Hopefully someone here can give me some insight...
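As I understand the tutorial, the device buffer for the struct only needs to hold the pointer members themselves, and the arrays they point to are allocated separately. A stripped-down sketch of that idea with a one-pointer struct (the names here are made up for illustration and are not from my code below):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
struct Data { float *ptr; };
__global__ void fill(Data *d) { d->ptr[threadIdx.x] = threadIdx.x; }
""")
fill = mod.get_function('fill')

data = np.zeros(16, dtype=np.float32)
data_gpu = cuda.mem_alloc(data.nbytes)            # allocate the data itself
cuda.memcpy_htod(data_gpu, data)

struct_gpu = cuda.mem_alloc(np.uintp(0).nbytes)   # room for the pointer member only
ptr = np.array([int(data_gpu)], dtype=np.uintp)   # device address of the data
cuda.memcpy_htod(struct_gpu, ptr)                 # store the address inside the struct

fill(struct_gpu, block=(16, 1, 1), grid=(1, 1))
out = np.empty_like(data)
cuda.memcpy_dtoh(out, data_gpu)
print out                                         # 0..15 if the pointer was set up right

In my code below I seem to be doing something different, and that is probably where things go wrong.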
Sample code:
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit
from mako.template import Template
from pycuda.compiler import SourceModule
src_template = Template(
"""
struct Dist {
%for s in xrange(ns):
    float *dist${s};
%endfor
};

// return linear index based on x,y coordinate
__device__ int get_index(int xcoord, int ycoord)
{
    return ycoord + xcoord * ${ny};
};

__global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
{
    int idx;
    float dens, velx, vely, vv, ev;
    for (int y = threadIdx.x + blockIdx.x * blockDim.x;
         y < ${ny};
         y += blockDim.x * gridDim.x)
    {
        for (int x = threadIdx.y + blockIdx.y * blockDim.y;
             x < ${nx};
             x += blockDim.y * gridDim.y)
        {
            if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
            {
                idx = get_index(x,y);
                dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                vv = velx*velx + vely*vely;
                %for s in xrange(ns):
                // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                %endfor
            }
        }
    }
}
"""
)
class channelFlow:
    # initialize channelFlow
    def __init__(self, nx, ny):
        self.nx, self.ny = nx, ny
        max_threads_per_block = tools.DeviceData().max_threads
        self.blocksize = (ny if ny < 32 else 32, nx if nx < 32 else 32, 1)  # threads per block
        self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1)     # blocks per grid
        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])
        self.ctx = {'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                    'w': self.w, 'ex': self.ex, 'ey': self.ey}
        dtype = np.float32
        self.ftmp = np.zeros([self.nx, self.ny, self.ns]).astype(dtype)
        self.rho = np.zeros([self.nx, self.ny]).astype(dtype)
        self.ux = np.zeros([self.nx, self.ny]).astype(dtype)
        self.uy = np.zeros([self.nx, self.ny]).astype(dtype)
        self.ftmp_gpu = cuda.mem_alloc(self.ftmp.nbytes)
        self.rho_gpu = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        src = src_template.render(**self.ctx)
        code = SourceModule(src)
        initialize = code.get_function('initialize')
        self.rho[:,:] = 1.
        self.ux[:,:] = 0.
        self.uy[:,:] = 0.
        cuda.memcpy_htod(self.rho_gpu, self.rho)
        cuda.memcpy_htod(self.ux_gpu, self.ux)
        cuda.memcpy_htod(self.uy_gpu, self.uy)
        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu,
            block=self.blocksize, grid=self.gridsize
        )

if __name__ == "__main__":
    sim = channelFlow(64, 64)
    sim.run()
I was able to implement a structure of arrays properly in PyCUDA by using the GPUStruct Python module available here and by fixing the improper use of pointers, so that:
ftmp->dist${s}[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
was changed to:
float *ftmp${s}_ptr = ftmp->dist${s};
ftmp${s}_ptr[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
The revised code, which shows the details of the GPUStruct implementation (a hand-rolled alternative without GPUStruct is sketched after the listing):
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import numpy as np
import pycuda.driver as cuda
import pycuda.tools as tools
import pycuda.autoinit
from gpu_struct import GPUStruct
from mako.template import Template
from pycuda.compiler import SourceModule
src_template = Template(
"""
struct Dist {
%for s in xrange(ns):
    float *dist${s};
%endfor
};

// return linear index based on x,y coordinate
__device__ int get_index(int xcoord, int ycoord)
{
    return ycoord + xcoord * ${ny};
};

__global__ void initialize(float *rho, float *ux, float *uy, Dist *ftmp)
{
    int idx;
    float dens, velx, vely, vv, ev;
    for (int y = threadIdx.x + blockIdx.x * blockDim.x;
         y < ${ny};
         y += blockDim.x * gridDim.x)
    {
        for (int x = threadIdx.y + blockIdx.y * blockDim.y;
             x < ${nx};
             x += blockDim.y * gridDim.y)
        {
            if ((x > 0) && (x < ${nx-1}) && (y > 0) && (y < ${ny-1}))
            {
                idx = get_index(x,y);
                dens = rho[idx]; velx = ux[idx]; vely = uy[idx];
                vv = velx*velx + vely*vely;
                %for s in xrange(ns):
                // s = ${s}; \vec{e}[${s}] = [${ex[s]},${ey[s]}]
                float *ftmp${s}_ptr1 = ftmp->dist${s};
                ev = ${float(ex[s])}f*velx + ${float(ey[s])}f*vely;
                ftmp${s}_ptr1[idx] = ${w[s]}f*dens*(1.0f+3.0f*ev+4.5f*ev*ev-1.5f*vv);
                %endfor
            }
        }
    }
}
"""
)
class channelFlow:
    # initialize channelFlow
    def __init__(self, nx, ny):
        self.nx, self.ny = nx, ny
        max_threads_per_block = tools.DeviceData().max_threads
        self.blocksize = (ny if ny < 32 else 32, nx if nx < 32 else 32, 1)  # threads per block
        self.gridsize = (ny/self.blocksize[0], nx/self.blocksize[1], 1)     # blocks per grid
        self.ns = 9
        self.w = np.array([4./9, 1./9, 1./9, 1./9, 1./9, 1./36, 1./36, 1./36, 1./36])
        self.ex = np.array([0, 1, -1, 0, 0, 1, -1, -1, 1])
        self.ey = np.array([0, 0, 0, 1, -1, 1, 1, -1, -1])
        self.ctx = {'nx': self.nx, 'ny': self.ny, 'ns': self.ns,
                    'w': self.w, 'ex': self.ex, 'ey': self.ey}
        dtype = np.float32
        self.ftmp = np.zeros([self.nx, self.ny, self.ns]).astype(dtype)
        self.rho = np.zeros([self.nx, self.ny]).astype(dtype)
        self.ux = np.zeros([self.nx, self.ny]).astype(dtype)
        self.uy = np.zeros([self.nx, self.ny]).astype(dtype)
        self.ftmp_gpu = GPUStruct([
            (np.float32, '*dist0', self.ftmp[:,:,0]),
            (np.float32, '*dist1', self.ftmp[:,:,1]),
            (np.float32, '*dist2', self.ftmp[:,:,2]),
            (np.float32, '*dist3', self.ftmp[:,:,3]),
            (np.float32, '*dist4', self.ftmp[:,:,4]),
            (np.float32, '*dist5', self.ftmp[:,:,5]),
            (np.float32, '*dist6', self.ftmp[:,:,6]),
            (np.float32, '*dist7', self.ftmp[:,:,7]),
            (np.float32, '*dist8', self.ftmp[:,:,8])
        ])
        self.rho_gpu = cuda.mem_alloc(self.rho.nbytes)
        self.ux_gpu = cuda.mem_alloc(self.ux.nbytes)
        self.uy_gpu = cuda.mem_alloc(self.uy.nbytes)

    def run(self):
        src = src_template.render(**self.ctx)
        code = SourceModule(src)
        initialize = code.get_function('initialize')
        self.rho[:,:] = 1.
        self.ux[:,:] = 0.
        self.uy[:,:] = 0.
        self.ftmp_gpu.copy_to_gpu()
        cuda.memcpy_htod(self.rho_gpu, self.rho)
        cuda.memcpy_htod(self.ux_gpu, self.ux)
        cuda.memcpy_htod(self.uy_gpu, self.uy)
        initialize(
            self.rho_gpu, self.ux_gpu, self.uy_gpu,
            self.ftmp_gpu.get_ptr(),
            block=self.blocksize, grid=self.gridsize
        )
        self.dens = np.zeros_like(self.rho)
        cuda.memcpy_dtoh(self.dens, self.rho_gpu)
        print self.dens

if __name__ == "__main__":
    sim = channelFlow(64, 64)
    sim.run()
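For completeness: my understanding is that GPUStruct essentially does the pointer packing for you. You can get the same effect without it by allocating each dist array on the device yourself and copying the nine device addresses into one small buffer, which the kernel then treats as a Dist*. A rough sketch of that hand-rolled alternative, slotting into the same class (not the code I actually ran):

# in __init__, instead of building the GPUStruct:
self.dist_gpu = [cuda.mem_alloc(self.ftmp[:,:,s].nbytes) for s in xrange(self.ns)]
ptrs = np.array([int(p) for p in self.dist_gpu], dtype=np.uintp)
self.ftmp_gpu = cuda.mem_alloc(ptrs.nbytes)   # nine pointers = the Dist struct
cuda.memcpy_htod(self.ftmp_gpu, ptrs)

# in run(), instead of ftmp_gpu.copy_to_gpu() and get_ptr():
for s in xrange(self.ns):
    cuda.memcpy_htod(self.dist_gpu[s], np.ascontiguousarray(self.ftmp[:,:,s]))
initialize(self.rho_gpu, self.ux_gpu, self.uy_gpu, self.ftmp_gpu,
           block=self.blocksize, grid=self.gridsize)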