PyCuda multiple threads execution error

This is my pycuda code.When I execute I get threading errors.I am trying to implement pycuda threads in this.I have searched google but couldn't find an answer.Respurces are also very limited for pycuda I'm the Internet.So guys please help me out.

{{{#!python
import pycuda
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import threading
import numpy

class GPUThread(threading.Thread):
    def __init__(self, number, some_array):
        threading.Thread.__init__(self)

        self.number = number
        self.some_array = some_array

    def run(self):
        self.dev = cuda.Device(self.number)
        self.ctx = self.dev.make_context()

        self.array_gpu = cuda.mem_alloc(some_array.nbytes)
        cuda.memcpy_htod(self.array_gpu, some_array)

        test_kernel(self.array_gpu)
        print "successful exit from thread %d" % self.number
        self.ctx.pop()

        del self.array_gpu
        del self.ctx

def test_kernel(input_array_gpu):
    mod = SourceModule("""
        __global__ void f(float * out, float * in)
        {
            int idx = threadIdx.x;
            out[idx] = in[idx] + 6;
        }
        """)
    func = mod.get_function("f")

    output_array = numpy.zeros((1,512))
    output_array_gpu = cuda.mem_alloc(output_array.nbytes)

    func(output_array_gpu,
          input_array_gpu,
          block=(512,1,1))
    cuda.memcpy_dtoh(output_array, output_array_gpu)

    return output_array

cuda.init()
some_array = numpy.ones((1,512), dtype=numpy.float32)
num = cuda.Device.count()

gpu_thread_list = []
for i in range(num):
    gpu_thread = GPUThread(i, some_array)
    gpu_thread.start()


}}}

Solution

You forgot to append the gpu thread.Just add gpu_thread_list.append(gpu_thread) after start() in your code and it will work.There are quite a lot of good tutorial for pycuda online.Check this.