Search code examples
theanopycudatheano-cuda

Using pycuda and theano together: pycuda._driver.LogicError


I have a complicated formula that is easier to implement directly in CUDA code. On the other hand, I need Theano's features to build a neural network and train it separately.

How can I safely use pycuda and theano together?

The following code works on my machine:

import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray

import theano
import theano.tensor as T

def get_pycuda_func():
    """Compile an element-wise multiply CUDA kernel and return a host wrapper.

    Returns:
        A callable ``f(a, b)`` that copies both inputs to the GPU, runs the
        kernel and returns the element-wise product as a host ``float64``
        NumPy array.
    """
    # The kernel receives the element count and guards the tail, so the launch
    # can be spread over several blocks instead of being limited to the number
    # of threads that fit in a single block.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        dest[i] = a[i] * b[i];
    }
    """)

    mul = mod.get_function("mul")

    # Argument layout: three device pointers followed by a C int.
    mul.prepare("PPPi")

    threads_per_block = 256

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like of numbers.
            b: 1-D array-like with the same length as ``a``.

        Returns:
            1-D ``float64`` NumPy array holding the element-wise product.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel reads raw doubles, so the device buffers must really
        # contain contiguous float64 data whatever dtype the caller passed.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError("a and b must be 1-D arrays of equal length")

        N = a.shape[0]
        if N == 0:
            # Nothing to compute; skip the launch, which needs >= 1 thread.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((N,), dtype=np.float64)

        # Ceiling division: enough blocks to cover every element.
        blocks = (N + threads_per_block - 1) // threads_per_block
        mul.prepared_call(
            (blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            N,
        )
        return c.get()
    return f

def get_theano_func():
    """Compile a Theano function that multiplies two vectors element-wise.

    Returns:
        The compiled function; it takes the two vectors as positional
        arguments and is built with ``allow_input_downcast=True``.
    """
    lhs = T.vector('a')
    rhs = T.vector('b')
    return theano.function(
        [lhs, rhs],
        lhs * rhs,
        allow_input_downcast=True,
    )

def get_cpu_func():
    """Return the host-side reference: a callable computing ``a * b``."""
    return lambda a, b: a * b

if __name__ == "__main__":
    # Check the PyCUDA kernel and the Theano graph against the NumPy product.
    np.random.seed(12345)

    a, b = np.random.randn(400), np.random.randn(400)

    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    f_theano = get_theano_func()

    for iteration in range(10):
        from_cuda = f_cuda(a, b)
        from_theano = f_theano(a, b)
        expected = f_cpu(a, b)
        print(iteration)
        # One line per backend: True when it agrees with the CPU result.
        for candidate in (from_cuda, from_theano):
            print(np.allclose(candidate, expected))

Output:

$ python3 test_theano_pycuda_simpler.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
0
True
True
1
True
True
2
True
True
3
True
True
4
True
True
5
True
True
6
True
True
7
True
True
8
True
True
9
True
True

But if I add a more complicated Theano computation, it no longer works. The following DOES NOT WORK:

import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray

import theano
import theano.tensor as T

def get_pycuda_func():
    """Compile an element-wise multiply CUDA kernel and return a host wrapper.

    Returns:
        A callable ``f(a, b)`` that uploads both inputs, launches the kernel
        and returns the element-wise product as a host ``float64`` array.
    """
    # Passing the element count into the kernel and guarding on it lets the
    # launch use as many blocks as needed, rather than requiring every element
    # to fit into the threads of one block.
    mod = cudacc.SourceModule("""
    __global__ void mul(double *dest, const double *a, const double *b, int n)
    {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        dest[i] = a[i] * b[i];
    }
    """)

    mul = mod.get_function("mul")

    # Argument layout: three device pointers followed by a C int.
    mul.prepare("PPPi")

    threads_per_block = 256

    def f(a, b):
        """Return ``a * b`` computed on the GPU.

        Args:
            a: 1-D array-like of numbers.
            b: 1-D array-like with the same length as ``a``.

        Returns:
            1-D ``float64`` NumPy array holding the element-wise product.

        Raises:
            ValueError: if the inputs are not 1-D or differ in length.
        """
        # The kernel interprets the buffers as doubles, so force contiguous
        # float64 host data before uploading.
        a = np.ascontiguousarray(a, dtype=np.float64)
        b = np.ascontiguousarray(b, dtype=np.float64)
        if a.ndim != 1 or a.shape != b.shape:
            raise ValueError("a and b must be 1-D arrays of equal length")

        N = a.shape[0]
        if N == 0:
            # Nothing to compute; skip the launch, which needs >= 1 thread.
            return np.empty((0,), dtype=np.float64)

        gpu_a = gpuarray.to_gpu(a)
        gpu_b = gpuarray.to_gpu(b)
        c = gpuarray.empty((N,), dtype=np.float64)

        # NOTE(review): the launch presumably needs the CUDA context in which
        # `mul` was loaded to be current on this thread -- confirm when other
        # GPU libraries are active in the same process.
        blocks = (N + threads_per_block - 1) // threads_per_block
        mul.prepared_call(
            (blocks, 1, 1), (threads_per_block, 1, 1),
            c.gpudata,
            gpu_a.gpudata,
            gpu_b.gpudata,
            N,
        )
        return c.get()
    return f

# Floating-point dtype Theano is configured to use.
floatX = theano.config.floatX


def init_bias(size):
    """Return a Theano shared vector of ``size`` values drawn from ``np.random.rand``."""
    initial = np.random.rand(size).astype(floatX)
    return theano.shared(initial)

def init_weights(in_size, out_size):
    """Return a Theano shared ``(in_size, out_size)`` matrix of normal weights.

    Entries are drawn with mean 0 and standard deviation
    ``sqrt(2 / (in_size + out_size))``, then cast to ``floatX``.
    """
    fan_total = in_size + out_size
    std = np.sqrt(2. / fan_total)
    values = np.random.normal(loc=0., scale=std, size=(in_size, out_size))
    return theano.shared(values.astype(floatX))

def adam(params, gparams, learning_rate=0.0001, beta1=0.9, beta2=0.999,
         epsilon=1e-8):
    """Build the Adam update list for a set of Theano shared parameters.

    Args:
        params: shared variables to optimise.
        gparams: symbolic gradients, matched to ``params`` by position.
        learning_rate: base step size.
        beta1: decay rate of the first-moment estimate.
        beta2: decay rate of the second-moment estimate.
        epsilon: constant added to the denominator of the step.

    Returns:
        A list of ``(shared_variable, new_expression)`` pairs: for each
        parameter its first-moment, second-moment and parameter update, in
        that order, followed by the update of the step counter.
    """
    step_count = theano.shared(np.asarray(.0, dtype=theano.config.floatX))
    next_count = step_count + 1
    # Step size with the bias correction of both moment estimates folded in.
    lr_t = (learning_rate * T.sqrt(1 - beta2 ** next_count)
            / (1 - beta1 ** next_count))

    updates = []
    for param, grad in zip(params, gparams):
        current = param.get_value(borrow=True)
        # Moment accumulators start at zero with the parameter's shape/dtype.
        first_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )
        second_moment = theano.shared(
            np.zeros(current.shape, dtype=current.dtype),
            broadcastable=param.broadcastable,
        )

        new_first = beta1 * first_moment + (1 - beta1) * grad
        new_second = beta2 * second_moment + (1 - beta2) * grad ** 2
        delta = lr_t * new_first / (T.sqrt(new_second) + epsilon)

        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, param - delta),
        ])

    updates.append((step_count, next_count))
    return updates

class test_network:
    """Small fully connected regression network built with Theano.

    The graph maps a one-column input matrix through two ReLU hidden layers
    to a one-column linear output. Construction compiles ``self.train`` (one
    Adam step on the mean squared error) and ``self.f`` (prediction).
    """

    def __init__(self, hidden=None):
        """Create the parameters and compile the train/predict functions.

        Args:
            hidden: sequence whose first two entries are the widths of the
                two hidden layers; defaults to ``[100, 100]``.
        """
        # Build the default per instance: a list literal in the signature
        # would be a single object shared by every default-constructed network.
        self.hidden = [100, 100] if hidden is None else hidden
        self._create_params()
        self._create_train_func()
        self._create_func()

    def _create_params(self):
        """Allocate the weight and bias shared variables into ``self.params``."""
        hidden = self.hidden

        W0 = init_weights(1, hidden[0])
        W1 = init_weights(hidden[0], hidden[1])
        W2 = init_weights(hidden[1], 1)
        b0 = init_bias(hidden[0])
        b1 = init_bias(hidden[1])
        b2 = init_bias(1)

        # Order matters: predict() unpacks the list positionally.
        self.params = [
            W0, W1, W2,
            b0, b1, b2,
        ]

    def predict(self, x):
        """Return the symbolic network output for the symbolic input ``x``."""
        [
            W0, W1, W2,
            b0, b1, b2,
        ] = self.params

        H0 = T.dot(x, W0) + b0
        H0 = T.nnet.relu(H0)
        H1 = T.dot(H0, W1) + b1
        H1 = T.nnet.relu(H1)

        # Output layer is linear (no activation).
        ret = T.dot(H1, W2) + b2
        return ret

    def _create_func(self):
        """Compile ``self.f``, mapping an input matrix to predictions."""
        x = T.matrix('x')
        y = self.predict(x)
        self.f = theano.function([x], y, allow_input_downcast=True)

    def _create_train_func(self):
        """Compile ``self.train(x, y_in)``: one Adam step, returning the loss."""
        y_in = T.matrix('y_in')
        x = T.matrix('x')
        y = self.predict(x)

        # Mean squared error between prediction and target.
        loss = T.mean((y - y_in) * (y - y_in))

        grad_loss = T.grad(loss, self.params)

        updates = adam(self.params, grad_loss)
        self.train = theano.function(inputs=[x, y_in],
                                     outputs=loss,
                                     updates=updates,
                                     allow_input_downcast=True)

def get_cpu_func():
    """Return a callable that multiplies its two arguments on the host."""
    return lambda a, b: a * b

if __name__ == "__main__":
    # Driver: check the PyCUDA kernel against NumPy, then train the Theano
    # network to fit y = x * x on random batches.
    np.random.seed(12345)

    a = np.random.randn(400)
    b = np.random.randn(400)

    f_cuda = get_pycuda_func()
    f_cpu = get_cpu_func()
    # Bound to `net`, not `T`: rebinding `T` would shadow the `theano.tensor`
    # alias that adam() and test_network.predict() look up at call time.
    net = test_network()

    for k in range(10):
        x = f_cuda(a, b)
        z = f_cpu(a, b)
        print(k)
        print(np.allclose(x, z))

    batch_size = 256
    for k in range(1000):
        x = np.random.rand(batch_size)
        y = x * x
        # The compiled train function takes matrices, so use column vectors.
        x = x.reshape(batch_size, 1)
        y = y.reshape(batch_size, 1)
        loss = net.train(x, y)
        print("k=%d, loss=%g" % (k,loss))

I would get:

$ python3 test_theano_pycuda.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
Traceback (most recent call last):
  File "test_theano_pycuda.py", line 160, in <module>
    x = f_cuda(a,b)
  File "test_theano_pycuda.py", line 32, in f
    gpu_b.gpudata
  File "/usr/local/lib/python3.5/dist-packages/pycuda-2017.1.1-py3.5-linux-x86_64.egg/pycuda/driver.py", line 447, in function_prepared_call
    func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle

I am sure my test_theano_pycuda.py is otherwise correct, because I have tested it by forcing theano to use the CPU instead of CUDA (by modifying ~/.theanorc).

From this, I suspect the problem is that pycuda and theano each create their own CUDA context within the same process.

The theano documentation contains this snippet:

with gpuarray_cuda_context:
    pycuda_context = pycuda.driver.Context.attach()

Where does that gpuarray_cuda_context come from? Is there a working example that I can test with?


Solution

  • gpuarray_cuda_context here is just an existing context from a GpuArray Variable.

    For instance, you can find an example in theano/gpuarray/fft.py, where I think skcuda.misc.init() will call pycuda.driver.Context.attach() or do something similar.