I have some complicated formula, which is easier to implement directly using CUDA code. On the other hand, I need to make use of the theano feature to build a neural network and train it separately.
How can I safely use pycuda and theano together?
The following code works on my machine:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
mod = cudacc.SourceModule("""
__global__ void mul(double *dest, double *a, double *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
mul = mod.get_function("mul")
mul.prepare("PPP")
def f(a,b):
N = len(a)
gpu_a = gpuarray.to_gpu(a)
gpu_b = gpuarray.to_gpu(b)
c = gpuarray.empty((N,),dtype=np.float64)
mul.prepared_call(
(1,1,1),(N,1,1),
c.gpudata,
gpu_a.gpudata,
gpu_b.gpudata
)
return c.get()
return f
def get_theano_func():
a = T.vector('a')
b = T.vector('b')
c = a*b
f = theano.function([a,b],c,allow_input_downcast=True)
return f
def get_cpu_func():
def f(a,b):
return a*b
return f
if __name__ == "__main__":
np.random.seed(12345)
a = np.random.randn(400)
b = np.random.randn(400)
f_cuda = get_pycuda_func()
f_cpu = get_cpu_func()
f_theano = get_theano_func()
for k in range(10):
x = f_cuda(a,b)
y = f_theano(a,b)
z = f_cpu(a,b)
print(k)
print(np.allclose(x,z))
print(np.allclose(y,z))
Output:
$ python3 test_theano_pycuda_simpler.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
0
True
True
1
True
True
2
True
True
3
True
True
4
True
True
5
True
True
6
True
True
7
True
True
8
True
True
9
True
True
But if I make a more complicated theano computation, it does not work. The following DOES NOT WORK:
import numpy as np
import pycuda.autoinit as cuauto
import pycuda.driver as cuda
import pycuda.compiler as cudacc
import pycuda.gpuarray as gpuarray
import theano
import theano.tensor as T
def get_pycuda_func():
mod = cudacc.SourceModule("""
__global__ void mul(double *dest, double *a, double *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
mul = mod.get_function("mul")
mul.prepare("PPP")
def f(a,b):
N = len(a)
gpu_a = gpuarray.to_gpu(a)
gpu_b = gpuarray.to_gpu(b)
c = gpuarray.empty((N,),dtype=np.float64)
mul.prepared_call(
(1,1,1),(N,1,1),
c.gpudata,
gpu_a.gpudata,
gpu_b.gpudata
)
return c.get()
return f
floatX=theano.config.floatX
def init_bias(size):
tmp = np.random.rand(size)
return theano.shared(np.asarray(tmp,dtype=floatX))
def init_weights(in_size,out_size):
s = np.sqrt(2./(in_size+out_size))
tmp = np.random.normal(loc=0.,scale=s,size=(in_size,out_size))
return theano.shared(np.asarray(tmp,dtype=floatX))
def adam(params, gparams,learning_rate = 0.0001, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
updates = []
t_pre = theano.shared(np.asarray(.0, dtype=theano.config.floatX))
t = t_pre + 1
a_t = learning_rate * T.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
for (p,g) in zip(params, gparams):
v = p.get_value(borrow = True)
m_pre = theano.shared(np.zeros(v.shape, dtype = v.dtype), broadcastable = p.broadcastable)
v_pre = theano.shared(np.zeros(v.shape, dtype = v.dtype), broadcastable = p.broadcastable)
m_t = beta1 * m_pre + (1 - beta1) * g
v_t = beta2 * v_pre + (1 - beta2) * g ** 2
step = a_t * m_t / (T.sqrt(v_t) + epsilon)
p_update = p - step
updates.append((m_pre, m_t))
updates.append((v_pre, v_t))
updates.append((p, p_update))
updates.append((t_pre, t))
return updates
class test_network:
def __init__(self,hidden=[100,100]):
self.hidden = hidden
self._create_params()
self._create_train_func()
self._create_func()
def _create_params(self):
hidden = self.hidden
W0 = init_weights(1,hidden[0])
W1 = init_weights(hidden[0],hidden[1])
W2 = init_weights(hidden[1],1)
b0 = init_bias(hidden[0])
b1 = init_bias(hidden[1])
b2 = init_bias(1)
self.params = [
W0,W1,W2,
b0,b1,b2,
]
def predict(self,x):
[
W0,W1,W2,
b0,b1,b2,
] = self.params
H0 = T.dot(x,W0) + b0
H0 = T.nnet.relu(H0)
H1 = T.dot(H0,W1) + b1
H1 = T.nnet.relu(H1)
ret = T.dot(H1,W2) + b2
return ret
def _create_func(self):
x = T.matrix('x')
y = self.predict(x)
self.f = theano.function([x],y,allow_input_downcast=True)
def _create_train_func(self):
y_in = T.matrix('y_in')
x = T.matrix('x')
y = self.predict(x)
loss = T.mean((y-y_in)*(y-y_in))
grad_loss = T.grad(loss,self.params)
updates = adam(self.params,grad_loss)
self.train = theano.function(inputs=[x,y_in],
outputs=loss,
updates=updates,
allow_input_downcast=True)
def get_cpu_func():
def f(a,b):
return a*b
return f
if __name__ == "__main__":
np.random.seed(12345)
a = np.random.randn(400)
b = np.random.randn(400)
f_cuda = get_pycuda_func()
f_cpu = get_cpu_func()
T = test_network()
for k in range(10):
x = f_cuda(a,b)
z = f_cpu(a,b)
print(k)
print(np.allclose(x,z))
batch_size = 256
for k in range(1000):
x = np.random.rand(batch_size)
y = x*x
x = x.reshape(batch_size,1)
y = y.reshape(batch_size,1)
loss = T.train(x,y)
print("k=%d, loss=%g" % (k,loss))
I would get:
$ python3 test_theano_pycuda.py
Using cuDNN version 7003 on context None
Mapped name None to device cuda: GeForce GTX TITAN Black (0000:01:00.0)
Traceback (most recent call last):
File "test_theano_pycuda.py", line 160, in <module>
x = f_cuda(a,b)
File "test_theano_pycuda.py", line 32, in f
gpu_b.gpudata
File "/usr/local/lib/python3.5/dist-packages/pycuda-2017.1.1-py3.5-linux-x86_64.egg/pycuda/driver.py", line 447, in function_prepared_call
func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
I am sure my test_theano_pycuda.py works because I have tested it by forcing theano to use CPU instead of cuda. (By modifying ~/.theanorc):
From this. I bet it should be related to the problem that pycuda and theano are both creating a context within one process.
In theano document,
with gpuarray_cuda_context:
pycuda_context = pycuda.driver.Context.attach()
where does that gpuarray_cuda_context
come from? Are there any workable example that I can test with?
gpuarray_cuda_context
here is just an existing context from a GpuArray Variable.
For instance, you can find an example in theano/gpuarray/fft.py
, where I think skcuda.misc.init()
will call pycuda.driver.Context.attach()
or do something similar.