I have a CNN which runs well on 1 GPU. I have now moved to another computer which has 2 GPUs, and I would like to train my network using both GPUs to save time. How can I do that?
I read https://www.tensorflow.org/tutorials/using_gpu, but I think the example there is too simple and, honestly, I don't know how to apply it to my real network.
Could anyone give me a simple illustration using my network, please? (I'm building an autoencoder.)
Thank you very much!
graphCNN = tf.Graph()
with graphCNN.as_default():
    # Placeholders: network input, expected output, and a dropout value.
    # Input and target share the same (batch, img_w, img_h, img_ch) shape.
    image_shape = (None, img_w, img_h, img_ch)
    x = tf.placeholder(tf.float32, shape=image_shape, name="X")
    y_ = tf.placeholder(tf.float32, shape=image_shape, name="Y")
    # NOTE: `dropout` is not consumed anywhere in the code shown here.
    dropout = tf.placeholder(tf.float32)

    def model(data):
        """Build the autoencoder on top of `data` and return its output tensor.

        The network is eight ConvLayer stages (encoder), eight DeconvLayer
        stages (decoder) and a final ConvLayer named "conv_out". Every layer
        uses kernel_size=3 and acti=True.
        """
        # (depth_in, depth_out, name) for each encoder stage, in build order.
        encoder_spec = [
            (1, 64, "c64"),
            (64, 128, "c128"),
            (128, 256, "c256"),
            (256, 512, "c512_1"),
            (512, 512, "c512_2"),
            (512, 512, "c512_3"),
            (512, 512, "c512_4"),
            (512, 512, "c512_5"),
        ]
        # (depth_in, depth_out, name) for each decoder stage, in build order.
        decoder_spec = [
            (512, 512, "dc512_5"),
            (512, 512, "dc512_4"),
            (512, 512, "dc512_3"),
            (512, 512, "dc512_2"),
            (512, 512, "dc512_1"),
            (512, 256, "dc256"),
            (256, 128, "dc128"),
            (128, 64, "dc64"),
        ]

        net = data
        for d_in, d_out, layer_name in encoder_spec:
            net = ConvLayer(net, depth_in=d_in, depth_out=d_out,
                            name=layer_name, kernel_size=3, acti=True)
        for d_in, d_out, layer_name in decoder_spec:
            net = DeconvLayer(net, depth_in=d_in, depth_out=d_out,
                              name=layer_name, kernel_size=3, acti=True)

        return ConvLayer(net, depth_in=64, depth_out=1, name="conv_out",
                         kernel_size=3, acti=True)

    # Predictions, plus an image summary of up to 6 outputs.
    y = model(x)
    y_image = tf.reshape(y, [-1, img_w, img_h, 1])
    tf.summary.image('output', y_image, 6)

    # Loss: sum of squared errors over the whole batch, divided by the
    # number of values in one image (img_w * img_h * img_ch).
    pixels_per_image = img_w * img_h * img_ch
    squared_error = tf.pow(y - y_, 2)
    loss = tf.reduce_sum(squared_error) / pixels_per_image
    loss_summary = tf.summary.scalar("Training_Loss", loss)

    # Optimizer.
    with tf.name_scope("train"):
        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
        train_step = optimizer.minimize(loss)
In case you want to see more details, here are the layer helper functions:
def ConvLayer(input, depth_in, depth_out, name="conv", kernel_size=3, acti=True):
    """Convolution + bias (+ optional ReLU), followed by max pooling.

    Args:
        input: tensor fed to `tf.nn.conv2d`.
        depth_in: number of input channels of the kernel.
        depth_out: number of output channels of the kernel (and bias size).
        name: name scope under which the layer's ops are created.
        kernel_size: height and width of the square kernel.
        acti: when equal to True, ReLU is applied to the biased convolution.

    Returns:
        The result of `max_pool(result, 2)`. `max_pool` is a helper defined
        elsewhere -- presumably 2x2 pooling; confirm against its definition.
    """
    with tf.name_scope(name):
        kernel_shape = [kernel_size, kernel_size, depth_in, depth_out]
        kernel = tf.Variable(tf.truncated_normal(kernel_shape, stddev=0.1),
                             name="W")
        bias = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        # Stride-1 convolution with SAME padding.
        conv = tf.nn.conv2d(input, kernel, strides=[1, 1, 1, 1], padding="SAME")

        tf.summary.histogram("weights", kernel)
        tf.summary.histogram("biases", bias)

        biased = conv + bias
        if acti == True:
            result = tf.nn.relu(biased)
            tf.summary.histogram("activations", result)
        else:
            result = biased

        return max_pool(result, 2)
.
def DeconvLayer(input, depth_in, depth_out, name="deconv", kernel_size=3, acti=True):
    """Transposed convolution that doubles height and width, + bias (+ ReLU).

    Args:
        input: 4-D tensor fed to `tf.nn.conv2d_transpose`; its first three
            dimensions are read as (batch, height, width).
        depth_in: number of channels of `input`.
        depth_out: number of channels of the returned tensor.
        name: name scope under which the layer's ops are created.
        kernel_size: height and width of the square kernel.
        acti: when true, ReLU is applied to the biased result.

    Returns:
        Tensor with twice the height and width of `input` and `depth_out`
        channels.
    """
    with tf.name_scope(name):
        # conv2d_transpose filters are laid out as
        # [height, width, output_channels, input_channels].
        w = tf.Variable(
            tf.truncated_normal([kernel_size, kernel_size, depth_out, depth_in],
                                stddev=0.1),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        input_shape = tf.shape(input)
        # The channel dimension must equal the filter's output depth
        # (depth_out); deriving it from input_shape[3] // 2 is only right
        # when depth_out happens to be depth_in / 2.
        output_shape = tf.stack([input_shape[0],
                                 input_shape[1] * 2,
                                 input_shape[2] * 2,
                                 depth_out])
        # With SAME padding the output is `stride` times the input size, so
        # a 2x larger output_shape requires a stride of 2 along height and
        # width; a stride of 1 is rejected at run time.
        deconv = tf.nn.conv2d_transpose(input, w, output_shape,
                                        strides=[1, 2, 2, 1], padding='SAME')

        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)

        if acti:
            result = tf.nn.relu(deconv + b)
            tf.summary.histogram("activations", result)
        else:
            result = deconv + b
        return result
How to implement CNN (Convolutional Neural Network) on Multiple GPUs?
As Quoted from "Training a Model Using Multiple GPU Cards" (Tutorial from Tensorflow)
- Place an individual model replica on each GPU.
- Update model parameters synchronously by waiting for all GPUs to finish processing a batch of data.
To boost performance further, it helps to understand the data flow between main memory, the CPU and the GPU; have a look at this answer: "Why should preprocessing be done on CPU rather than GPU?": https://stackoverflow.com/a/44377741/4190159