TensorFlow: How can I take advantage of multiple GPUs?

I have a CNN that runs well on one GPU. I have now moved to another computer that has two GPUs, and I would like to train my network on both of them to save time. How can I do that?

I read https://www.tensorflow.org/tutorials/using_gpu, but I think the example there is too simple, and honestly I don't know how to apply it to my real network.

Could anyone give me a simple illustration using my network, please? (It is an autoencoder.)

Thank you very much!

graphCNN = tf.Graph()
with graphCNN.as_default():
    # Input images. img_w, img_h and img_ch are defined outside this snippet.
    x = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="X")
    # Expected output (reconstruction target); declared with the same shape as x.
    y_ = tf.placeholder(tf.float32, shape=(None, img_w, img_h, img_ch), name="Y")
    # Dropout placeholder; not referenced by the graph built in this snippet.
    dropout = tf.placeholder(tf.float32)

    ### Model
    def model(data):
        """Build the encoder/decoder stack on `data` and return its output.

        The encoder chains eight ConvLayer calls (img_ch -> 64 -> 128 -> 256
        -> 512 x5), the decoder chains eight DeconvLayer calls back down to
        64 channels, and a final ConvLayer maps 64 channels to img_ch.

        The channel count of the first and last layers follows img_ch so the
        model agrees with the placeholders above; for img_ch == 1 this is
        exactly the previously hard-coded value.
        """
        ### Encoder
        c64 = ConvLayer(data, depth_in=img_ch, depth_out=64, name="c64", kernel_size=3, acti=True)
        c128 = ConvLayer(c64, depth_in=64, depth_out=128, name="c128", kernel_size=3, acti=True)
        c256 = ConvLayer(c128, depth_in=128, depth_out=256, name="c256", kernel_size=3, acti=True)
        c512_1 = ConvLayer(c256, depth_in=256, depth_out=512, name="c512_1", kernel_size=3, acti=True)
        c512_2 = ConvLayer(c512_1, depth_in=512, depth_out=512, name="c512_2", kernel_size=3, acti=True)
        c512_3 = ConvLayer(c512_2, depth_in=512, depth_out=512, name="c512_3", kernel_size=3, acti=True)
        c512_4 = ConvLayer(c512_3, depth_in=512, depth_out=512, name="c512_4", kernel_size=3, acti=True)
        c512_5 = ConvLayer(c512_4, depth_in=512, depth_out=512, name="c512_5", kernel_size=3, acti=True)

        ### Decoder
        dc512_5 = DeconvLayer(c512_5, depth_in=512, depth_out=512, name="dc512_5", kernel_size=3, acti=True)
        dc512_4 = DeconvLayer(dc512_5, depth_in=512, depth_out=512, name="dc512_4", kernel_size=3, acti=True)
        dc512_3 = DeconvLayer(dc512_4, depth_in=512, depth_out=512, name="dc512_3", kernel_size=3, acti=True)
        dc512_2 = DeconvLayer(dc512_3, depth_in=512, depth_out=512, name="dc512_2", kernel_size=3, acti=True)
        dc512_1 = DeconvLayer(dc512_2, depth_in=512, depth_out=512, name="dc512_1", kernel_size=3, acti=True)
        dc256 = DeconvLayer(dc512_1, depth_in=512, depth_out=256, name="dc256", kernel_size=3, acti=True)
        dc128 = DeconvLayer(dc256, depth_in=256, depth_out=128, name="dc128", kernel_size=3, acti=True)
        dc64 = DeconvLayer(dc128, depth_in=128, depth_out=64, name="dc64", kernel_size=3, acti=True)

        # NOTE(review): ConvLayer also pools its result, so this output layer
        # is pooled too - confirm the output size still matches y_.
        output = ConvLayer(dc64, depth_in=64, depth_out=img_ch, name="conv_out", kernel_size=3, acti=True)
        return output

    # Predictions
    y = model(x)
    # Reshape for the image summary; at most 6 images are written per step.
    # NOTE(review): tf.summary.image is documented for 1, 3 or 4 channels.
    y_image = tf.reshape(y, [-1, img_w, img_h, img_ch])
    tf.summary.image('output', y_image, 6)

    # Loss: squared error summed over every element of the batch, divided by
    # the number of values in one image. It is not divided by the batch size,
    # so it scales with the number of images fed.
    loss = tf.reduce_sum(tf.pow(y - y_, 2)) / (img_w * img_h * img_ch)
    loss_summary = tf.summary.scalar("Training_Loss", loss)

    # Optimizer. learn_rate is defined outside this snippet.
    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)

In case you want to see more details, here are the layer helpers:

def ConvLayer(input, depth_in, depth_out, name="conv", kernel_size=3, acti=True):
    """Build a stride-1, SAME-padded 2-D convolution followed by max_pool.

    Args:
        input: Tensor fed to tf.nn.conv2d; its channel count must equal
            depth_in.
        depth_in: Number of input channels of the kernel.
        depth_out: Number of output channels of the kernel (and bias length).
        name: Name scope under which the variables, ops and summaries are
            created.
        kernel_size: Height and width of the square kernel.
        acti: When truthy, apply ReLU to conv + bias and record an
            "activations" histogram; otherwise return the linear result.

    Returns:
        The result of max_pool(result, 2). max_pool is defined elsewhere in
        the file; presumably 2x2 pooling - confirm against that helper.
        Pooling is applied regardless of acti.
    """
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal(
                [kernel_size, kernel_size, depth_in, depth_out], stddev=0.1),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        if acti:
            result = tf.nn.relu(conv + b)
            tf.summary.histogram("activations", result)
        else:
            result = conv + b

        # Pool inside the name scope so the pooling op is grouped with the layer.
        return max_pool(result, 2)

def DeconvLayer(input, depth_in, depth_out, name="deconv", kernel_size=3, acti=True):
    """Build a transposed convolution that doubles height and width.

    Args:
        input: 4-D tensor fed to tf.nn.conv2d_transpose; its channel count
            must equal depth_in.
        depth_in: Number of channels of `input`.
        depth_out: Number of channels of the result (and bias length).
        name: Name scope under which the variables, ops and summaries are
            created.
        kernel_size: Height and width of the square kernel.
        acti: When truthy, apply ReLU to deconv + bias and record an
            "activations" histogram; otherwise return the linear result.

    Returns:
        Tensor whose requested shape is
        [batch, 2 * height, 2 * width, depth_out].
    """
    with tf.name_scope(name):
        # conv2d_transpose filters are laid out [h, w, out_channels, in_channels].
        w = tf.Variable(
            tf.truncated_normal(
                [kernel_size, kernel_size, depth_out, depth_in], stddev=0.1),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[depth_out]), name="B")

        input_shape = tf.shape(input)
        # The channel count must be depth_out to match the filter; halving the
        # input channels was wrong whenever depth_out != depth_in // 2
        # (e.g. the 512 -> 512 layers).
        output_shape = tf.stack(
            [input_shape[0], input_shape[1] * 2, input_shape[2] * 2, depth_out])
        # Stride 2 is what makes a SAME-padded transposed convolution double
        # the spatial size; stride 1 is incompatible with the doubled
        # output_shape.
        deconv = tf.nn.conv2d_transpose(
            input, w, output_shape, strides=[1, 2, 2, 1], padding='SAME')

        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        if acti:
            result = tf.nn.relu(deconv + b)
            tf.summary.histogram("activations", result)
        else:
            result = deconv + b
        return result

Solution

How to implement CNN (Convolutional Neural Network) on Multiple GPUs?

As quoted from "Training a Model Using Multiple GPU Cards" (a TensorFlow tutorial):

Place an individual model replica on each GPU.

Update model parameters synchronously by waiting for all GPUs to finish processing a batch of data.

To boost performance, it helps to understand the data flow between main memory, the CPU and the GPU; have a look at this answer: "Why should preprocessing be done on CPU rather than GPU?": https://stackoverflow.com/a/44377741/4190159