I'm trying to implement the stacked hourglass network in TensorFlow; the Torch implementation already exists here.
I tested it on a Titan X (Pascal) with the default configuration (batch size = 6), and the average training iteration takes about 343 ms.
I benchmarked my TensorFlow implementation with random inputs/outputs:
import tensorflow as tf

class stacked_hourglass():
    def __init__(self, nb_stack, name='stacked_hourglass'):
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0,0],[3,3],[3,3],[0,0]], name='padding')
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2,2], [2,2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')
            hg = [None] * self.nb_stack
            ll = [None] * self.nb_stack
            ll_ = [None] * self.nb_stack
            out = [None] * self.nb_stack
            out_ = [None] * self.nb_stack
            sum_ = [None] * self.nb_stack
            with tf.name_scope('_hourglass_0_with_supervision') as sc:
                hg[0] = self._hourglass(r3, 4, 256, '_hourglass')
                ll[0] = self._conv_bn_relu(hg[0], 256, name='conv_1')
                ll_[0] = self._conv(ll[0], 256, 1, 1, 'VALID', 'll')
                out[0] = self._conv(ll[0], 16, 1, 1, 'VALID', 'out')
                out_[0] = self._conv(out[0], 256, 1, 1, 'VALID', 'out_')
                sum_[0] = tf.add_n([ll_[0], out_[0], r3])
            for i in range(1, self.nb_stack - 1):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision') as sc:
                    hg[i] = self._hourglass(sum_[i-1], 4, 256, '_hourglass')
                    ll[i] = self._conv_bn_relu(hg[i], 256, name='conv_1')
                    ll_[i] = self._conv(ll[i], 256, 1, 1, 'VALID', 'll')
                    out[i] = self._conv(ll[i], 16, 1, 1, 'VALID', 'out')
                    out_[i] = self._conv(out[i], 256, 1, 1, 'VALID', 'out_')
                    sum_[i] = tf.add_n([ll_[i], out_[i], sum_[i-1]])
            with tf.name_scope('_hourglass_' + str(self.nb_stack - 1) + '_with_supervision') as sc:
                hg[self.nb_stack-1] = self._hourglass(sum_[self.nb_stack-2], 4, 256, '_hourglass')
                ll[self.nb_stack-1] = self._conv_bn_relu(hg[self.nb_stack-1], 256, name='conv_1')
                out[self.nb_stack-1] = self._conv(ll[self.nb_stack-1], 16, 1, 1, 'VALID', 'out')
            return tf.stack(out)

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1], padding=pad,
                                data_format='NHWC')
            return conv

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1], padding='SAME',
                                data_format='NHWC')
            norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu, scope=scope)
            return norm

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        with tf.name_scope(name) as scope:
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, nb_filter_out / 2, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, nb_filter_out / 2, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        if inputs.get_shape()[3] == nb_filter_out:
            return inputs
        else:
            with tf.name_scope(name) as scope:
                conv = self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')
                return conv

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        with tf.name_scope(name) as scope:
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2,2], [2,2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n-1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            else:
                return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')


if __name__ == "__main__":
    import os
    import sys
    import numpy as np
    import time
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print "start build model..."
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [8, None, 64, 64, 16])
            output = stacked_hourglass(8, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
            print "build finished..."
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            with tf.device(DEVICE):
                sess.run(init)
            print "test..."
            xarr = np.random.rand(100, 6, 256, 256, 3)
            yarr = np.random.rand(100, 8, 6, 64, 64, 16)
            _time = time.clock()
            with tf.device(DEVICE):
                for u in range(0, 100):
                    sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})
            print "test:", time.clock() - _time
The output is:
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"') for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"') for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation rate=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation rate=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193 to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation rate=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 1400 to 1540
test: 71.733044
This means the average iteration takes about 717 ms, roughly twice as slow as the Torch implementation.
I know TensorFlow is supposed to be slightly slower, but a lot of work has been done to catch up (it is supposed to be quite close now, according to some benchmarks).
Do you know what makes my implementation this slow?
How does the forward-step timing compare? TensorFlow was historically slower than Torch on backprop because automatic differentiation runs on graphs at a finer granularity (individual math ops rather than whole Torch layers), so more ops are generated for the backward pass. This has been mitigated in some cases by adding fused versions of important ops and their gradients.
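A rough way to check this with the benchmark script above (a sketch reusing the _x, y, loss and train_rmsprop ops the question already defines, with a fixed random batch):

# compare forward-only time (sess.run(loss)) with a full training step
x_batch = np.random.rand(6, 256, 256, 3)
y_batch = np.random.rand(8, 6, 64, 64, 16)
# one warm-up step so allocation/autotuning does not skew the numbers
sess.run(train_rmsprop, feed_dict={_x: x_batch, y: y_batch})
t0 = time.time()
for _ in range(100):
    sess.run(loss, feed_dict={_x: x_batch, y: y_batch})           # forward pass only
t1 = time.time()
for _ in range(100):
    sess.run(train_rmsprop, feed_dict={_x: x_batch, y: y_batch})  # forward + backward + update
t2 = time.time()
print "forward: %.0f ms/iter, full step: %.0f ms/iter" % ((t1 - t0) * 10, (t2 - t1) * 10)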
Some ideas:
Make sure you are using tf.fused_batch_norm under the covers (i.e., the fused=True parameter here).
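For example, applied to the batch_norm calls in the question (a sketch; whether the fused argument is available depends on the tf.contrib.layers version you have installed):

# same call as in the question's _conv_bn_relu, with the fused kernel enabled
norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                    activation_fn=tf.nn.relu,
                                    fused=True, scope=scope)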
Use queues instead of feed_dict. That feed_dict incurs an extra copy from the Python runtime to the TensorFlow runtime, so you are actually doing two copies -- Python -> TensorFlow CPU, then TensorFlow CPU -> TensorFlow GPU. For an extra step to absorb the CPU -> GPU transfer latency, there's this.
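A minimal sketch of the queue pattern, assuming the fixed batch size of 6 from the question (the capacity value and the background feeding thread are illustrative, not part of the original script):

# stage batches in a FIFOQueue so the training step never waits on the
# Python -> TensorFlow copy
x_in = tf.placeholder(tf.float32, [6, 256, 256, 3])
y_in = tf.placeholder(tf.float32, [8, 6, 64, 64, 16])
queue = tf.FIFOQueue(capacity=4, dtypes=[tf.float32, tf.float32],
                     shapes=[[6, 256, 256, 3], [8, 6, 64, 64, 16]])
enqueue_op = queue.enqueue([x_in, y_in])
x_batch, y_batch = queue.dequeue()
# build the model on x_batch / y_batch instead of fed placeholders;
# a background Python thread keeps running
#     sess.run(enqueue_op, feed_dict={x_in: ..., y_in: ...})
# while the main loop just calls sess.run(train_rmsprop)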
Looking at timelines can tell you which part is too slow.
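For example, capturing a Chrome trace for a single training step with the TF 1.x tracing API (timeline.json is just an illustrative filename):

from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]},
         options=run_options, run_metadata=run_metadata)
# write a Chrome trace and open it in chrome://tracing
tl = timeline.Timeline(run_metadata.step_stats)
with open('timeline.json', 'w') as f:
    f.write(tl.generate_chrome_trace_format())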
Use tcmalloc and the C implementation of protobufs:
sudo apt-get install google-perftools
export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl