I'm trying to implement the stacked hourglass network in TensorFlow; the Torch implementation already exists here.
I tested it on a Titan X (Pascal) with the default configuration (batch size = 6), and the average training iteration takes about 343 ms.
I benchmarked my TensorFlow implementation with random inputs/outputs:
import tensorflow as tf

class stacked_hourglass():
    def __init__(self, nb_stack, name='stacked_hourglass'):
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0,0],[3,3],[3,3],[0,0]], name='padding')
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2,2], [2,2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')
            hg = [None] * self.nb_stack
            ll = [None] * self.nb_stack
            ll_ = [None] * self.nb_stack
            out = [None] * self.nb_stack
            out_ = [None] * self.nb_stack
            sum_ = [None] * self.nb_stack
            with tf.name_scope('_hourglass_0_with_supervision') as sc:
                hg[0] = self._hourglass(r3, 4, 256, '_hourglass')
                ll[0] = self._conv_bn_relu(hg[0], 256, name='conv_1')
                ll_[0] = self._conv(ll[0], 256, 1, 1, 'VALID', 'll')
                out[0] = self._conv(ll[0], 16, 1, 1, 'VALID', 'out')
                out_[0] = self._conv(out[0], 256, 1, 1, 'VALID', 'out_')
                sum_[0] = tf.add_n([ll_[0], out_[0], r3])
            for i in range(1, self.nb_stack - 1):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision') as sc:
                    hg[i] = self._hourglass(sum_[i-1], 4, 256, '_hourglass')
                    ll[i] = self._conv_bn_relu(hg[i], 256, name='conv_1')
                    ll_[i] = self._conv(ll[i], 256, 1, 1, 'VALID', 'll')
                    out[i] = self._conv(ll[i], 16, 1, 1, 'VALID', 'out')
                    out_[i] = self._conv(out[i], 256, 1, 1, 'VALID', 'out_')
                    sum_[i] = tf.add_n([ll_[i], out_[i], sum_[i-1]])
            with tf.name_scope('_hourglass_' + str(self.nb_stack - 1) + '_with_supervision') as sc:
                hg[self.nb_stack-1] = self._hourglass(sum_[self.nb_stack-2], 4, 256, '_hourglass')
                ll[self.nb_stack-1] = self._conv_bn_relu(hg[self.nb_stack-1], 256, name='conv_1')
                out[self.nb_stack-1] = self._conv(ll[self.nb_stack-1], 16, 1, 1, 'VALID', 'out')
            return tf.stack(out)

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1], padding=pad,
                                data_format='NHWC')
            return conv

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], nb_filter]),
                name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1], padding='SAME',
                                data_format='NHWC')
            norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                                activation_fn=tf.nn.relu, scope=scope)
            return norm

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        with tf.name_scope(name) as scope:
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, nb_filter_out / 2, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, nb_filter_out / 2, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5,
                                                     activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        if inputs.get_shape()[3] == nb_filter_out:
            return inputs
        else:
            with tf.name_scope(name) as scope:
                conv = self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')
                return conv

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        with tf.name_scope(name) as scope:
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2,2], [2,2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n-1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            else:
                return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')


if __name__ == "__main__":
    import os
    import sys
    import numpy as np
    import time
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print "start build model..."
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [8, None, 64, 64, 16])
            output = stacked_hourglass(8, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
            print "build finished..."
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            with tf.device(DEVICE):
                sess.run(init)
            print "test..."
            xarr = np.random.rand(100, 6, 256, 256, 3)
            yarr = np.random.rand(100, 8, 6, 64, 64, 16)
            _time = time.clock()
            with tf.device(DEVICE):
                for u in range(0, 100):
                    sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})
            print "test:", time.clock() - _time
The output is:
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"') for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"') for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation rate=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation rate=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193 to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation rate=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 1400 to 1540
test: 71.733044
This means the average iteration takes about 717 ms, roughly twice as slow as the Torch implementation.
I know TensorFlow is supposed to be slightly slower, but a lot of work has been done to catch up (it is supposed to be quite close now, according to some benchmarks).
Do you know what makes my implementation this slow?
How does the forward-step timing compare? TensorFlow was historically slower than Torch on backprop because automatic differentiation runs on graphs at a finer granularity (individual math ops rather than whole Torch layers), so more ops are generated for the backward pass. This has been mitigated in some cases by adding fused versions of important ops and their gradients.
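A rough way to check this with the benchmark script above (a sketch reusing the _x, y, loss and train_rmsprop ops the question already defines, with a fixed random batch):

# compare forward-only time (sess.run(loss)) with a full training step
x_batch = np.random.rand(6, 256, 256, 3)
y_batch = np.random.rand(8, 6, 64, 64, 16)
# one warm-up step so allocation/autotuning does not skew the numbers
sess.run(train_rmsprop, feed_dict={_x: x_batch, y: y_batch})
t0 = time.time()
for _ in range(100):
    sess.run(loss, feed_dict={_x: x_batch, y: y_batch})           # forward pass only
t1 = time.time()
for _ in range(100):
    sess.run(train_rmsprop, feed_dict={_x: x_batch, y: y_batch})  # forward + backward + update
t2 = time.time()
print "forward: %.0f ms/iter, full step: %.0f ms/iter" % ((t1 - t0) * 10, (t2 - t1) * 10)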
Some ideas:
Make sure you are using tf.fused_batch_norm under the covers (i.e., the fused=True parameter here).
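For example, applied to the batch_norm calls in the question (a sketch; whether the fused argument is available depends on the tf.contrib.layers version you have installed):

# same call as in the question's _conv_bn_relu, with the fused kernel enabled
norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                    activation_fn=tf.nn.relu,
                                    fused=True, scope=scope)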
Use queues instead of feed_dict. That feed_dict incurs an extra copy from the Python runtime to the TensorFlow runtime, so you are actually doing two copies -- Python -> TensorFlow CPU, then TensorFlow CPU -> TensorFlow GPU. For an extra step to absorb the CPU -> GPU transfer latency, there's this.
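A minimal sketch of the queue pattern, assuming the fixed batch size of 6 from the question (the capacity value and the background feeding thread are illustrative, not part of the original script):

# stage batches in a FIFOQueue so the training step never waits on the
# Python -> TensorFlow copy
x_in = tf.placeholder(tf.float32, [6, 256, 256, 3])
y_in = tf.placeholder(tf.float32, [8, 6, 64, 64, 16])
queue = tf.FIFOQueue(capacity=4, dtypes=[tf.float32, tf.float32],
                     shapes=[[6, 256, 256, 3], [8, 6, 64, 64, 16]])
enqueue_op = queue.enqueue([x_in, y_in])
x_batch, y_batch = queue.dequeue()
# build the model on x_batch / y_batch instead of fed placeholders;
# a background Python thread keeps running
#     sess.run(enqueue_op, feed_dict={x_in: ..., y_in: ...})
# while the main loop just calls sess.run(train_rmsprop)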
Looking at timelines can tell you which part is too slow.
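For example, capturing a Chrome trace for a single training step with the TF 1.x tracing API (timeline.json is just an illustrative filename):

from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]},
         options=run_options, run_metadata=run_metadata)
# write a Chrome trace and open it in chrome://tracing
tl = timeline.Timeline(run_metadata.step_stats)
with open('timeline.json', 'w') as f:
    f.write(tl.generate_chrome_trace_format())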
Use tcmalloc and the C implementation of protobufs:
sudo apt-get install google-perftools
export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"
pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl