Tags: tensorflow, tensorboard, cudnn, semantic-segmentation

Nan in summary histogram for: deconv2/biases


The original size of my images is 3900 x 6000 x 3. I extract overlapping patches, giving an array of shape (232024, 28, 28, 3), and then split them into batches of size 1000.
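For reference, a minimal numpy sketch of how such overlapping patches might be extracted (the stride is an assumption; the question does not state it, but any stride smaller than the patch size yields overlapping patches):

import numpy as np

def extract_patches(image, patch_size = 28, stride = 14):
    # Slide a patch_size x patch_size window across the image.
    h, w, _ = image.shape
    patches = [image[y:y + patch_size, x:x + patch_size, :]
               for y in range(0, h - patch_size + 1, stride)
               for x in range(0, w - patch_size + 1, stride)]
    return np.stack(patches)  # (num_patches, patch_size, patch_size, 3)

My CNN model for semantic segmentation is as follows: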

import math
import numpy as np
import tensorflow as tf

def conv_layer(inputs, filters, kernel_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "conv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()

        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], input_shape[3], filters], dtype = tf.float32)

        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")

        conv2d = tf.nn.conv2d(input = tf.cast(inputs, dtype = tf.float32), filter = filter, strides = [1, strides, strides, 1], padding = padding)

        activation = tf.nn.relu(conv2d + bias)

        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)

        return tf.cast(activation, dtype = tf.float16)

def deconv_layer(inputs, filters, kernel_size, output_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "deconv"):
    with tf.name_scope(name):

        input_shape = inputs.shape.as_list()
        deconv_shape = tf.stack([tf.shape(inputs)[0], output_size[0], output_size[1],filters])

        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], filters, input_shape[3]], dtype = tf.float32)

        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")

        print("bias:")
        print(bias)

        conv2d_transpose = tf.nn.conv2d_transpose(value = tf.cast(inputs, dtype = tf.float32), 
                                                  filter = filter, 
                                                  strides = [1, strides, strides, 1], 
                                                  output_shape=deconv_shape,
                                                  padding = padding)

        activation = tf.nn.relu(conv2d_transpose + bias)

        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)

        return tf.cast(activation, dtype = tf.float16)

def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [20, 50, 90]
    conv_sizes = []

    tf.summary.image('input', features, batch_size)

    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                        filters=conv_filters[0],
                        kernel_size=[5, 5],
                        bias_constant = bias_constant,
                        name = "conv1")

    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                        filters = conv_filters[1],
                        kernel_size = [5, 5],
                        strides = 2,
                        bias_constant = bias_constant,
                        name = "conv2")

    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)
    # Convolutional Layer 3
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                        filters = conv_filters[2],
                        kernel_size = [5, 5],
                        bias_constant = bias_constant,
                        strides = 2,
                        name = "conv3")

    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 3
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                            filters = conv_filters[1],
                            kernel_size = [5, 5],
                            bias_constant = bias_constant,
                            strides = 2,
                            output_size = [conv_sizes[1][1], conv_sizes[1][2]],
                            name = "deconv3")
    print(deconv.shape)
    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = deconv,
                            filters = conv_filters[0],
                            kernel_size = [5, 5],
                            bias_constant = bias_constant,
                            strides = 2,
                            output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                            name = "deconv2")
    print(deconv.shape)
    deconv = deconv_layer(inputs = deconv,
                            filters = 3,
                            kernel_size = [5, 5],
                            output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                            bias_constant = bias_constant,
                            name = "deconv1")

    print(deconv.shape)
    return deconv

epochs = 1000
learning_rate = 1e-50

image, label = tf.train.slice_input_producer([features, labels], shuffle = False)

BATCH_SIZE = 1000
THREAD_NUM = 5
MIN_AFTER_DEQUEUE = 10000
queue_capacity = MIN_AFTER_DEQUEUE + THREAD_NUM * BATCH_SIZE


image_batch, label_batch = tf.train.batch(tensors = [image, label],
                                            batch_size = BATCH_SIZE,
                                            capacity = queue_capacity,
                                            num_threads = THREAD_NUM,
                                            allow_smaller_final_batch = True)

output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)

#cost
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = output, labels = label_batch)
    cost = tf.reduce_mean( cross_entropy )
#     return cost, optimizer, accr
    tf.summary.scalar("xent", cost)

#optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Accuracy
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(output, 1))
    accr = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
    tf.summary.scalar("accuracy", accr)

merged_summary = tf.summary.merge_all()

# Session configs
config = tf.ConfigProto()
config.log_device_placement = True
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction=0.8

# Initialize session
sess = tf.Session(config=config)

sess.run(tf.global_variables_initializer())

# Summary writer and saver for the training loop
writer = tf.summary.FileWriter("./logs", sess.graph)  # log directory is arbitrary
saver = tf.train.Saver()

coord = tf.train.Coordinator()
enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)

try:
    for epoch in range(epochs):
        if coord.should_stop():
            break

        epoch_loss = 0
        train_loss = []; train_accuracy = []

        s = sess.run(merged_summary)
        writer.add_summary(s, epoch)

        for batch in range(math.ceil(features.shape.as_list()[0]/BATCH_SIZE)):
            _, sess_cost, sess_accuracy = sess.run([optimizer, cost, accr])

            train_loss.append(sess_cost)
            train_accuracy.append(sess_accuracy)

        train_loss = np.mean(train_loss)
        train_accuracy = np.mean(train_accuracy)

        saver.save(sess, "./semantic_seg_model_1", global_step=epoch)

        print("[%02d/%02d] trainLoss: %.4f trainAcc: %.2f"
              % (epoch + 1, epochs, train_loss, train_accuracy))

except Exception as e:
    # Report exceptions to the coordinator.
    coord.request_stop(e)

finally:
    # Terminate as usual. It is safe to call `coord.request_stop()` twice.
    coord.request_stop()
    coord.join(enqueue_threads)

sess.close()

I get an error when I start the training session. The error is as follows:

[01/1000] trainLoss: 0.0000 trainAcc: 1.00

INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.InvalidArgumentError'>, Nan in summary histogram for: deconv2/biases [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Caused by op 'deconv2/biases', defined at: File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 193, in _run_module_as_main "main", mod_spec) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code exec(code, run_globals) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py", line 16, in app.launch_new_instance() File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance app.start() File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", line 478, in start self.io_loop.start() File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start super(ZMQIOLoop, self).start() File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", line 888, in start handler_func(fd_obj, events) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper return fn(*args, **kwargs) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events self._handle_recv() File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv self._run_callback(callback, msg) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback callback(*args, **kwargs) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper return fn(*args, **kwargs) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 281, in dispatcher return self.dispatch_shell(stream, msg) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 232, in dispatch_shell handler(stream, idents, msg) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 397, in execute_request user_expressions, allow_stdin) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute res = shell.run_cell(code, store_history=store_history, silent=silent) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell interactivity=interactivity, compiler=compiler, result=result) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes if self.run_code(code, result): File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code exec(code_obj, self.user_global_ns, self.user_ns) File "", line 1, in output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE) File "", line 107, in semantic_seg_model name = 
"deconv2") File "", line 78, in deconv_layer tf.summary.histogram("biases", bias) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", line 192, in histogram tag=tag, values=values, name=scope) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 187, in _histogram_summary "HistogramSummary", tag=tag, values=values, name=name) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper op_def=op_def) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op op_def=op_def) File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in init self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Nan in summary histogram for: deconv2/biases [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Number of iterations completed this epoch: 0 --------------------------------------------------------------------------- InvalidArgumentError Traceback (most recent call last) c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args) 1322 try: -> 1323 return fn(*args) 1324 except errors.OpError as e:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata) 1301 feed_dict, fetch_list, target_list, -> 1302 status, run_metadata) 1303

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\errors_impl.py in exit(self, type_arg, value_arg, traceback_arg) 472 compat.as_text(c_api.TF_Message(self.status.status)), --> 473 c_api.TF_GetCode(self.status.status)) 474 # Delete the underlying status object from memory otherwise it stays alive

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
[[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

During handling of the above exception, another exception occurred:

InvalidArgumentError Traceback (most recent call last) in () 40 # Terminate as usual. It is safe to call coord.request_stop() twice. 41 coord.request_stop() ---> 42 coord.join(enqueue_threads) 43 44 sess.close()

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\training\coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads) 387 self._registered_threads = set() 388 if self._exc_info_to_raise: --> 389 six.reraise(*self._exc_info_to_raise) 390 elif stragglers: 391 if ignore_live_threads:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\six.py in reraise(tp, value, tb) 691 if value.traceback is not tb: 692 raise value.with_traceback(tb) --> 693 raise value 694 finally: 695 value = None

in () 13 train_loss = []; train_accuracy = [] 14 ---> 15 s = sess.run(merged_summary) 16 writer.add_summary(s, epoch) 17

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata) 887 try: 888 result = self._run(None, fetches, feed_dict, options_ptr, --> 889 run_metadata_ptr) 890 if run_metadata: 891 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1118 if final_fetches or final_targets or (handle and feed_dict_tensor): 1119 results = self._do_run(handle, final_targets, final_fetches, -> 1120 feed_dict_tensor, options, run_metadata) 1121 else: 1122 results = []

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata) 1315 if handle is None: 1316 return self._do_call(_run_fn, self._session, feeds, fetches, targets, -> 1317 options, run_metadata) 1318 else: 1319 return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args) 1334 except KeyError: 1335 pass -> 1336 raise type(e)(node_def, op, message) 1337 1338 def _extend_graph(self):

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
[[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Caused by op 'deconv2/biases', defined at: (same traceback as shown above)

Someone on the TensorFlow GitHub issues suggested reducing the learning rate because the model had diverged, but that didn't help. Another suggested changing the dtype from float16 to float32, since float16 can be numerically problematic. When I change the dtype of the data to float32, I get the following error in the Python log console:

[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB. [libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.

This very same error occurs when I try to increase the width and height of the overlapping image patches. I have also tried reducing BATCH_SIZE, but that didn't help.
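(For reference: tf.train.slice_input_producer converts the full features and labels arrays into constants embedded in the graph, and a float32 copy of a (232024, 28, 28, 3) array is about 2.2 GB, just over protobuf's 2 GB serialized-graph limit, while the float16 copy fits at about 1.1 GB; this also explains why larger patches trigger the same error. One common workaround is to keep the data out of the graph and feed it through placeholders into tf.data, which is available in TensorFlow 1.4. A sketch, with assumed shapes and names:)

features_ph = tf.placeholder(tf.float32, shape = [None, 28, 28, 3])
labels_ph = tf.placeholder(tf.float32, shape = [None, 28, 28, 3])

dataset = tf.data.Dataset.from_tensor_slices((features_ph, labels_ph))
dataset = dataset.batch(BATCH_SIZE).repeat()
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()

# The arrays are fed once when the iterator is initialized instead of
# being serialized into the GraphDef:
# sess.run(iterator.initializer,
#          feed_dict = {features_ph: features, labels_ph: labels})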

I have a 4 GB NVIDIA GeForce GTX 960M dedicated graphics card and 16 GB RAM, with an Intel Core i7-6700HQ CPU @ 2.60 GHz. Python version is 3.6.4 and TensorFlow version is 1.4 (GPU build).

Update 1: Updated model:

def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [10, 25, 90]
    conv_sizes = []

    tf.summary.image('input', features, batch_size)

    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                        filters=conv_filters[0],
                        kernel_size=[2, 2],
                        bias_constant = bias_constant,
                        name = "conv1")

    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                        filters = conv_filters[1],
                        kernel_size = [2, 2],
                        bias_constant = bias_constant,
                        name = "conv2")

    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                            filters = conv_filters[0],
                            kernel_size = [2, 2],
                            bias_constant = bias_constant,
                            output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                            name = "deconv2")
    print(deconv.shape)
    deconv = deconv_layer(inputs = deconv,
                            filters = 3,
                            kernel_size = [2, 2],
                            output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                            bias_constant = bias_constant,
                            name = "deconv1")

    print(deconv.shape)
    return tf.cast(deconv, dtype = tf.float16)

Solution

  • I suspect that the problem is that you have significantly overfit; the key evidence is:

    [01/1000] trainLoss: 0.0000 trainAcc: 1.00
    

    This says that after only one epoch you have fit the training data perfectly; a sure sign of overfitting. The resulting NaN is then an unsurprising consequence, since you have almost certainly learned weights that return 0 or inf on data or batches the model hasn't seen (because it is so badly overfit).

    To resolve this issue, I recommend simplifying your model substantially until you get something that doesn't overfit so quickly; for example, fewer and smaller conv and deconv layers. Then you can start to build that complexity back in. You will likely also want to add some dropout and/or batch normalization to deal with the overfitting (a sketch of a conv block with dropout follows this answer). Note: while it is tempting to just start adding this complexity to your existing model, I recommend against it; get something simple working first, then add complexity from there.

    Final note: if you simplify the problem as suggested above, you will likely end up with a better minimal example to share; that should let us get to the bottom of your problem more quickly.
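
    A minimal sketch of a conv block with dropout in TF 1.x, as suggested above; the kernel size, dropout rate, and names are illustrative assumptions, not values from the question:

    def conv_block(inputs, filters, training, name = "conv_block"):
        with tf.variable_scope(name):
            # tf.layers creates the kernel and bias variables with sensible
            # default initializers.
            conv = tf.layers.conv2d(inputs, filters = filters, kernel_size = 3,
                                    padding = "same", activation = tf.nn.relu)
            # Dropout is only active when training is True; at evaluation
            # time it is a no-op.
            return tf.layers.dropout(conv, rate = 0.5, training = training)

    # Usage, e.g.:
    # is_training = tf.placeholder(tf.bool)
    # conv1 = conv_block(image_batch, 20, training = is_training, name = "conv1")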