Search code examples

training xception model keras - batch size 32 gives error but it works for batch size = 16

training xception model keras - batch size 32 gives error but it works for batch size = 16
Below are the details of the error log. Can you please help me? I am guessing that the following message is the crux of the error, but I am not able to figure it out: OOM when allocating tensor with shape[728,728,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc

   ResourceExhaustedError                    Traceback (most recent call last)
    Cell In[34], line 7
          2 model_save =  ModelCheckpoint('/kaggle/working/model_weights.keras' , monitor = 'val_loss', save_best_only = True, mode = 'min')
          3 reduce_lr =  ReduceLROnPlateau(monitor='val_loss', factor=0.1,
          4                               patience=4, min_lr=0.0001)
    ----> 7 history =, steps_per_epoch= steps_per_epoch, validation_data=val_it,
          8              validation_steps=validation_steps, epochs = epochs, callbacks=[early_stopping, model_save, reduce_lr] )
    File /opt/conda/lib/python3.10/site-packages/keras/utils/, in filter_traceback.<locals>.error_handler(*args, **kwargs)
         67     filtered_tb = _process_traceback_frames(e.__traceback__)
         68     # To get the full stack trace, call:
         69     # `tf.debugging.disable_traceback_filtering()`
    ---> 70     raise e.with_traceback(filtered_tb) from None
         71 finally:
         72     del filtered_tb
    File /opt/conda/lib/python3.10/site-packages/tensorflow/python/eager/, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
         50 try:
         51   ctx.ensure_initialized()
    ---> 52   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
         53                                       inputs, attrs, num_outputs)
         54 except core._NotOkStatusException as e:
         55   if name is not None:
    ResourceExhaustedError: Graph execution error:
    Detected at node 'model_1/block6_sepconv2/separable_conv2d' defined at (most recent call last):
        File "/opt/conda/lib/python3.10/", line 196, in _run_module_as_main
          return _run_code(code, main_globals, None,
        File "/opt/conda/lib/python3.10/", line 86, in _run_code
          exec(code, run_globals)
        File "/opt/conda/lib/python3.10/site-packages/", line 17, in <module>
        File "/opt/conda/lib/python3.10/site-packages/traitlets/config/", line 1043, in launch_instance
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 728, in start
        File "/opt/conda/lib/python3.10/site-packages/tornado/platform/", line 195, in start
        File "/opt/conda/lib/python3.10/asyncio/", line 603, in run_forever
        File "/opt/conda/lib/python3.10/asyncio/", line 1909, in _run_once
        File "/opt/conda/lib/python3.10/asyncio/", line 80, in _run
          self._context.run(self._callback, *self._args)
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 513, in dispatch_queue
          await self.process_one()
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 502, in process_one
          await dispatch(*args)
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 409, in dispatch_shell
          await result
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 729, in execute_request
          reply_content = await reply_content
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 422, in do_execute
          res = shell.run_cell(
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/", line 540, in run_cell
          return super().run_cell(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 3009, in run_cell
          result = self._run_cell(
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 3064, in _run_cell
          result = runner(coro)
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 129, in _pseudo_sync_runner
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 3269, in run_cell_async
          has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 3448, in run_ast_nodes
          if await self.run_code(code, result, async_=asy):
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/", line 3508, in run_code
          exec(code_obj, self.user_global_ns, self.user_ns)
        File "/tmp/ipykernel_33/", line 7, in <module>
          history =, steps_per_epoch= steps_per_epoch, validation_data=val_it,
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1685, in fit
          tmp_logs = self.train_function(iterator)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1284, in train_function
          return step_function(self, iterator)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1268, in step_function
          outputs = model.distribute_strategy.run(run_step, args=(data,))
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1249, in run_step
          outputs = model.train_step(data)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1050, in train_step
          y_pred = self(x, training=True)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 558, in __call__
          return super().__call__(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1145, in __call__
          outputs = call_fn(inputs, *args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 96, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 512, in call
          return self._run_internal_graph(inputs, training=training, mask=mask)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 669, in _run_internal_graph
          outputs = node.layer(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/", line 1145, in __call__
          outputs = call_fn(inputs, *args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/", line 96, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/layers/convolutional/", line 188, in call
          outputs = tf.compat.v1.nn.separable_conv2d(
    Node: 'model_1/block6_sepconv2/separable_conv2d'
    OOM when allocating tensor with shape[728,728,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
         [[{{node model_1/block6_sepconv2/separable_conv2d}}]]
    Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.


  • The GPU ran out of memory (OOM) at batch size 32, so there is nothing to do but reduce the batch size (unless you want to decrease the size of the images or use a smaller architecture).

    The batch size does not necessarily have to be a power of 2. You can try batch_size = 24, or gradually increase it from 16 until you reach the limit and fully utilize the GPU memory.