Tags: tensorflow, google-colaboratory, tensorflow2.0, tpu

How to clear Colab Tensorflow TPU memory


I am training a model across several cross-validation folds. After each fold I want to clear the TPU memory so that I don't get an OOM error.
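
The per-fold loop looks roughly like this (condensed; config, build_model, count_data_items, and files_train are defined earlier in the notebook, as the trace below shows):

    import tensorflow as tf

    for fold in range(n_folds):
        # Clears host-side Keras state, but does not free TPU memory.
        tf.keras.backend.clear_session()
        with config.strategy.scope():
            model = build_model(config.img_size,
                                count=count_data_items(files_train) / config.batch_size)
        # ... train on this fold; a later fold eventually fails with ResourceExhaustedError ...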

Full trace of the current error:

ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-16-b7e0725f5c4d> in <module>()
      1 tf.keras.backend.clear_session()
      2 with config.strategy.scope():
----> 3   model = build_model(config.img_size, count = count_data_items(files_train)/config.batch_size)

25 frames
<ipython-input-9-5b219db28f69> in build_model(size, count)
      1 def build_model(size, count=820):
      2 
----> 3     base_model = efn.EfficientNetB7(input_shape=(size,size,3),weights='imagenet',include_top=False)
      4 
      5     model = tf.keras.Sequential([

/usr/local/lib/python3.7/dist-packages/efficientnet/__init__.py in wrapper(*args, **kwargs)
     55         kwargs['models'] = tfkeras.models
     56         kwargs['utils'] = tfkeras.utils
---> 57         return func(*args, **kwargs)
     58 
     59     return wrapper

/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNetB7(include_top, weights, input_tensor, input_shape, pooling, classes, **kwargs)
    604         input_tensor=input_tensor, input_shape=input_shape,
    605         pooling=pooling, classes=classes,
--> 606         **kwargs
    607     )
    608 

/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNet(width_coefficient, depth_coefficient, default_resolution, dropout_rate, drop_connect_rate, depth_divisor, blocks_args, model_name, include_top, weights, input_tensor, input_shape, pooling, classes, **kwargs)
    348                       use_bias=False,
    349                       kernel_initializer=CONV_KERNEL_INITIALIZER,
--> 350                       name='stem_conv')(x)
    351     x = layers.BatchNormalization(axis=bn_axis, name='stem_bn')(x)
    352     x = layers.Activation(activation, name='stem_activation')(x)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    968     if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
    969       return self._functional_construction_call(inputs, args, kwargs,
--> 970                                                 input_list)
    971 
    972     # Maintains info about the `Layer.call` stack.

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
   1106       # Check input assumptions set after layer building, e.g. input shape.
   1107       outputs = self._keras_tensor_symbolic_call(
-> 1108           inputs, input_masks, args, kwargs)
   1109 
   1110       if outputs is None:

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs)
    838       return nest.map_structure(keras_tensor.KerasTensor, output_signature)
    839     else:
--> 840       return self._infer_output_signature(inputs, args, kwargs, input_masks)
    841 
    842   def _infer_output_signature(self, inputs, args, kwargs, input_masks):

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _infer_output_signature(self, inputs, args, kwargs, input_masks)
    876           # overridden).
    877           # TODO(kaftan): do we maybe_build here, or have we already done it?
--> 878           self._maybe_build(inputs)
    879           inputs = self._maybe_cast_inputs(inputs)
    880           outputs = call_fn(inputs, *args, **kwargs)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _maybe_build(self, inputs)
   2623         # operations.
   2624         with tf_utils.maybe_init_scope(self):
-> 2625           self.build(input_shapes)  # pylint:disable=not-callable
   2626       # We must set also ensure that the layer is marked as built, and the build
   2627       # shape is stored since user defined build functions may not be calling

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/convolutional.py in build(self, input_shape)
    202         constraint=self.kernel_constraint,
    203         trainable=True,
--> 204         dtype=self.dtype)
    205     if self.use_bias:
    206       self.bias = self.add_weight(

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in add_weight(self, name, shape, dtype, initializer, regularizer, trainable, constraint, use_resource, synchronization, aggregation, **kwargs)
    653         synchronization=synchronization,
    654         aggregation=aggregation,
--> 655         caching_device=caching_device)
    656     if regularizer is not None:
    657       # TODO(fchollet): in the future, this should be handled at the

/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/tracking/base.py in _add_variable_with_custom_getter(self, name, shape, dtype, initializer, getter, overwrite, **kwargs_for_getter)
    813         dtype=dtype,
    814         initializer=initializer,
--> 815         **kwargs_for_getter)
    816 
    817     # If we set an initializer and the variable processed it, tracking will not

/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer_utils.py in make_variable(name, shape, dtype, initializer, trainable, caching_device, validate_shape, constraint, use_resource, collections, synchronization, aggregation, partitioner)
    137       synchronization=synchronization,
    138       aggregation=aggregation,
--> 139       shape=variable_shape if variable_shape else None)
    140 
    141 

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
    258   def __call__(cls, *args, **kwargs):
    259     if cls is VariableV1:
--> 260       return cls._variable_v1_call(*args, **kwargs)
    261     elif cls is Variable:
    262       return cls._variable_v2_call(*args, **kwargs)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in _variable_v1_call(cls, initial_value, trainable, collections, validate_shape, caching_device, name, variable_def, dtype, expected_shape, import_scope, constraint, use_resource, synchronization, aggregation, shape)
    219         synchronization=synchronization,
    220         aggregation=aggregation,
--> 221         shape=shape)
    222 
    223   def _variable_v2_call(cls,

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in getter(**kwargs)
     65 
     66   def getter(**kwargs):
---> 67     return captured_getter(captured_previous, **kwargs)
     68 
     69   return getter

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in creator_with_resource_vars(next_creator, **kwargs)
   2109         checkpoint_restore_uid = None
   2110 
-> 2111       created = self._create_variable(next_creator, **kwargs)
   2112 
   2113       if checkpoint_restore_uid is not None:

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _create_variable(self, next_creator, **kwargs)
   1167         self._container_strategy(), _real_mirrored_creator,
   1168         distribute_utils.TPU_VARIABLE_CLASS_MAPPING,
-> 1169         distribute_utils.TPU_VARIABLE_POLICY_MAPPING, **kwargs)
   1170 
   1171   def _gather_to_implementation(self, value, destinations, axis, options):

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_utils.py in create_mirrored_variable(strategy, real_mirrored_creator, class_mapping, policy_mapping, **kwargs)
    304   # here.
    305   with tape.stop_recording():
--> 306     value_list = real_mirrored_creator(**kwargs)
    307     # MirroredVariable is recreated during saved_model loading, and its
    308     # component variables (value_list) will have None initializer. We

/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _real_mirrored_creator(**kwargs)
   1158 
   1159           with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
-> 1160             v = next_creator(**kwargs)
   1161 
   1162           assert not isinstance(v, tpu_values.TPUMirroredVariable)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in <lambda>(**kwargs)
    197                         shape=None):
    198     """Call on Variable class. Useful to force the signature."""
--> 199     previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
    200     for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
    201       previous_getter = _make_getter(getter, previous_getter)

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variable_scope.py in default_variable_creator(next_creator, **kwargs)
   2624         synchronization=synchronization,
   2625         aggregation=aggregation,
-> 2626         shape=shape)
   2627   else:
   2628     return variables.RefVariable(

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
    262       return cls._variable_v2_call(*args, **kwargs)
    263     else:
--> 264       return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    265 
    266 

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in __init__(self, initial_value, trainable, collections, validate_shape, caching_device, name, dtype, variable_def, import_scope, constraint, distribute_strategy, synchronization, aggregation, shape)
   1593           aggregation=aggregation,
   1594           shape=shape,
-> 1595           distribute_strategy=distribute_strategy)
   1596 
   1597   def _init_from_args(self,

/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in _init_from_args(self, initial_value, trainable, collections, caching_device, name, dtype, constraint, synchronization, aggregation, distribute_strategy, shape)
   1729                                                   dtype=dtype)
   1730           if shape is not None:
-> 1731             if not initial_value.shape.is_compatible_with(shape):
   1732               raise ValueError(
   1733                   "The initial value's shape (%s) is not compatible with "

/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in shape(self)
   1196         # `_tensor_shape` is declared and defined in the definition of
   1197         # `EagerTensor`, in C.
-> 1198         self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
   1199       except core._NotOkStatusException as e:
   1200         six.raise_from(core._status_to_exception(e.code, e.message), None)

ResourceExhaustedError: Failed to allocate request for 18.0KiB (18432B) on device ordinal 0

Solution

  • I use tf.tpu.experimental.initialize_tpu_system(hw_accelerator_handle) when I perform hyperparameter tuning on TPU and want to release memory between two training sessions. It resets the TPU while maintaining the connection to it. In my use case I start training from scratch each time; since you also rebuild your model after each fold, it should work for your use case as well.

    hw_accelerator_handle is the object returned by tf.distribute.cluster_resolver.TPUClusterResolver(), as shown in the sketch below.
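
A minimal sketch of how this can fit into your fold loop (build_model, config, count_data_items, and files_train are from your question; n_folds is a placeholder):

    import tensorflow as tf

    # One-time setup: connect to the Colab TPU.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)

    n_folds = 5  # placeholder fold count

    for fold in range(n_folds):
        # Reset the TPU between folds: this releases the device memory held
        # by the previous fold while keeping the connection alive.
        tf.tpu.experimental.initialize_tpu_system(resolver)

        # Recreate the strategy after the reset; variables created under the
        # old strategy no longer exist on the device.
        # On TF < 2.3, use tf.distribute.experimental.TPUStrategy instead.
        strategy = tf.distribute.TPUStrategy(resolver)

        with strategy.scope():
            model = build_model(config.img_size,
                                count=count_data_items(files_train) / config.batch_size)
        # ... train and evaluate this fold ...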