Tags: google-colaboratory, google-cloud-tpu, tpu

Colab TPU: TensorFlow '2.0.0-beta0' LinearClassifier .train Bug


I am attempting to get LinearClassifier running on a Colab TPU: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/estimator/LinearClassifier

TPUStrategy is listed as supported for Estimators (including LinearClassifier) in TensorFlow 2.0: https://www.tensorflow.org/beta/guide/distribute_strategy#whats_supported_now_2

LinearClassifier works as expected without the tpu_strategy: https://www.tensorflow.org/beta/guide/distribute_strategy#tpustrategy

When I add the tpu_strategy to the LinearClassifier's RunConfig, I get the following error:

InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}} with these attrs: [T=DT_DOUBLE, N=8] Registered devices: [CPU, XLA_CPU] Registered kernels: [[input0]]

The setup follows https://www.tensorflow.org/beta/guide/distribute_strategy#using_tfdistributestrategy_with_estimator

I have been fighting this for a few days now; what is wrong here?

!pip install tensorflow==2.0.0-beta0
import tensorflow.feature_column as fc
import tensorflow as tf
import os

print(tf.__version__)

TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER)
tf.config.experimental_connect_to_host(cluster_resolver.master())
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
config = tf.estimator.RunConfig(train_distribute=tpu_strategy, eval_distribute=tpu_strategy)

batch_size = 1
def make_input_fn(X, y):
  def input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((dict(X), y)).batch(batch_size)
    return dataset
  return input_fn

# estimator_train_attributes_dictionary, labels_train, and attibute_columns
# (the training feature dict, labels, and feature columns) are defined earlier in the notebook.
input_fn = make_input_fn(estimator_train_attributes_dictionary, labels_train)
linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns, config=config)
linear_est.train(input_fn=input_fn)
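
As a sanity check (my addition, not part of the original report), it can help to ask the TPU gRPC endpoint which devices it actually exposes, since the error below complains that only CPU and XLA_CPU devices are registered. A minimal sketch, reusing the TPU_WORKER address from above and the compat.v1 Session API that is still available in 2.0.0-beta0:

# Sanity check: list the devices reported by the TPU worker. A reachable,
# version-matched TPU runtime should report TPU:0 through TPU:7.
with tf.compat.v1.Session(TPU_WORKER) as sess:
    for device in sess.list_devices():
        print(device.name)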

Full exception in Colab:

W0618 18:08:10.280844 140506166175616 estimator.py:1811] Using temporary folder as model directory: /tmp/tmp2xc1fixj
2.0.0-beta0
W0618 18:09:00.986362 140506166175616 tpu.py:218] 3 unsupported operations found: 
  ScalarSummary (bias)
  ScalarSummary (fraction_of_zero_weights)
  ScalarSummary (loss)
W0618 18:09:43.578035 140506166175616 tpu_strategy_util.py:57] TPU system %s has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1355     try:
-> 1356       return fn(*args)
   1357     except errors.OpError as e:

20 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1338       # Ensure any changes to the graph are reflected in the runtime.
-> 1339       self._extend_graph()
   1340       return self._call_tf_sessionrun(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _extend_graph(self)
   1373     with self._graph._session_run_lock():  # pylint: disable=protected-access
-> 1374       tf_session.ExtendSession(self._session)
   1375 

InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}}with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

     [[input0]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-23-66caf93d8677> in <module>()
     25 
     26 linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)#feature_columns=featureNames,,config=config
---> 27 linear_est.train(input_fn=input_fn)#,max_steps=100
     28 
     29 #train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1000)

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    365 
    366       saving_listeners = _check_listeners_type(saving_listeners)
--> 367       loss = self._train_model(input_fn, hooks, saving_listeners)
    368       logging.info('Loss for final step: %s.', loss)
    369       return self

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
   1154   def _train_model(self, input_fn, hooks, saving_listeners):
   1155     if self._train_distribution:
-> 1156       return self._train_model_distributed(input_fn, hooks, saving_listeners)
   1157     else:
   1158       return self._train_model_default(input_fn, hooks, saving_listeners)

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
   1217       self._config._train_distribute.configure(self._config.session_config)
   1218       return self._actual_train_model_distributed(
-> 1219           self._config._train_distribute, input_fn, hooks, saving_listeners)
   1220     # pylint: enable=protected-access
   1221 

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _actual_train_model_distributed(self, strategy, input_fn, hooks, saving_listeners)
   1327         return self._train_with_estimator_spec(estimator_spec, worker_hooks,
   1328                                                hooks, global_step_tensor,
-> 1329                                                saving_listeners)
   1330 
   1331   def _train_with_estimator_spec_distributed(self, estimator_spec, worker_hooks,

/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
   1478         save_summaries_steps=save_summary_steps,
   1479         config=self._session_config,
-> 1480         log_step_count_steps=log_step_count_steps) as mon_sess:
   1481       loss = None
   1482       any_step_done = False

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in MonitoredTrainingSession(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps, max_wait_secs, save_checkpoint_steps, summary_dir)
    582       session_creator=session_creator,
    583       hooks=all_hooks,
--> 584       stop_grace_period_secs=stop_grace_period_secs)
    585 
    586 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, stop_grace_period_secs)
   1005         hooks,
   1006         should_recover=True,
-> 1007         stop_grace_period_secs=stop_grace_period_secs)
   1008 
   1009 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, should_recover, stop_grace_period_secs)
    723         stop_grace_period_secs=stop_grace_period_secs)
    724     if should_recover:
--> 725       self._sess = _RecoverableSession(self._coordinated_creator)
    726     else:
    727       self._sess = self._coordinated_creator.create_session()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, sess_creator)
   1198     """
   1199     self._sess_creator = sess_creator
-> 1200     _WrappedSession.__init__(self, self._create_session())
   1201 
   1202   def _create_session(self):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
   1203     while True:
   1204       try:
-> 1205         return self._sess_creator.create_session()
   1206       except _PREEMPTION_ERRORS as e:
   1207         logging.info(

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
    869       """Creates a coordinated session."""
    870       # Keep the tf_sess for unit testing.
--> 871       self.tf_sess = self._session_creator.create_session()
    872       # We don't want coordinator to suppress any exception.
    873       self.coord = coordinator.Coordinator(clean_stop_exception_types=[])

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
    645         init_op=self._scaffold.init_op,
    646         init_feed_dict=self._scaffold.init_feed_dict,
--> 647         init_fn=self._scaffold.init_fn)
    648 
    649 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
    294                            "init_fn or local_init_op was given")
    295       if init_op is not None:
--> 296         sess.run(init_op, feed_dict=init_feed_dict)
    297       if init_fn:
    298         init_fn(sess)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    948     try:
    949       result = self._run(None, fetches, feed_dict, options_ptr,
--> 950                          run_metadata_ptr)
    951       if run_metadata:
    952         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1171     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1172       results = self._do_run(handle, final_targets, final_fetches,
-> 1173                              feed_dict_tensor, options, run_metadata)
   1174     else:
   1175       results = []

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1348     if handle is None:
   1349       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1350                            run_metadata)
   1351     else:
   1352       return self._do_call(_prun_fn, handle, feeds, fetches)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1368           pass
   1369       message = error_interpolation.interpolate(message, self._graph)
-> 1370       raise type(e)(node_def, op, message)
   1371 
   1372   def _extend_graph(self):


InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by node input0 (defined at <ipython-input-23-66caf93d8677>:27) with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  <no registered kernels>

     [[input0]]

Solution

  • Installing TF 2.0 in Colab is supported (via the !pip command); however, the TPU workers themselves do not get the pip-installed TF version and keep running the version Colab ships by default. If you want to work with TPUs in Colab, use the TF version that comes by default (see the sketch below).
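
A minimal way to follow that advice, sketched under the assumption that you are on the stock Colab TF 1.x runtime (i.e. without running the !pip install cell first), is to confirm that the notebook and the TPU workers are on the same version and that the TPU devices are visible, before re-running the LinearClassifier setup from the question:

import os
import tensorflow as tf

# Sketch assuming the default Colab TF runtime (no pip-installed 2.0.0-beta0).
print('Notebook TF version:', tf.__version__)

# Ask the TPU worker which devices it exposes; with a matching runtime it
# should report TPU:0 through TPU:7 alongside its CPU devices.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
with tf.Session(tpu_address) as session:
    for device in session.list_devices():
        print(device.name)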