I have the following code to do some simple arithmetic calculations in distributed TensorFlow. A minimal reproducible example:
import tensorflow as tf
global_step_tensor = tf.Variable(10, trainable=False, name='global_step')
cluster = tf.train.ClusterSpec({"local": ["localhost:2222", "localhost:2223","localhost:2224", "localhost:2225"]})
x = tf.constant(2)
with tf.device("/job:local/task:0"):
y = x + 300
model = tf.global_variables_initializer()
saver = tf.train.Saver([y])
ChiefSessionCreator = tf.train.ChiefSessionCreator(scaffold=None, master='grpc://localhost:2222', config=None, checkpoint_dir='/home/chaitanya/tensorflow/codes/checkpoints')
saver_hook = tf.train.CheckpointSaverHook(checkpoint_dir='/home/chaitanya/tensorflow/codes/checkpoints', save_secs=10, save_steps=None, saver=y, checkpoint_basename='model.ckpt', scaffold=None)
summary_hook = tf.train.SummarySaverHook(save_steps=None, save_secs=10, output_dir='/home/chaitanya/tensorflow/codes/savepoints', summary_writer=None, scaffold=None, summary_op=y)
with tf.train.MonitoredTrainingSession(master='grpc://localhost:2222', is_chief=True, checkpoint_dir='/home/chaitanya/tensorflow/codes/checkpoints',
scaffold=None, hooks=[saver_hook, summary_hook], chief_only_hooks=None, save_checkpoint_secs=10, save_summaries_steps=None, config=None) as sess:
while not sess.should_stop():
sess.run(model)
while not sess.should_stop():
result = sess.run(y)
print(result)
This is the error I get:
Traceback (most recent call last):
File "add_1.py", line 13, in <module>
saver = tf.train.Saver([y])
raise TypeError("Variable to save is not a Variable: %s" % var)
TypeError: Variable to save is not a Variable: Tensor("add_3:0", shape=(), dtype=int32, device=/job:local/task:3)
Please help me figure out the correct way to use this function.
When you simply write `x + 300`, you are not creating a `tf.Variable` — the result is a plain `Tensor`, and `tf.train.Saver` can only save variables. You need to explicitly use `tf.get_variable()` or `tf.Variable()` to create a variable which can be saved:

y = tf.Variable(x + 300)

Note also that the `saver=` argument of `tf.train.CheckpointSaverHook` must be a `tf.train.Saver` instance (not a tensor), and `SummarySaverHook`'s `summary_op` must be a summary op such as `tf.summary.scalar('y', y)`.