I'm following the TensorFlow Federated tutorial custom_federated_algorithms_2. Everything works when I just copy and run the tutorial's code, so I wanted to modify the code myself to become more familiar with TFF. Then a bug appeared.
My runtime environment:
python: 3.8.12
tensorflow: 2.5.0
tensorflow_federated: 0.19.0
The code below is the original code for the test model in the tutorial:
# Parameter layout of the tutorial's model: one weight matrix and one bias
# vector, listed in the order they appear in the resulting TFF type.
MODEL_SPEC = collections.OrderedDict([
    ("weights", tf.TensorSpec(shape=[784, 10], dtype=tf.float32)),
    ("bias", tf.TensorSpec(shape=[10], dtype=tf.float32)),
])
MODEL_TYPE = tff.to_type(MODEL_SPEC)
print(MODEL_TYPE)  # <weights=float32[784,10],bias=float32[10]>
# Layout of one batch: features `x` and integer labels `y`. The leading
# dimension is None, so the batch size is left unspecified.
BATCH_SPEC = collections.OrderedDict([
    ("x", tf.TensorSpec(shape=[None, 784], dtype=tf.float32)),
    ("y", tf.TensorSpec(shape=[None], dtype=tf.int32)),
])
BATCH_TYPE = tff.to_type(BATCH_SPEC)
print(BATCH_TYPE)  # <x=float32[?,784],y=int32[?]>
Then I changed MODEL_TYPE
to:
# Parameter layout of the modified model: three weight/bias pairs
# (784 -> 256 -> 128 -> 10), listed in the order they appear in the TFF type.
MODEL_SPEC = collections.OrderedDict([
    ("fc1", tf.TensorSpec(shape=[784, 256], dtype=tf.float32)),
    ("b1", tf.TensorSpec(shape=[256], dtype=tf.float32)),
    ("fc2", tf.TensorSpec(shape=[256, 128], dtype=tf.float32)),
    ("b2", tf.TensorSpec(shape=[128], dtype=tf.float32)),
    ("fc3", tf.TensorSpec(shape=[128, 10], dtype=tf.float32)),
    ("b3", tf.TensorSpec(shape=[10], dtype=tf.float32)),
])
MODEL_TYPE = tff.to_type(MODEL_SPEC)
Because the structure of the model changed, the forward pass needs to change too:
# original
@tf.function
def forward_pass(model, batch):
    """Return the mean cross-entropy loss of the single-layer model on `batch`.

    Args:
        model: Mapping with a 'weights' matrix and a 'bias' vector.
        batch: Mapping with features under 'x' and integer labels under 'y'.

    Returns:
        A scalar tensor: the negated mean, over the batch, of the
        log-probability assigned to each example's label.
    """
    scores = tf.matmul(batch['x'], model['weights']) + model['bias']
    predicted_y = tf.nn.softmax(scores)
    labels_one_hot = tf.one_hot(batch['y'], 10)
    # Multiplying by the one-hot labels keeps only the log-probability of
    # each example's own class before summing over the class axis.
    per_example = tf.reduce_sum(
        labels_one_hot * tf.math.log(predicted_y), axis=[1])
    return -tf.reduce_mean(per_example)
@tff.tf_computation(MODEL_TYPE, BATCH_TYPE)
def batch_loss(model, batch):
    """Expose `forward_pass` as a TFF TensorFlow computation.

    Args:
        model: Model parameters, typed as MODEL_TYPE.
        batch: One batch of examples, typed as BATCH_TYPE.

    Returns:
        The loss computed by `forward_pass` for `model` on `batch`.
    """
    loss = forward_pass(model, batch)
    return loss
# new
@tf.function
def forward(model, batch):
    """Return the mean cross-entropy loss of the three-layer model on `batch`.

    Args:
        model: Mapping with the keys 'fc1', 'b1', 'fc2', 'b2', 'fc3', 'b3'.
        batch: Mapping with features under 'x' and integer labels under 'y'.

    Returns:
        A scalar tensor: the negated mean, over the batch, of the
        log-probability assigned to each example's label.
    """
    # NOTE(review): the three layers are chained with no activation between
    # them, so the stack is equivalent to a single affine map -- confirm that
    # this is intended.
    hidden = batch["x"] @ model["fc1"] + model["b1"]
    hidden = hidden @ model["fc2"] + model["b2"]
    logits = hidden @ model["fc3"] + model["b3"]
    # log_softmax computes log(softmax(logits)) in one numerically stable
    # step. Taking tf.math.log of a softmax output gives -inf whenever a
    # probability underflows to 0, and the product with the one-hot mask
    # (0 * -inf) then turns the loss into NaN.
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    one_hot_y = tf.one_hot(batch["y"], depth=10)
    return -tf.reduce_mean(tf.reduce_sum(log_probs * one_hot_y, axis=[1]))
@tff.tf_computation(MODEL_TYPE, BATCH_TYPE)
def batch_loss(model, batch):
    """Expose `forward` as a TFF TensorFlow computation.

    Args:
        model: Model parameters, typed as MODEL_TYPE.
        batch: One batch of examples, typed as BATCH_TYPE.

    Returns:
        The loss computed by `forward` for `model` on `batch`.
    """
    loss = forward(model, batch)
    return loss
I didn't change the batch_train()
code.
@tff.tf_computation(MODEL_TYPE, BATCH_TYPE, tf.float32)
def batch_train(initial_model, batch, learning_rate):
    """Apply one SGD step computed on `batch` and return the updated model.

    Args:
        initial_model: Mapping of parameter name to value, typed as MODEL_TYPE.
        batch: One batch of examples, typed as BATCH_TYPE.
        learning_rate: float32 learning rate passed to the SGD optimizer.

    Returns:
        An OrderedDict of `tf.Variable`s keyed like `initial_model`, holding
        the parameters after the gradient step.
    """
    # Define a group of model variables and set them to `initial_model`. Must
    # be defined outside the @tf.function.
    model_vars = collections.OrderedDict([
        (name, tf.Variable(name=name, initial_value=value))
        for name, value in initial_model.items()
    ])
    optimizer = tf.keras.optimizers.SGD(learning_rate)

    @tf.function
    def _train_on_batch(model_vars, batch):
        # Perform one step of gradient descent. The loss has to come from
        # `forward`, which reads the fc1/b1/fc2/b2/fc3/b3 parameters of the
        # current MODEL_TYPE; the tutorial's `forward_pass` indexes
        # 'weights' and 'bias', which this model does not contain.
        with tf.GradientTape() as tape:
            loss = forward(model_vars, batch)
        grads = tape.gradient(loss, model_vars)
        # Flatten both structures so each gradient is paired with its variable.
        optimizer.apply_gradients(
            zip(tf.nest.flatten(grads), tf.nest.flatten(model_vars)))
        return model_vars

    return _train_on_batch(model_vars, batch)
It works fine so far. But when implementing the local_train()
section, an error appeared even though I was just using the original code.
# Start every parameter of the three-layer model at zero; the keys and
# shapes mirror MODEL_SPEC.
initial_model = collections.OrderedDict([
    ("fc1", tf.zeros([784, 256])),
    ("b1", tf.zeros([256])),
    ("fc2", tf.zeros([256, 128])),
    ("b2", tf.zeros([128])),
    ("fc3", tf.zeros([128, 10])),
    ("b3", tf.zeros([10])),
])

# A client's local data is a sequence of batches.
LOCAL_DATA_TYPE = tff.SequenceType(BATCH_TYPE)
@tff.federated_computation(MODEL_TYPE, tf.float32, LOCAL_DATA_TYPE)
def local_train(initial_model, learning_rate, all_batches):
    """Fold `batch_train` over `all_batches`, starting from `initial_model`.

    Args:
        initial_model: Starting model parameters, typed as MODEL_TYPE.
        learning_rate: float32 learning rate used for every batch.
        all_batches: Sequence of batches, typed as LOCAL_DATA_TYPE.

    Returns:
        The result of `tff.sequence_reduce` over the batches, i.e. the model
        after `batch_train` has been applied for each batch in turn.
    """

    @tff.tf_computation(LOCAL_DATA_TYPE, tf.float32)
    def _insert_learning_rate_to_sequence(dataset, learning_rate):
        # Pair each batch with the learning rate so that the reduction
        # operator below receives both in a single sequence element.
        def _attach_rate(batch):
            return (batch, learning_rate)

        return dataset.map(_attach_rate)

    lr_tagged_batches = _insert_learning_rate_to_sequence(
        all_batches, learning_rate)
    element_type = lr_tagged_batches.type_signature.element

    # Mapping function to apply to each batch.
    @tff.federated_computation(MODEL_TYPE, element_type)
    def batch_fn(model, batch_with_lr):
        one_batch, step_size = batch_with_lr
        return batch_train(model, one_batch, step_size)

    return tff.sequence_reduce(lr_tagged_batches, initial_model, batch_fn)
# Train on the entry at index 5 of `mnist_train_dataset` with a learning
# rate of 0.1, starting from the zero-initialized model.
locally_trained_model = local_train(
    initial_model,
    0.1,
    mnist_train_dataset[5],
)
# ValueError: Unable to unpack value [] as a tf.compat.v1.GraphDef
Finally, I found that I had made a basic mistake. 🤦‍♂️ I was coding in my own Jupyter notebook but forgot to add the following key code from the beginning of the tutorial:
# Build a local executor factory with sequence operations enabled.
executor_factory = tff.framework.local_executor_factory(support_sequence_ops=True)

# Wrap the factory in an execution context and install it as the default
# context for subsequent TFF computations.
execution_context = tff.framework.ExecutionContext(executor_fn=executor_factory)
tff.framework.set_default_context(execution_context)