I added a print to the "discriminator_loss" function to see what was going on. at first it will tell me the shape of both are 16. later it tells me the shape of "real_loss" is only 15 while the other stays 16. So far I have only tried lowering the batchsize's and increasing them by 1 ect. I have provided the most relevant parts of the code. I can provide the rest of the code if needed. I have no clue why this is happening and it breaks the code.
with strategy.scope():
BATCH_SIZE = 16
GLOBAL_BATCH_SIZE = 32#batchsize*# of gpus
im_size = 256
latent_size = 512
with strategy.scope():
cross_entropy = tf.keras.losses.BinaryCrossentropy(
from_logits=True,\
reduction = tf.keras.losses.Reduction.NONE)
#this is used to evaluate discriminators ability to discriminate
def discriminator_loss(real_output, fake_output):
real_loss = cross_entropy(tf.ones_like(real_output), real_output)#compares prediction to actual value of 1
fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)#compares rediction to actual value of 0
print(real_loss)
print(fake_loss)
total_loss = real_loss + fake_loss
total_loss = total_loss/GLOBAL_BATCH_SIZE
return total_loss
#how well was generator able to trick discriminator
def generator_loss(fake_output):
gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)#compares predictions to the expected value 1 of a real image
gen_loss = gen_loss / GLOBAL_BATCH_SIZE
return gen_loss
with strategy.scope():
EPOCHS = 80
noise_dim = 512
num_examples_to_generate = 32
# We will reuse this seed overtime (so it's easier)
# to visualize progress in the animated GIF)
with strategy.scope():
def noise(n):
return tf.random.normal([n, latent_size])
def noiseImage(n):
return tf.random.uniform([n, im_size, im_size, 1])
#seed = tf.random.normal([num_examples_to_generate, noise_dim])
#seed used to generate image>the discriminator than classifies real images from training set and a set of generated images>loss is calculated and gradients are used to update the model
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
with strategy.scope():
#@tf.function
def train_step(images):
with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
real_output = discriminator(images, training=True)
fake_output = discriminator(generated_images, training=True)
g_loss = generator_loss(fake_output)#runs generator loss
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
D_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
discriminator_optimizer.apply_gradients(zip(D_grads, discriminator.trainable_variables))
#run g_optim twice to make sure d_loss doesn't go to zero
with tf.GradientTape() as gen_tape:
generated_imgs = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
fake_output = discriminator(generated_imgs, training=True)
g_loss = generator_loss(fake_output)
G_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
generator_optimizer.apply_gradients(zip(G_grads, generator.trainable_variables))
return g_loss, d_loss
@tf.function
def distributed_train_step(dist_dataset):
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
total_g_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_g_losses,axis=0)
total_d_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_d_losses,axis=0)
return total_g_loss, total_d_loss
with strategy.scope():
def train(dist_dataset, epochs):
for epoch in range(epochs):
start = time.time()
for image_batch in dist_dataset:
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
with strategy.scope():
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
error and traceback
Traceback (most recent call last):
File "C:\image generator\pixiv\#image generator.py", line 507, in <module>
train(dist_dataset, EPOCHS)#in some cases can take up to 20000 epochs to train well
File "C:\image generator\pixiv\#image generator.py", line 441, in train
total_g_loss, total_d_loss = distributed_train_step(image_batch)#runs train_step function
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2419, in __call__
graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2777, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\function.py", line 2667, in _create_graph_function
capture_by_value=self._capture_by_value),
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 981, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py", line 441, in wrapped_fn
return weak_wrapped_fn().__wrapped__(*args, **kwds)
File "C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 968, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
C:\image generator\pixiv\#image generator.py:419 distributed_train_step *
per_replica_g_losses, per_replica_d_losses = strategy.run(train_step, args=(dist_dataset,))
C:\image generator\pixiv\#image generator.py:393 train_step *
d_loss = discriminator_loss(real_output, fake_output)#runs disc loss
C:\image generator\pixiv\#image generator.py:328 discriminator_loss *
total_loss = real_loss + fake_loss
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:984 binary_op_wrapper
return func(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:1276 _add_dispatch
return gen_math_ops.add_v2(x, y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py:483 add_v2
"AddV2", x=x, y=y, name=name)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:744 _apply_op_helper
attrs=attr_protos, op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:595 _create_op_internal
compute_device)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:3327 _create_op_internal
op_def=op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1817 __init__
control_input_ops, op_def)
C:\Users\will\miniconda3\lib\site-packages\tensorflow\python\framework\ops.py:1657 _create_c_op
raise ValueError(str(e))
ValueError: Dimensions must be equal, but are 0 and 2 for '{{node replica_1/add}} = AddV2[T=DT_FLOAT](replica_1/binary_crossentropy_1/weighted_loss/Mul, replica_1/binary_crossentropy_2/weighted_loss/Mul)' with input shapes: [0], [2].
So according to comments the problem lies in unequal batch sizes, due to the final batch being smaller than the specified batch size. I believe this is due to this line:
generated_images = generator((noise(BATCH_SIZE), noiseImage(BATCH_SIZE), np.ones([BATCH_SIZE,1])), training=True)
where the constant size BATCH_SIZE
is used, instead of the actual input shape of the batch, so that generated_images
is of a different shape than images
.
So one solution as mentioned is simply to use drop_remainder=True
in batch()
. However it might be better to get the generator to output images of the same shape as the input, so instead of passing BATCH_SIZE
as argument to your noise generation functions, you should use the actual size of the input batch. So maybe using tf.shape(images)[0]
would help. Alternatively, you could generate a fixed batch of images with BATCH_SIZE
, and then simply discard any extra images, like
num_images = tf.shape(images)[0]
generated_images = generated_images[:num_images]