I'm specifying a network with regularization from dropout, but I'm having trouble understanding how the dropout is being processed here. Specifically, why isn't the difference between the proportion of zeros before and after applying dropout exactly equal to the dropout rate?
import tensorflow as tf
from tensorflow.keras.layers import Dense

class DropoutDenseNetwork(tf.Module):
    def __init__(self, name=None):
        super(DropoutDenseNetwork, self).__init__(name=name)
        self.dense_layer1 = Dense(32)
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.dense_layer2 = Dense(10, activation=tf.identity)

    @tf.function
    def __call__(self, x, is_training):
        embed = self.dense_layer1(x)
        # Fraction of activations that are exactly zero before dropout
        propn_zero_before = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        embed = self.dropout(embed, training=is_training)
        # Fraction of activations that are exactly zero after dropout
        propn_zero_after = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        tf.print('Zeros before and after:', propn_zero_before, "and", propn_zero_after)
        output = self.dense_layer2(embed)
        return output

if 'drop_dense_net' not in locals():
    drop_dense_net = DropoutDenseNetwork()
drop_dense_net(tf.ones([1, 100]), tf.constant(True))
Because rate is just the probability that any single unit will be dropped during training. The realized fraction of dropped units won't always land on exactly 0.2, especially with only 32 units. If you increase the number of units (e.g., to 100,000), you'll see that the fraction gets much closer to the rate of 0.2:
import tensorflow as tf
from tensorflow.keras.layers import Dense

class DropoutDenseNetwork(tf.Module):
    def __init__(self, name=None):
        super(DropoutDenseNetwork, self).__init__(name=name)
        # Many units, so the realized zero fraction concentrates around the rate
        self.dense_layer1 = Dense(100000)
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.dense_layer2 = Dense(1)

    @tf.function
    def __call__(self, x, is_training):
        embed = self.dense_layer1(x)
        propn_zero_before = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        embed = self.dropout(embed, training=is_training)
        propn_zero_after = tf.reduce_mean(tf.cast(tf.equal(embed, 0.), tf.float32))
        tf.print('Zeros before and after:', propn_zero_before, "and", propn_zero_after)
        return self.dense_layer2(embed)

drop_dense_net = DropoutDenseNetwork()
drop_dense_net(tf.ones([1, 10]), tf.constant(True))
Zeros before and after: 0 and 0.19954
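To see the same effect in isolation, here is a minimal sketch using tf.nn.dropout directly (the sizes in the loop are my own choices for illustration). The realized zero fraction is a sample average of independent drop decisions, so it approaches the nominal rate as the number of units grows:

import tensorflow as tf

tf.random.set_seed(0)
rate = 0.2

# Apply dropout to all-ones tensors of increasing size and compare
# the realized fraction of zeros with the nominal rate of 0.2.
for n in [32, 1000, 100000]:
    dropped = tf.nn.dropout(tf.ones([n]), rate=rate)
    zero_frac = tf.reduce_mean(tf.cast(tf.equal(dropped, 0.), tf.float32))
    tf.print(n, 'units -> fraction zeroed:', zero_frac)

With 32 units, the fraction can easily come out as 0.125 or 0.25 on a given run; with 100,000 units it will typically be within a fraction of a percent of 0.2.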
Under the hood, tf.keras.layers.Dropout uses tf.nn.dropout. The documentation for the latter explains:

rate: The probability that each element is dropped

In the source code, you can see that it creates a tensor of uniform random values between 0 and 1 with the same shape as the input, and keeps the elements whose random value is greater than or equal to rate. Of course, for a finite input, the fraction of random values that fall below rate won't be exactly 0.2:
random_tensor = random_ops.random_uniform(
    noise_shape, seed=seed, dtype=x_dtype)
keep_mask = random_tensor >= rate
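As a rough sketch of that masking step outside of TensorFlow's internals (the variable names here are my own, and the rescaling by 1 / (1 - rate) matches what tf.nn.dropout does so that the expected sum of activations is unchanged):

import tensorflow as tf

rate = 0.2
x = tf.ones([10])

# Draw one uniform random value per element, as in the source snippet above.
random_tensor = tf.random.uniform(tf.shape(x), dtype=x.dtype)
keep_mask = random_tensor >= rate  # True where the element survives
# Zero out dropped elements and rescale the survivors by 1 / (1 - rate).
y = x * tf.cast(keep_mask, x.dtype) / (1. - rate)
tf.print(y)

Each element's survival is an independent Bernoulli trial, which is exactly why the observed proportion of zeros only converges to rate as the number of elements grows.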