Inspired from Andrej Karpathy Char-RNN, There is a Tensorflow implementation of char-rnn sherjilozair/char-rnn-tensorflow: Multi-layer Recurrent Neural Networks (LSTM, RNN) for character-level language models in Python using Tensorflow. I want to implement bidirectional character level language model from this code. I change the model.py and wrote a simple code:
class Model:
def __init__(self, input_data, targets, seq_length=Config.max_seq_length, training=True):
if Config.model == 'rnn':
cell_fn = rnn.BasicRNNCell
elif Config.model == 'gru':
cell_fn = rnn.GRUCell
elif Config.model == 'lstm':
cell_fn = rnn.BasicLSTMCell
elif Config.model == 'nas':
cell_fn = rnn.NASCell
else:
raise Exception("model type not supported: {}".format(Config.model))
fw_cells = []
bw_cells = []
for _ in range(Config.num_layers):
fw_cell = cell_fn(Config.rnn_size)
bw_cell = cell_fn(Config.rnn_size)
fw_cells.append(fw_cell)
bw_cells.append(bw_cell)
self.fw_cell = rnn.MultiRNNCell(fw_cells, state_is_tuple=True)
self.bw_cell = rnn.MultiRNNCell(bw_cells, state_is_tuple=True)
self.input_data, self.targets = input_data, targets
with tf.variable_scope('rnnlm'):
softmax_w = tf.get_variable("softmax_w", [Config.rnn_size*2, Config.vocab_size])
softmax_b = tf.get_variable("softmax_b", [Config.vocab_size])
embedding = tf.get_variable("embedding", [Config.vocab_size, Config.rnn_size])
inputs = tf.nn.embedding_lookup(embedding, self.input_data)
inputs = tf.unstack(inputs, num=seq_length, axis=1)
outputs, _, _ = tf.nn.static_bidirectional_rnn(self.fw_cell, self.bw_cell, inputs,
dtype=tf.float32, scope='rnnlm')
output = tf.reshape(tf.concat(outputs, 1), [-1, Config.rnn_size*2])
self.logits = tf.matmul(output, softmax_w) + softmax_b
self.probs = tf.nn.softmax(self.logits)
self.lr = tf.Variable(0.0, trainable=False)
if training:
loss = legacy_seq2seq.sequence_loss_by_example(
[self.logits],
[tf.reshape(self.targets, [-1])],
[tf.sign(tf.cast(tf.reshape(self.targets, [-1]), dtype=tf.float32))])
with tf.name_scope('cost'):
self.cost = tf.reduce_mean(loss)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), Config.grad_clip)
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(self.lr)
self.train_op = optimizer.apply_gradients(zip(grads, tvars))
In training phase, I see a fast converge. After near 3000 iteration, the loss reach 0.003. In test phase, the probability of all character is 1.0. I think there is a mistake. I will so glad to get some help to find my mistake.
use the preceding and following output to predict the prob of current word. In your case, you used current rnn output to predict the prob of current word.