python · tensorflow · machine-learning · keras · dataset

How do I load a dataset and process it without overloading RAM in Python?


My TensorFlow/Keras LSTM model crashes with a RAM overload on Kaggle every time I start the training process, ever since I expanded the dataset to 3.95 MB. I found that the dataset is too heavy to load in all at once, even with a dataloader, which breaks training. I have searched for a solution but cannot find one. Any support would be much appreciated.

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import random
import sys

with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
    text = file.read()

# Building the sorted character vocabulary of the text
vocabulary = sorted(list(set(text)))

char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))

# Dividing the text into overlapping subsequences of length max_length,
# stepping by `steps` characters, so that each window of max_length
# characters is used to predict the character that follows it
max_length = 100
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])

# One-hot encoding each character into a boolean vector
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype=bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    y[i, char_to_indices[next_chars[i]]] = 1

# Building the LSTM network for the task
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, len(vocabulary))))
model.add(Dense(len(vocabulary)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Helper function to sample an index from a probability array
def sample_index(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Helper function to generate text after the end of each epoch
def on_epoch_end(epoch, logs):
    if epoch % 30 == 0:
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(text) - max_length - 1)
        for diversity in [0.2, 0.5, 1.0, 1.2]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = text[start_index: start_index + max_length]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, max_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample_index(preds, diversity)
                next_char = indices_to_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Checkpoint callback to save the weights after each epoch
# in which the loss decreases
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')

# Callback to reduce the learning rate each time
# the loss plateaus
reduce_alpha = ReduceLROnPlateau(monitor='loss', factor=0.2,
                                 patience=1, min_lr=0.001)
callbacks = [print_callback, checkpoint, reduce_alpha]

# Training the LSTM model
model.fit(X, y, batch_size=128, epochs=28, callbacks=callbacks)

def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - max_length - 1)
    generated = ''
    sentence = text[start_index: start_index + max_length]
    generated += sentence
    for i in range(length):
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample_index(preds, diversity)
        next_char = indices_to_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

print(generate_text(500, 0.5))

The dataset is a text file of unorganized words and phrases; the model is meant to serve as an autocomplete implementation.


Solution

  • Well, you can always write a custom Python generator to feed your data to the model efficiently, using the keyword yield. If you have never implemented something similar, the concept is intuitive: write a function that takes all the parameters it needs (path to the file, batch_size, length of the vocabulary, ...), run an infinite loop inside it, and within that loop write the logic that assembles the data into batches and pushes them to the model (see the usage sketch after the snippet below).

    I wrote this snippet, but it is surely not accurate enough, so you will need to review it:

    def text_generator(batch_size, max_length, steps, len_vocabulary):
        while True:
            with open('/kaggle/input/crptic-python/dataset.txt', 'r') as file:
                text = file.read()
            sentences = []
            next_chars = []
            for i in range(0, len(text) - max_length, steps):
                sentences.append(text[i: i + max_length])
                next_chars.append(text[i + max_length])
                if len(sentences) == batch_size:
                    # One-hot encode each character of the batch into a boolean
                    # vector; `char_to_indices` is the mapping built in the question
                    X = np.zeros((batch_size, max_length, len_vocabulary), dtype=bool)
                    y = np.zeros((batch_size, len_vocabulary), dtype=bool)
                    for j, sentence in enumerate(sentences):
                        for t, char in enumerate(sentence):
                            X[j, t, char_to_indices[char]] = 1
                        y[j, char_to_indices[next_chars[j]]] = 1
                    yield X, y
                    # Reset the batch buffers (the original snippet reset
                    # `sentence` instead of `sentences`, so batches kept growing)
                    sentences = []
                    next_chars = []
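
    To train from this generator, pass it directly to model.fit together with steps_per_epoch, because an infinite generator never signals the end of an epoch on its own. Below is a minimal sketch using the values from the question (batch_size=128 and the text, vocabulary and callbacks defined there); the steps_per_epoch arithmetic is my assumption and worth double-checking:

    # Rough batch count per epoch: total (window, next-char) pairs
    # produced by text_generator, divided by the batch size.
    batch_size = 128
    num_sequences = (len(text) - max_length) // steps
    steps_per_epoch = num_sequences // batch_size

    model.fit(text_generator(batch_size, max_length, steps, len(vocabulary)),
              steps_per_epoch=steps_per_epoch,
              epochs=28,
              callbacks=callbacks)

    Note that the raw text file (3.95 MB) is not what exhausts RAM: the full one-hot X array in the question holds len(sentences) * max_length * len(vocabulary) booleans at once, and the generator avoids that by keeping only one batch in memory at a time.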