I have been optimizing the hyperparameters of several TensorFlow neural network models with Optuna in a Jupyter Notebook (Python 3.x) on WSL, running hundreds of trials with no prior problems, until I decided to save my studies for future reference.
I have a class within which I define the objective_function() and optimize() methods, and I modified the optimize_study() method so I can dump studies into .pkl files:
def optimize_study(self):
    from optuna.visualization import plot_optimization_history
    from optuna.importance import get_param_importances

    study = optuna.create_study(direction = "minimize", sampler = optuna.samplers.TPESampler(),
                                pruner = optuna.pruners.HyperbandPruner(), study_name=self.study_name)
    study.optimize(self.objective_function, n_trials = self.n_trials, gc_after_trial=True)  # gc_after_trial added later

    plot_optimization_history(study).show()
    print(get_param_importances(study))

    joblib.dump(study, f"{self.study_name}.pkl")  # Line added later
    return study.best_params, study.best_value
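The joblib.dump() call is only meant to let me reload a finished study later for reference, along the lines of this minimal sketch (the file name here is just a placeholder assumed to match the one written above):

import joblib
import optuna

# Reload a previously saved study from disk (file name assumed to match
# the f"{self.study_name}.pkl" written in optimize_study()).
study = joblib.load("my_study.pkl")

print(study.best_params)
print(study.best_value)
print(optuna.importance.get_param_importances(study))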
When I now run hyperparameter optimization (set to 5 trials), everything works fine until the third trial, where I get:
E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'
W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 91, 81, 1, 1, 128, 81]
Trial 2 failed with parameters: {'units': 81, 'activation': 'softsign', 'dropout': 0.07633939325087957, 'optimizer': 'Adam', 'adam_learning_rate': 0.01799516104446331, 'filters': 91} because of the following error: InternalError().
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
value_or_values = func(trial)
File "/tmp/ipykernel_3632372/3676345196.py", line 167, in objective_function
self.neural_network.train_model(test_model)
File "/tmp/ipykernel_3632372/227766830.py", line 178, in train_model
history = model.fit(self.x_train, self.y_train, epochs = epoch_size, batch_size = BATCH_SIZE, callbacks = [early_stop],
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InternalError: Graph execution error:
Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 91, 81, 1, 1, 128, 81]
[[{{node CudnnRNN}}]]
[[sequential/lstm/PartitionedCall]] [Op:__inference_train_function_121807]
For a denser network architecture, I get a similar error already on the first trial, but with a very long memory-allocation log indicating that the available GPU memory is exhausted, something like:
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:56] Histogram of current allocation: (allocation_size_in_bytes, nb_allocation_of_that_sizes), ...;
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 4, 27
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 8, 8
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 272, 3
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 332, 3
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 512, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 544, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 1028, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 7968, 4
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 12288, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 18496, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 42496, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 45152, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 93908, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 751264, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 16819712, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:90] CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: 67108864
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:92] CU_MEMPOOL_ATTR_USED_MEM_CURRENT: 18140216
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:93] CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: 67108864
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:94] CU_MEMPOOL_ATTR_USED_MEM_HIGH: 34937704
E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'
W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 83, 34, 1, 1, 128, 34]
E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'
W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 83, 34, 1, 1, 128, 34]
Trial 0 failed with parameters: {'units': 34, 'activation': 'softsign', 'dropout': 0.1391979014457847, 'optimizer': 'Adam', 'adam_learning_rate': 0.07514111264388643, 'filters': 83} because of the following error: InternalError().
I tried reverting my code to the state in which hyperparameter optimization had worked without problems, by commenting out joblib.dump(study, f"{self.study_name}.pkl") and gc_after_trial=True, but I still get the same errors as above. I did not change anything in my model training functions, which are implemented in a separate class; an object of that class is instantiated inside the class containing optimize_study().
I have never hit this error before, even across 8-10 models optimized with 500 trials in a single session on the same GPU (~5 GB of memory), so I do not understand why the GPU memory is suddenly insufficient. I suspect some variable in some module/file has been set differently, but I cannot figure out which.
I looked at this question on SO, but I only started facing memory exhaustion after adding the two changes above to optimize_study(), so my case seems different. Any thoughts on why this might be occurring and how I can fix it?
EDIT: The memory allocation log shown is a truncated version of the original.
The exact code is too long to repost, but here is how it flows between an optimizer class O and an NN class N (consider a GRU model):
O.optimize()
  => O.optimize_study()
    => O.objective_function()
      => N.build_GRU_model()
      => N.train_model()
      => N.predict()
      => N.evaluate_loss_function()
Optimizer functions
def optimize(self):
    best_params, best_values = self.optimize_study()
    print(f"Best params: {best_params}\n Best value: {best_values}")
    return self
def objective_function(self, trial):
    units = trial.suggest_int('units', 10, 50)
    activation = trial.suggest_categorical("activation", ['relu', 'tanh', 'softsign'])
    dropout = trial.suggest_float('dropout', 0.01, 0.5)

    test_model = self.build_deep_GRU_model(trial)
    self.neural_network.train_model(test_model)
    y_true, y_pred = self.neural_network.predict(test_model)
    return self.neural_network.evaluate_loss_function(y_true, y_pred)
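build_deep_GRU_model() itself is too long to repost; purely as an illustration of the flow (a hypothetical sketch, not the real method), it essentially forwards the trial's suggestions to N.build_GRU_model():

def build_deep_GRU_model(self, trial):
    # Hypothetical sketch only: re-read the suggested hyperparameters (Optuna returns
    # the cached value when the same parameter is suggested again within a trial)
    # and pass them to the NN class. The real method also tunes the optimizer,
    # learning rate, filters, etc.
    units = trial.suggest_int('units', 10, 50)
    activation = trial.suggest_categorical("activation", ['relu', 'tanh', 'softsign'])
    dropout = trial.suggest_float('dropout', 0.01, 0.5)
    return self.neural_network.build_GRU_model(units, activation, dropout)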
NN functions
def build_GRU_model(self, hidden_neurons, activator, drop_out, OPTIMIZER = 'adam'):
    keras.backend.clear_session()
    GRU_layer = keras.layers.GRU(hidden_neurons, dropout = drop_out, activation = activator)
    gru_model = keras.Sequential(layers = (GRU_layer, keras.layers.Dense(self.output_neurons)))
    gru_model.reset_states()
    gru_model.compile(optimizer = OPTIMIZER, loss = self.mae)
    return gru_model
def train_model(self, model, epoch_size = 150, BATCH_SIZE = BATCH_SIZE):
    early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 30, mode = 'min')
    history = model.fit(self.x_train, self.y_train, epochs = epoch_size, batch_size = BATCH_SIZE,
                        callbacks = [early_stop], validation_data = (self.x_valid, self.y_valid), shuffle = False)
    print(model.summary())
    return history
The predict and loss-evaluator functions called in objective_function() simply undo the scaling and return the loss value used for hyperparameter optimization. The exception is raised on the history = model.fit(...) line.
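For reference, the kind of per-trial cleanup that is typically suggested for this symptom would sit at the end of objective_function(), roughly like the sketch below (an assumption about placement, not my actual code; gc_after_trial=True in study.optimize() already triggers a comparable gc.collect() after each trial):

import gc
from tensorflow import keras

def objective_function(self, trial):
    # ... suggest hyperparameters, build and train the model as shown above ...
    test_model = self.build_deep_GRU_model(trial)
    self.neural_network.train_model(test_model)
    y_true, y_pred = self.neural_network.predict(test_model)
    loss = self.neural_network.evaluate_loss_function(y_true, y_pred)

    # Drop the reference to the finished model and reclaim graph/GPU-side state
    # before the next trial builds a new one.
    del test_model
    keras.backend.clear_session()
    gc.collect()

    return loss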
It turned out that I had multiple Jupyter kernel sessions running in the background on a remote server, which kept holding GPU memory even after I disconnected from the server. The network architecture and the addition of joblib.dump() had nothing to do with this graph execution error. I verified that the problem did not reproduce on my laptop GPU, and that starting an additional session on the server GPU led to OOM.
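For anyone running into the same thing: checking what already holds GPU memory on the server before launching a study would have exposed the stale kernels, and memory growth keeps a single kernel from reserving the whole ~5 GB card up front. A minimal sketch (the nvidia-smi query and the TensorFlow memory-growth setting are standard; treat the exact setup as illustrative):

import subprocess
import tensorflow as tf

# 1) List the processes currently holding GPU memory; lingering ipykernel
#    processes from old Jupyter sessions show up here with their memory usage.
print(subprocess.run(
    ["nvidia-smi", "--query-compute-apps=pid,process_name,used_memory", "--format=csv"],
    capture_output=True, text=True,
).stdout)

# 2) Ask TensorFlow to allocate GPU memory on demand instead of reserving the
#    whole card when the first op runs (must be set before any GPU is used).
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)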