Tags: python-3.x, tensorflow, keras, optuna

How to solve this InternalError: Graph execution error while optimizing hyperparameters in Optuna?


I have been optimizing the hyperparameters of several TensorFlow neural network models with Optuna in a Jupyter Notebook (Python 3.x) on WSL, running hundreds of trials with no problems, until I decided to save my studies for future reference. I have a class that defines the objective_function() and optimize() methods, and I modified its optimize_study() method to dump each study into a .pkl file:

def optimize_study(self):
    from optuna.visualization import plot_optimization_history
    from optuna.importance import get_param_importances

    study = optuna.create_study(direction = "minimize", sampler = optuna.samplers.TPESampler(),
                                pruner = optuna.pruners.HyperbandPruner(), study_name=self.study_name)
    study.optimize(self.objective_function, n_trials = self.n_trials, gc_after_trial=True)
    # gc_after_trial added later
    plot_optimization_history(study).show()
    print(get_param_importances(study))
    joblib.dump(study, f"{self.study_name}.pkl")  # Line added later
    return study.best_params, study.best_value
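
For completeness, reading a dumped study back in is just the standard joblib round trip (nothing specific to my setup; the file name is a placeholder for whatever self.study_name was):

import joblib

# Reload a previously dumped study and inspect it.
study = joblib.load("example_study.pkl")  # i.e. f"{self.study_name}.pkl"
print(study.best_params, study.best_value)
print(study.trials_dataframe().head())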

When I now run hyperparameter optimization (n_trials set to 5), the first two trials run fine, but the third one fails with:

E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'

W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 91, 81, 1, 1, 128, 81] 

Trial 2 failed with parameters: {'units': 81, 'activation': 'softsign', 'dropout': 0.07633939325087957, 'optimizer': 'Adam', 'adam_learning_rate': 0.01799516104446331, 'filters': 91} because of the following error: InternalError().

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3632372/3676345196.py", line 167, in objective_function
    self.neural_network.train_model(test_model)
  File "/tmp/ipykernel_3632372/227766830.py", line 178, in train_model
    history = model.fit(self.x_train, self.y_train, epochs = epoch_size, batch_size = BATCH_SIZE, callbacks = [early_stop],
  File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InternalError: Graph execution error:

Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 91, 81, 1, 1, 128, 81] 
     [[{{node CudnnRNN}}]]
     [[sequential/lstm/PartitionedCall]] [Op:__inference_train_function_121807]

For a denser neural network architecture, I get a similar error already on the first trial, preceded by a very long memory-allocation log indicating that the available GPU memory is exhausted, something like:

E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:56] Histogram of current allocation: (allocation_size_in_bytes, nb_allocation_of_that_sizes), ...;
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 4, 27
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 8, 8
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 272, 3
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 332, 3
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 512, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 544, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 1028, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 7968, 4
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 12288, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 18496, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 42496, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 45152, 6
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 93908, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 751264, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:59] 16819712, 1
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:90] CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: 67108864
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:92] CU_MEMPOOL_ATTR_USED_MEM_CURRENT: 18140216
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:93] CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: 67108864
E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:94] CU_MEMPOOL_ATTR_USED_MEM_HIGH: 34937704
E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'
W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 83, 34, 1, 1, 128, 34] 
E tensorflow/stream_executor/dnn.cc:868] CUDNN_STATUS_INTERNAL_ERROR
in tensorflow/stream_executor/cuda/cuda_dnn.cc(2683): 'cudnnRNNForwardTraining( cudnn.handle(), rnn_desc.handle(), model_dims.max_seq_length, input_desc.handles(), input_data.opaque(), input_h_desc.handle(), input_h_data.opaque(), input_c_desc.handle(), input_c_data.opaque(), rnn_desc.params_handle(), params.opaque(), output_desc.handles(), output_data->opaque(), output_h_desc.handle(), output_h_data->opaque(), output_c_desc.handle(), output_c_data->opaque(), workspace.opaque(), workspace.size(), reserve_space.opaque(), reserve_space.size())'
W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at cudnn_rnn_ops.cc:1563 : INTERNAL: Failed to call ThenRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 83, 34, 1, 1, 128, 34] 
Trial 0 failed with parameters: {'units': 34, 'activation': 'softsign', 'dropout': 0.1391979014457847, 'optimizer': 'Adam', 'adam_learning_rate': 0.07514111264388643, 'filters': 83} because of the following error: InternalError().

I tried reverting my code to its previous, working state by commenting out joblib.dump(study, f"{self.study_name}.pkl") and removing gc_after_trial=True, but I still get the same errors as above. I did not change anything in my model-training functions, which are implemented in a separate class that I instantiate inside the class containing optimize_study().

I have never hit this error before, even when optimizing 8-10 models with 500 trials in a single session on the same GPU (~5 GB of memory), so I do not understand why GPU memory is now insufficient. I suspect some variable in some module/file has been set differently, but I cannot figure out what. I looked at this question on SO, but I have been facing memory exhaustion only since adding the two lines above to optimize_study(), so this seems different.
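
(Diagnostic aside, not part of my pipeline: the quickest way I know to see what is actually holding GPU memory at this point is to dump the nvidia-smi process table; nvidia-smi is also available inside WSL2 with the Windows NVIDIA driver installed.)

import subprocess

# Diagnostic only: show which processes currently hold GPU memory
# (same output as running nvidia-smi in a terminal).
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)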

Any thoughts on why this might be occurring and how I can fix it?

EDIT: The memory allocation log shown above is a truncated version of the original. The exact code is too long to repost, but here is how the calls flow between the optimizer class O and the NN class N (consider a GRU model): O.optimize() => O.optimize_study() => O.objective_function() => N.build_GRU_model() => N.train_model() => N.predict() => N.evaluate_loss_function()

Optimizer functions

def optimize(self):
    best_params, best_values = self.optimize_study()
    print(f"Best params: {best_params}\n Best value: {best_values}")
    return self


def objective_function(self, trial):
    units = trial.suggest_int('units', 10, 50)
    activation = trial.suggest_categorical("activation", ['relu', 'tanh', 'softsign'])
    dropout = trial.suggest_float('dropout', 0.01, 0.5)
    test_model = self.build_deep_GRU_model(trial)
    self.neural_network.train_model(test_model)
    y_true, y_pred = self.neural_network.predict(test_model)
    return self.neural_network.evaluate_loss_function(y_true, y_pred)
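
build_deep_GRU_model() is not reposted here; per the call flow in the EDIT above, it ends up in build_GRU_model() in the NN class below. A rough sketch (mine, not the exact code) of what that wrapper amounts to:

def build_deep_GRU_model(self, trial):
    # Rough sketch only: trial.params holds everything suggested so far in
    # objective_function(); forward it to the NN builder shown below.
    params = trial.params
    return self.neural_network.build_GRU_model(hidden_neurons=params['units'],
                                               activator=params['activation'],
                                               drop_out=params['dropout'])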

NN functions

def build_GRU_model(self, hidden_neurons, activator, drop_out, OPTIMIZER = 'adam'):   
    keras.backend.clear_session()
    GRU_layer = keras.layers.GRU(hidden_neurons, dropout = drop_out, activation = activator)
    gru_model = keras.Sequential(layers = (GRU_layer, keras.layers.Dense(self.output_neurons)))
    gru_model.reset_states()
    gru_model.compile(optimizer = OPTIMIZER, loss = self.mae)
    return gru_model


def train_model(self, model, epoch_size = 150, BATCH_SIZE = BATCH_SIZE):
    early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 30, mode = 'min')
    history = model.fit(self.x_train, self.y_train, epochs = epoch_size, batch_size = BATCH_SIZE,
                        callbacks = [early_stop], validation_data = (self.x_valid, self.y_valid), shuffle = False)
    print(model.summary())
    return history

The predict() and evaluate_loss_function() helpers called in objective_function() simply undo the scaling and return the loss value that Optuna minimizes. The exception is raised on the history = model.fit(...) line.
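
For context, those two helpers amount to roughly the following (a sketch only; target_scaler is a placeholder name for whatever scaler was fitted during preprocessing, and the loss shown is plain MAE):

import numpy as np

def predict(self, model):
    # Sketch: predict on the validation set and undo the target scaling.
    y_pred = self.target_scaler.inverse_transform(model.predict(self.x_valid))
    y_true = self.target_scaler.inverse_transform(self.y_valid)
    return y_true, y_pred

def evaluate_loss_function(self, y_true, y_pred):
    # Sketch: mean absolute error on the unscaled values, returned to Optuna.
    return float(np.mean(np.abs(y_true - y_pred)))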


Solution

  • It turned out that I had multiple Jupyter kernel sessions running in the background on a remote server, holding GPU memory even after I disconnected from the server. The network architecture and the addition of joblib.dump() had nothing to do with this graph execution error. The problem did not reproduce on my laptop GPU, and starting an additional session on the server GPU led straight to OOM.
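
  • General mitigation, separate from the actual fix above: letting TensorFlow allocate GPU memory on demand instead of grabbing the whole card up front makes the cuDNN RNN kernels less likely to fail when several processes share one GPU. A minimal sketch, to run before any model is built:

import tensorflow as tf

# General mitigation (not the fix above): allocate GPU memory on demand so one
# notebook kernel does not reserve the entire card. Must run before the first
# op touches the GPU.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)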