Tags: python, json, tensorflow, scikit-learn, slurm

Saving model hyperparameters as dictionary: json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)


I am tuning some sklearn and tensorflow models. I get the hyperparameters back as a dictionary, e.g., {'l1': 0.001, 'lr': 0.001}, and I save them in a larger dictionary within my tuning function:

import json
import os
from typing import Callable, List, Union

import numpy as np
import optuna
import pandas as pd


def optunize_hyperparameters(X_tr: Union[List[np.ndarray], pd.DataFrame, np.ndarray],
                             y_tr: Union[dict, pd.DataFrame, np.ndarray],
                             Objective: Callable, builder_func: Callable, model_name: str,
                             fit_params: dict, log_path: str, n_trials: int = 10, **kwargs) -> dict:


    """
    Optimizes a model's hyperparameters with grid search. Loads existing ones if they exist.

    Parameters
    ----------
    X: 
        Training features.
    y: 
        Training targets
    Objective:
        Optuna objective callable class.
    model_name:
        In the format of "<model name>_qx" for x in {5,25,50,75,95}.
    hp_grid:
        Parameter grid for grid search
    fit_params:
        Parameters to pass to model.fit()
    log_path:
        Path to hyperparameter log file, e.g., "tuning_log.txt"

    Returns
    -------
    best_hps:
        A dictionary that can be passed as **kwargs to builder_func.
    """
    
    # Check if log exists, create log file if not
    if not os.path.exists(log_path):
        with open(log_path, 'w') as f:
            log = {}
            json.dump(log, f)

    # Load log
    try:
        with open(log_path, 'r') as f:
            log = json.load(f)
            print("Successfully loaded existing hyperparameters.")
    except OSError as e:
        print(e)

    # Look for existing hps, optimize otherwise
    try:
        best_hps = log[model_name]
        print("Existing hyperparameters found, loading...")
    
    except KeyError:

        print("No existing hyperparameters found, optimizing hyperparameters...")
        study = optuna.create_study(
            sampler=optuna.samplers.RandomSampler(),
            pruner=optuna.pruners.SuccessiveHalvingPruner(),
            direction='maximize'
        )

        study.optimize(
            Objective(
                X_tr, y_tr,
                builder_func=builder_func,
                fit_params=fit_params,
                **kwargs
            ),
            n_trials=n_trials,
            n_jobs=-1
        )
        
        best_hps = study.best_params
        
        # Add hps to log and save it
        log[model_name] = best_hps

        with open(log_path, 'w') as f:
            json.dump(log, f)
        
    return best_hps

I am submitting several jobs in parallel (32 or so) on Compute Canada using Slurm. I seem to get json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) at random; it is not consistent for any particular job or any particular model. I cannot replicate the issue locally, so I don't even know how to debug it. Changing the file from .json to .txt seemed to help a little. In the past, I have also gotten the decode error for other lines in the file, so it may not necessarily be an issue with the first character, though the first-character error does seem to be the most common. I have looked at tuning_log.txt and it seems fine:

{"MT1_2012": {"l1": 0.01, "lr": 0.01}, "MT3_1997": {"l1": 1e-06, "lr": 0.01}, ...}

I have looked through other posts on SO about this error, but they all pertain to fetching web data, and those issues do not seem to be the same as mine (which is seemingly random and inconsistent).
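
Since I cannot reproduce this locally, one way to at least see what the failing jobs are actually reading is to capture the raw file contents whenever the decode fails. This is just an illustrative sketch, not part of my function (the helper name load_json_log is made up):

import json

def load_json_log(log_path: str) -> dict:
    # Read the raw text first so it can be inspected if decoding fails
    with open(log_path, 'r') as f:
        raw = f.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Print the repr of the offending payload; an empty string or a
        # truncated tail here is a strong clue about what went wrong
        print(f"Decode failed, raw contents were: {raw!r}")
        raise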


Solution

  • This was happening because I was running multiple jobs in parallel that were all trying to open and write to the same file. I solved the issue with file locking, using the portalocker package. I modified my function as follows:

    import portalocker  # third-party: pip install portalocker

    def optunize_hyperparameters(X_tr: Union[List[np.ndarray], pd.DataFrame, np.ndarray],
                                 y_tr: Union[dict, pd.DataFrame, np.ndarray],
                                 Objective: Callable, builder_func: Callable, model_name: str,
                                 fit_params: dict, log_path: str, n_trials: int = 10, **kwargs) -> dict:
    
    
        """
        Optimizes a model's hyperparameters with grid search. Loads existing ones if they exist.
    
        Parameters
        ----------
        X: 
            Training features.
        y: 
            Training targets
        Objective:
            Optuna objective callable class.
        model_name:
            In the format of "<model name>_qx" for x in {5,25,50,75,95}.
        hp_grid:
            Parameter grid for grid search
        fit_params:
            Parameters to pass to model.fit()
        log_path:
            Path to hyperparameter log file, e.g., "tuning_log.txt"
    
        Returns
        -------
        best_hps:
            A dictionary that can be passed as **kwargs to builder_func.
        """
        
        # Create the log file atomically if it does not exist yet; mode 'x'
        # raises FileExistsError if the file is already there, so two jobs
        # cannot both create (and truncate) it
        try:
            with open(log_path, 'x') as f:
                portalocker.lock(f, portalocker.LOCK_EX)
                json.dump({}, f)
                portalocker.unlock(f)
        except FileExistsError:
            pass
    
        # Load the log under an exclusive lock so we never read a file that
        # another job is halfway through rewriting
        with open(log_path, 'r') as f:
            portalocker.lock(f, portalocker.LOCK_EX)
            try:
                log = json.load(f)
            except json.JSONDecodeError:
                log = {}  # freshly created, not yet written to
            portalocker.unlock(f)
    
        # Look for existing hps, optimize otherwise
        try:
            best_hps = log[model_name]
            print("Existing hyperparameters found, loading...")
        
        except KeyError:
    
            print("No existing hyperparameters found, optimizing hyperparameters...")
            study = optuna.create_study(
                sampler=optuna.samplers.RandomSampler(),
                pruner=optuna.pruners.SuccessiveHalvingPruner(),
                direction='maximize'
            )
    
            study.optimize(
                Objective(
                    X_tr, y_tr,
                    builder_func=builder_func,
                    fit_params=fit_params,
                    **kwargs
                ),
                n_trials=n_trials,
                n_jobs=-1
            )
            
            best_hps = study.best_params
            
            # Add hps to log and save it; re-read the log under the lock first
            # so we don't clobber entries written by other jobs since our
            # earlier read
            with open(log_path, 'r+') as f:
                portalocker.lock(f, portalocker.LOCK_EX)
                try:
                    log = json.load(f)
                except json.JSONDecodeError:
                    log = {}  # defensive: file was empty or corrupt
                log[model_name] = best_hps
                f.seek(0)
                json.dump(log, f)
                f.truncate()
                portalocker.unlock(f)
                
        return best_hps
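
For what it's worth, the exact message Expecting value: line 1 column 1 (char 0) is what the json module raises when it is handed an empty string. That is consistent with the race: opening a file with mode 'w' truncates it to zero bytes immediately, so a reader that sneaks in before the writer finishes sees an empty file. A minimal demonstration:

import json

# Decoding an empty string reproduces the exact error from the question;
# this is what a reader sees when another process has truncated the file
# (mode 'w') but not yet written the new JSON
try:
    json.loads("")
except json.JSONDecodeError as e:
    print(e)  # Expecting value: line 1 column 1 (char 0)

With the exclusive lock, readers and writers take turns on the file, so no job ever observes it in this half-written state. The decode errors I previously saw at other positions in the file were presumably reads that caught a writer partway through dumping the new JSON.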