Tags: machine-learning, deep-learning, pytorch, mlflow

How to change the directory of mlflow logs?


I am using MLflow to log metrics, but I want to change the default directory where the logs are saved. Instead of writing the log files next to my main file, I want to store them in /path/outputs/lg, and I don't know how to change it. I currently use MLflow inside the Model class below without any extra configuration.

import os
from time import time

import mlflow
import numpy as np
import torch
import tqdm

# from segmentation_models_pytorch.utils import metrics
from AICore.emergency_landing.metrics import IoU, F1
from AICore.emergency_landing.utils import AverageMeter
from AICore.emergency_landing.utils import TBLogger


class Model:
    def __init__(self, model, num_classes=5, ignore_index=0, optimizer=None, scheduler=None, criterion=None,
                 device=None, epochs=30, train_loader=None, val_loader=None, tb_logger: TBLogger = None,
                 logger=None,
                 best_model_path=None,
                 model_check_point_path=None,
                 load_from_best_model=None,
                 load_from_model_checkpoint=None,
                 early_stopping=None,
                 debug=False):

        self.debug = debug

        self.early_stopping = {
            'init': early_stopping,
            'changed': 0
        }
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.epochs = epochs
        self.train_loader = train_loader
        self.val_loader = val_loader

        self.model = model.to(device)

        self.tb_logger = tb_logger
        self.logger = logger

        self.best_loss = np.inf

        if not os.path.exists(best_model_path):
            os.makedirs(best_model_path)
        self.best_model_path = best_model_path

        if not os.path.exists(model_check_point_path):
            os.makedirs(model_check_point_path)
        self.model_check_point_path = model_check_point_path

        self.load_from_best_model = load_from_best_model
        self.load_from_model_checkpoint = load_from_model_checkpoint

        if self.load_from_best_model is not None:
            self.load_model(path=self.load_from_best_model)
        if self.load_from_model_checkpoint is not None:
            self.load_model_checkpoint(path=self.load_from_model_checkpoint)

        self.train_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
        self.val_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)
        self.test_iou = IoU(num_classes=num_classes, ignore_index=ignore_index)

        self.train_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
        self.val_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')
        self.test_f1 = F1(num_classes=num_classes, ignore_index=ignore_index, mdmc_average='samplewise')

    def metrics(self, is_train=True):
        if is_train:
            train_losses = AverageMeter('Training Loss', ':.4e')
            train_iou = AverageMeter('Training iou', ':6.2f')
            train_f_score = AverageMeter('Training F_score', ':6.2f')

            return train_losses, train_iou, train_f_score
        else:
            val_losses = AverageMeter('Validation Loss', ':.4e')
            val_iou = AverageMeter('Validation mean iou', ':6.2f')
            val_f_score = AverageMeter('Validation F_score', ':6.2f')

            return val_losses, val_iou, val_f_score

    def fit(self):

        self.logger.info("\nStart training\n\n")
        start_training_time = time()

        with mlflow.start_run():
            for e in range(self.epochs):
                start_training_epoch_time = time()
                self.model.train()
                train_losses_avg, train_iou_avg, train_f_score_avg = self.metrics(is_train=True)
                with tqdm.tqdm(self.train_loader, unit="batch") as tepoch:
                    tepoch.set_description(f"Epoch {e}")
                    for image, target in tepoch:
                        # Transfer Data to GPU if available
                        image = image.to(self.device)
                        target = target.to(self.device)
                        # Clear the gradients
                        self.optimizer.zero_grad()
                        # Forward Pass
                        # out = self.model(image)['out']
                        # if unet == true => remove ['out']
                        out = self.model(image)
                        # Find the Loss
                        loss = self.criterion(out, target)
                        # Calculate Loss
                        train_losses_avg.update(loss.item(), image.size(0))
                        # Calculate gradients
                        loss.backward()
                        # Update Weights
                        self.optimizer.step()

                        iou = self.train_iou(out.cpu(), target.cpu()).item()
                        train_iou_avg.update(iou)

                        f1_score = self.train_f1(out.cpu(), target.cpu()).item()
                        train_f_score_avg.update(f1_score)

                        tepoch.set_postfix(loss=train_losses_avg.avg,
                                           iou=train_iou_avg.avg,
                                           f_score=train_f_score_avg.avg)
                        if self.debug:
                            break

                self.tb_logger.log(log_type='criterion/training', value=train_losses_avg.avg, epoch=e)
                self.tb_logger.log(log_type='iou/training', value=train_iou_avg.avg, epoch=e)
                self.tb_logger.log(log_type='f_score/training', value=train_f_score_avg.avg, epoch=e)

                mlflow.log_metric('criterion/training', train_losses_avg.avg, step=e)
                mlflow.log_metric('iou/training', train_iou_avg.avg, step=e)
                mlflow.log_metric('f_score/training', train_f_score_avg.avg, step=e)

                end_training_epoch_time = time() - start_training_epoch_time
                print('\n')
                self.logger.info(
                    f'Training Results - [{end_training_epoch_time:.3f}s] Epoch: {e}:'
                    f' f_score: {train_f_score_avg.avg:.3f},'
                    f' IoU: {train_iou_avg.avg:.3f},'
                    f' Loss: {train_losses_avg.avg:.3f}')

                # validation step
                val_loss = self.evaluation(e)
                # apply scheduler
                if self.scheduler:
                    self.scheduler.step()
                # early stopping
                if self.early_stopping['init'] >= self.early_stopping['changed']:
                    self._early_stopping_model(val_loss=val_loss)
                else:
                    print(f'The model is no longer improving, early stopping at epoch [{e}]')
                    break

                # save best model
                if self.best_model_path is not None:
                    self._best_model(val_loss=val_loss, path=self.best_model_path)

                # model check points
                if self.model_check_point_path is not None:
                    self.save_model_check_points(path=self.model_check_point_path, epoch=e, net=self.model,
                                                 optimizer=self.optimizer, loss=self.criterion,
                                                 avg_loss=train_losses_avg.avg)

                # log mlflow
                if self.scheduler:
                    mlflow.log_param("get_last_lr", self.scheduler.get_last_lr())
                    mlflow.log_param("scheduler", self.scheduler.state_dict())

                self.tb_logger.flush()
                if self.debug:
                    break

            end_training_time = time() - start_training_time
            print(f'Finished Training after {end_training_time:.3f}s')
            self.tb_logger.close()

    def evaluation(self, epoch):
        print('Validating...')
        start_validation_epoch_time = time()
        self.model.eval()  # Optional unless the model has layers (e.g. Dropout/BatchNorm) that behave differently in eval mode
        with torch.no_grad():
            val_losses_avg, val_iou_avg, val_f_score_avg = self.metrics(is_train=False)
            with tqdm.tqdm(self.val_loader, unit="batch") as tepoch:
                for image, target in tepoch:
                    # Transfer Data to GPU if available
                    image = image.to(self.device)
                    target = target.to(self.device)
                    # out = self.model(image)['out']
                    # if unet == true => remove ['out']
                    out = self.model(image)
                    # Find the Loss
                    loss = self.criterion(out, target)
                    # Calculate Loss
                    val_losses_avg.update(loss.item(), image.size(0))

                    iou = self.val_iou(out.cpu(), target.cpu()).item()
                    val_iou_avg.update(iou)

                    f1_score = self.val_f1(out.cpu(), target.cpu()).item()
                    val_f_score_avg.update(f1_score)

                    tepoch.set_postfix(loss=val_losses_avg.avg,
                                       iou=val_iou_avg.avg,
                                       f_score=val_f_score_avg.avg)
                    if self.debug:
                        break
            print('\n')
            self.tb_logger.log(log_type='criterion/validation', value=val_losses_avg.avg, epoch=epoch)
            self.tb_logger.log(log_type='iou/validation', value=val_iou_avg.avg, epoch=epoch)
            self.tb_logger.log(log_type='f_score/validation', value=val_f_score_avg.avg, epoch=epoch)

            mlflow.log_metric('criterion/validation', val_losses_avg.avg, step=epoch)
            mlflow.log_metric('iou/validation', val_iou_avg.avg, step=epoch)
            mlflow.log_metric('f_score/validation', val_f_score_avg.avg, step=epoch)

            end_validation_epoch_time = time() - start_validation_epoch_time
            self.logger.info(
                f'validation Results - [{end_validation_epoch_time:.3f}s] Epoch: {epoch}:'
                f' f_score: {val_f_score_avg.avg:.3f},'
                f' IoU: {val_iou_avg.avg:.3f},'
                f' Loss: {val_losses_avg.avg:.3f}')
            print('\n')
            return val_losses_avg.avg

    def _save_model(self, name, path, params):
        torch.save(params, path)

    def _early_stopping_model(self, val_loss):
        if self.best_loss < val_loss:
            self.early_stopping['changed'] += 1
        else:
            self.early_stopping['changed'] = 0

    def _best_model(self, val_loss, path):
        if self.best_loss > val_loss:
            self.best_loss = val_loss
            name = f'/best_model_loss_{self.best_loss:.2f}'.replace('.', '_')
            self._save_model(name, path=f'{path}/{name}.pt', params={
                'model_state_dict': self.model.state_dict(),
            })

            print(f'The best model is saved with criterion: {self.best_loss:.2f}')

    def save_model_check_points(self, path, epoch, net, optimizer, loss, avg_loss):
        name = f'/model_epoch_{epoch}_loss_{avg_loss:.2f}'.replace('.', '_')
        self._save_model(name, path=f'{path}/{name}.pt', params={
            'epoch': epoch,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'criterion': loss,
        })
        print(f'model checkpoint is saved at model_epoch_{epoch}_loss_{avg_loss:.2f}')

    def load_model_checkpoint(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        self.criterion = checkpoint['criterion']

        return epoch

    def load_model(self, path):
        best_model = torch.load(path)
        self.model.load_state_dict(best_model['model_state_dict'])

Solution

  • The solution is to set the tracking URI explicitly and to create (or look up) the experiment yourself:

    import hydra.utils  # the answer assumes the project already uses Hydra for path handling
    import mlflow

    mlflow.set_tracking_uri(uri=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
    exp = mlflow.get_experiment_by_name(name='Emegency_landing')
    if not exp:
        experiment_id = mlflow.create_experiment(
            name='Emegency_landing',
            artifact_location=f'file://{hydra.utils.to_absolute_path("../output/mlruns")}')
    else:
        experiment_id = exp.experiment_id
    

    Then pass the experiment ID to mlflow.start_run():

    with mlflow.start_run(experiment_id=experiment_id):
        pass  # your training/evaluation code goes here
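
    For reference, here is a minimal end-to-end sketch of the same idea without Hydra, assuming the runs should live under ../output/mlruns relative to the working directory; the path, the experiment name, and the logged metric below are placeholders taken from the snippets above, not fixed values:

    import os
    import mlflow

    # Placeholder location taken from the answer above; adjust to your layout,
    # e.g. /path/outputs/lg/mlruns from the question.
    tracking_uri = f'file://{os.path.abspath("../output/mlruns")}'
    mlflow.set_tracking_uri(tracking_uri)

    # Reuse the experiment if it already exists, otherwise create it at the same location.
    exp = mlflow.get_experiment_by_name('Emegency_landing')
    experiment_id = exp.experiment_id if exp else mlflow.create_experiment(
        'Emegency_landing', artifact_location=tracking_uri)

    with mlflow.start_run(experiment_id=experiment_id):
        mlflow.log_metric('criterion/training', 0.123, step=0)  # example metric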
    

    If you don't specify the path to mlruns, running the mlflow ui command will automatically create another folder named mlruns in whatever directory you launch it from. So make sure the UI is pointed at the same mlruns directory that you configured as the tracking URI.
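
    For example, assuming the tracking location used above, launching the UI with mlflow ui --backend-store-uri file://<absolute-path>/output/mlruns points it at the existing store instead of creating a fresh mlruns folder in the current directory.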