Source code for argus.callbacks.checkpoints

"""Callbacks for argus model saving.
"""
import os
import math
import warnings

from argus.engine import State
from argus.callbacks.callback import Callback
from argus.metrics.metric import init_better


class Checkpoint(Callback):
    """Save the model with a given period.

    In the simplest case, the callback can be used to save the model
    after each epoch.

    Args:
        dir_path (str, optional): Directory to save checkpoints.
            The desired directory will be created if it does not exist.
            Defaults to ''.
        file_format (str, optional): Model saving filename format.
            Any valid value names from the model State may be used.
            Defaults to 'model-{epoch:03d}-{train_loss:.6f}.pth'.
        max_saves (int, optional): Number of last saved models to keep.
            Should be positive. If None - save all models.
            Defaults to None.
        period (int, optional): Interval (number of epochs) between
            checkpoint saves. Defaults to 1.
        save_after_exception (bool, optional): Save the model checkpoint
            after an exception occurs. Defaults to False.

    """

    def __init__(self, dir_path='',
                 file_format='model-{epoch:03d}-{train_loss:.6f}.pth',
                 max_saves=None,
                 period=1,
                 save_after_exception=False):
        if not (max_saves is None or max_saves > 0):
            raise ValueError("max_saves should be positive or 'None'")

        self.dir_path = dir_path
        self.file_format = file_format
        self.max_saves = max_saves
        self.saved_files_paths = []
        if self.dir_path:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            else:
                warnings.warn(f"Directory '{dir_path}' already exists")
        self.period = period
        self.save_after_exception = save_after_exception
        self.epochs_since_last_save = 0

    def save_model(self, state: State, file_path):
        """Save model to file.

        Override the method if you need custom checkpoint saving.

        Args:
            state (:class:`argus.engine.State`): State.
            file_path (str): Checkpoint file path.

        """
        state.model.save(file_path)

    def _format_file_path(self, state: State):
        format_state = {'epoch': state.epoch, **state.metrics}
        file_name = self.file_format.format(**format_state)
        file_path = os.path.join(self.dir_path, file_name)
        return file_path

    def start(self, state: State):
        self.epochs_since_last_save = 0
        self.saved_files_paths = []

    def save_checkpoint(self, state: State):
        self.epochs_since_last_save += 1
        if self.epochs_since_last_save >= self.period:
            self.epochs_since_last_save = 0
            file_path = self._format_file_path(state)
            self.save_model(state, file_path)
            self.saved_files_paths.append(file_path)

            if self.max_saves is not None:
                if len(self.saved_files_paths) > self.max_saves:
                    old_file_path = self.saved_files_paths.pop(0)
                    if os.path.exists(old_file_path):
                        os.remove(old_file_path)
                        state.logger.info(f"Model removed '{old_file_path}'")

    def epoch_complete(self, state: State):
        self.save_checkpoint(state)

    def catch_exception(self, state: State):
        if self.save_after_exception:
            exception_model_path = os.path.join(self.dir_path,
                                                'model-after-exception.pth')
            self.save_model(state, exception_model_path)
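# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the argus source). It
# assumes the usual argus workflow: an argus.Model subclass with nn_module,
# optimizer and loss attributes, and a fit() call that accepts callbacks.
# The tiny network, the random data, the params dict layout and the exact
# fit() keyword names are assumptions here; adapt them to your pipeline.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    import argus

    class Net(nn.Module):
        """Tiny regression net, only to make the sketch self-contained."""

        def __init__(self, in_features=8):
            super().__init__()
            self.fc = nn.Linear(in_features, 1)

        def forward(self, x):
            return self.fc(x)

    class RegressionModel(argus.Model):
        nn_module = Net
        optimizer = torch.optim.Adam
        loss = nn.MSELoss

    # Random data just to drive the training loop.
    x, y = torch.randn(256, 8), torch.randn(256, 1)
    train_loader = DataLoader(TensorDataset(x, y), batch_size=32)

    model = RegressionModel({'nn_module': {'in_features': 8},
                             'optimizer': {'lr': 1e-3},
                             'device': 'cpu'})

    # Save every second epoch, keeping only the three newest checkpoint
    # files; older ones are removed by save_checkpoint.
    checkpoint = Checkpoint(dir_path='checkpoints',
                            file_format='model-{epoch:03d}-{train_loss:.6f}.pth',
                            max_saves=3,
                            period=2)

    model.fit(train_loader, num_epochs=10, callbacks=[checkpoint])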
class MonitorCheckpoint(Checkpoint):
    """Save the model checkpoints after a metric is improved.

    The MonitorCheckpoint augments the simple Checkpoint with metric
    monitoring. It saves the model after the defined metric is improved.
    It is possible to monitor loss values during training as well as any
    metric available in the model State.

    Args:
        dir_path (str, optional): Directory to save checkpoints.
            The desired directory will be created if it does not exist.
            Defaults to ''.
        file_format (str, optional): Model saving filename format.
            Any valid value names from the model State may be used.
            Defaults to 'model-{epoch:03d}-{monitor:.6f}.pth'.
        max_saves (int, optional): Number of last saved models to keep.
            Should be positive. If None - save all models.
            Defaults to None.
        save_after_exception (bool, optional): Save the model checkpoint
            after an exception occurs. Defaults to False.
        monitor (str, optional): Metric name to monitor. It should be
            prepended with *val_* for the metric value on validation data
            and *train_* for the metric value on the data from the train
            loader. A val_loader should be provided during the model fit
            to make it possible to monitor metrics that start with *val_*.
            Defaults to *val_loss*.
        better (str, optional): The metric improvement criterion. Should
            be 'min', 'max' or 'auto'. 'auto' means the criterion should
            be taken from the metric itself, which is appropriate behavior
            in most cases. Defaults to 'auto'.

    """

    def __init__(self, dir_path='',
                 file_format='model-{epoch:03d}-{monitor:.6f}.pth',
                 max_saves=None,
                 save_after_exception=False,
                 monitor='val_loss',
                 better='auto'):
        if not monitor.startswith('val_') and not monitor.startswith('train_'):
            raise ValueError("monitor should be prepended with 'val_' or 'train_'")
        super().__init__(dir_path=dir_path,
                         file_format=file_format,
                         max_saves=max_saves,
                         period=1,
                         save_after_exception=save_after_exception)
        self.monitor = monitor
        self.better, self.better_comp, self.best_value = init_better(
            better, monitor)

    def _format_file_path(self, state: State):
        format_state = {'epoch': state.epoch,
                        'monitor': state.metrics[self.monitor],
                        **state.metrics}
        file_name = self.file_format.format(**format_state)
        file_path = os.path.join(self.dir_path, file_name)
        return file_path

    def start(self, state: State):
        self.best_value = math.inf if self.better == 'min' else -math.inf

    def epoch_complete(self, state: State):
        if self.monitor not in state.metrics:
            raise ValueError(f"Monitor '{self.monitor}' metric not found in state")
        current_value = state.metrics[self.monitor]
        if self.better_comp(current_value, self.best_value):
            self.best_value = current_value
            self.save_checkpoint(state)
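# ---------------------------------------------------------------------------
# Sketch (added for illustration, not part of the argus source): keeping only
# the best model by a monitored metric, and overriding save_model for custom
# serialization as the Checkpoint docstring suggests. The subclass name and
# the state.model.nn_module access are assumptions about the argus Model
# attributes; the fit() call at the end is commented out because it needs
# the model and loaders from your own pipeline.
# ---------------------------------------------------------------------------
import torch


class StateDictMonitorCheckpoint(MonitorCheckpoint):
    """Hypothetical override of ``save_model``: store only the weights of
    the underlying nn_module instead of the full argus model file."""

    def save_model(self, state: State, file_path):
        torch.save(state.model.nn_module.state_dict(), file_path)


# MonitorCheckpoint writes a file only when the monitored metric improves.
# Keeping a single best checkpoint by validation loss could look like this
# (the fit() keyword names follow the typical argus workflow):
#
# best_checkpoint = StateDictMonitorCheckpoint(
#     dir_path='checkpoints_best',
#     file_format='model-{epoch:03d}-{monitor:.6f}.pth',
#     max_saves=1,          # keep only the single best file
#     monitor='val_loss',   # requires a val_loader in fit
#     better='auto')        # 'min' is inferred for a loss metric
#
# model.fit(train_loader, val_loader=val_loader,
#           num_epochs=20, callbacks=[best_checkpoint])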