Hyunji

base trainer code

1 +"""trainer code"""
2 +import copy
3 +import logging
4 +import os
5 +from typing import List, Dict, Optional, Callable, Union
6 +
7 +import dill
8 +import numpy as np
9 +import torch
10 +from torch.utils.tensorboard import SummaryWriter
11 +
12 +from lib.utils.logging import loss_logger_helper
13 +
14 +logger = logging.getLogger()
15 +
16 +
class Trainer:
    # This is like skorch, but instead of callbacks we use class methods (looks less magical).
    # This is an evolving template.
    def __init__(
        self,
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler._LRScheduler,
        result_dir: Optional[str],
        statefile: Optional[str] = None,
        log_every: int = 100,
        save_strategy: Optional[List] = None,
        patience: int = 20,
        max_epoch: int = 100,
        gradient_norm_clip: float = -1,
        stopping_criteria_direction: str = "bigger",
        stopping_criteria: Optional[Union[str, Callable]] = "accuracy",
        evaluations=None,
        **kwargs,
    ):
        """
        stopping_criteria: can be a function, a string, or None. If a string, it should be
        "loss" or match one of the keys in aux_loss; if None, early stopping is not invoked.
        """
        super().__init__()

        self.result_dir = result_dir
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.evaluations = evaluations
        self.gradient_norm_clip = gradient_norm_clip

        # training state related params
        self.epoch = 0
        self.step = 0
        self.best_criteria = None
        self.best_epoch = -1

        # config related params
        self.log_every = log_every
        # default to an empty list so that membership checks in the hooks don't fail on None
        self.save_strategy = save_strategy if save_strategy is not None else []
        self.patience = patience
        self.max_epoch = max_epoch
        self.stopping_criteria_direction = stopping_criteria_direction
        self.stopping_criteria = stopping_criteria

        # TODO: should save config and see if things have changed?
        if statefile is not None:
            self.load(statefile)

        # init best model
        self.best_model = self.model.state_dict()

        # logging stuff
        if result_dir is not None:
            # we do not need to purge; purging can delete the validation result
            self.summary_writer = SummaryWriter(log_dir=result_dir)

    def load(self, fname: str) -> Dict:
        """
        fname: file name to load data from
        """

        data = torch.load(open(fname, "rb"), pickle_module=dill, map_location=self.model.device)

        if getattr(self, "model", None) and data.get("model") is not None:
            state_dict = self.model.state_dict()
            state_dict.update(data["model"])
            self.model.load_state_dict(state_dict)

        if getattr(self, "optimizer", None) and data.get("optimizer") is not None:
            optimizer_dict = self.optimizer.state_dict()
            optimizer_dict.update(data["optimizer"])
            self.optimizer.load_state_dict(optimizer_dict)

        if getattr(self, "scheduler", None) and data.get("scheduler") is not None:
            scheduler_dict = self.scheduler.state_dict()
            scheduler_dict.update(data["scheduler"])
            self.scheduler.load_state_dict(scheduler_dict)

        self.epoch = data["epoch"]
        self.step = data["step"]
        self.best_criteria = data["best_criteria"]
        self.best_epoch = data["best_epoch"]
        return data

    def save(self, fname: str, **kwargs):
        """
        fname: file name to save to
        kwargs: additional things we may want to save

        By default we save:
        - model,
        - optimizer,
        - scheduler (if present),
        - epoch,
        - step,
        - best_criteria,
        - best_epoch
        """
        # NOTE: the best model is kept in memory, but it is written out automatically depending
        # on the save strategy, so that it can be loaded outside of the training process.
        kwargs.update({
            "model"        : self.model.state_dict(),
            "optimizer"    : self.optimizer.state_dict(),
            "epoch"        : self.epoch,
            "step"         : self.step,
            "best_criteria": self.best_criteria,
            "best_epoch"   : self.best_epoch,
        })

        if self.scheduler is not None:
            kwargs.update({"scheduler": self.scheduler.state_dict()})

        torch.save(kwargs, open(fname, "wb"), pickle_module=dill)

    # TODO: allow extracting predictions
    def run_iteration(self, batch, training: bool = True, reduce: bool = True):
        """
        batch: batch of data, passed to the model as is
        training: True to backprop and update weights, False for evaluation only
        reduce: whether to reduce the loss to its mean or return the raw per-sample vector
        """
        pred = self.model(batch)
        loss, aux_loss = self.model.loss(pred, batch, reduce=reduce)

        if training:
            loss.backward()
            if self.gradient_norm_clip > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_norm_clip)
            self.optimizer.step()
            self.optimizer.zero_grad()

        return loss, aux_loss

    def compute_criteria(self, loss, aux_loss):
        stopping_criteria = self.stopping_criteria
        if stopping_criteria is None:
            return loss

        if callable(stopping_criteria):
            return stopping_criteria(loss, aux_loss)

        if stopping_criteria == "loss":
            return loss

        if aux_loss.get(stopping_criteria) is not None:
            return aux_loss[stopping_criteria]

        raise Exception(f"{stopping_criteria} not found")

    def train_batch(self, batch, *args, **kwargs):
        # This trains the batch
        loss, aux_loss = self.run_iteration(batch, training=True, reduce=True)
        loss_logger_helper(loss, aux_loss, writer=self.summary_writer, step=self.step,
                           epoch=self.epoch, log_every=self.log_every, string="train")

    def train_epoch(self, train_loader, *args, **kwargs):
        # This trains the epoch and also calls on_batch_begin and on_batch_end
        # before and after calling train_batch respectively
        self.model.train()
        for i, batch in enumerate(train_loader):
            self.on_batch_begin(i, batch, *args, **kwargs)
            self.train_batch(batch, *args, **kwargs)
            self.on_batch_end(i, batch, *args, **kwargs)
            self.step += 1
        self.model.eval()

    def on_train_begin(self, train_loader, valid_loader, *args, **kwargs):
        # This could be used to add things to the class object, like a scheduler etc.
        if "init" in self.save_strategy and self.epoch == 0:
            self.save(f"{self.result_dir}/init_model.pt")

    def on_epoch_begin(self, train_loader, valid_loader, *args, **kwargs):
        # This is called when an epoch begins
        pass

    def on_batch_begin(self, epoch_step, batch, *args, **kwargs):
        # This is called when a batch begins
        pass

    def on_train_end(self, train_loader, valid_loader, *args, **kwargs):
        # Called when training finishes. For the base trainer we just save the last model
        if "last" in self.save_strategy:
            logger.info("Saving the last model")
            self.save(f"{self.result_dir}/last_model.pt")

    def on_epoch_end(self, train_loader, valid_loader, *args, **kwargs):
        # Called when an epoch ends.
        # We run validation and step the scheduler here, check whether we have a new best
        # model, and save models as required by the save strategy.

        # run validation
        loss, aux_loss = self.validate(train_loader, valid_loader, *args, **kwargs)
        loss_logger_helper(loss, aux_loss, writer=self.summary_writer, step=self.step,
                           epoch=self.epoch, log_every=self.log_every, string="val",
                           force_print=True)

        # do scheduler step
        if self.scheduler is not None:
            prev_lr = [group['lr'] for group in self.optimizer.param_groups]
            if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                criteria = self.compute_criteria(loss, aux_loss)
                self.scheduler.step(criteria)
            else:
                self.scheduler.step()
            new_lr = [group['lr'] for group in self.optimizer.param_groups]

        # If you don't pass a stopping criteria, it won't be computed and the best model won't
        # be saved. Conversely, if you do pass one, the best model is tracked and saved.
        # Pass a large patience to effectively disable early stopping.
        if self.stopping_criteria is not None:
            criteria = self.compute_criteria(loss, aux_loss)

            if (
                (self.best_criteria is None)
                or (self.stopping_criteria_direction == "bigger" and self.best_criteria < criteria)
                or (self.stopping_criteria_direction == "lower" and self.best_criteria > criteria)
            ):
                self.best_criteria = criteria
                self.best_epoch = self.epoch
                self.best_model = copy.deepcopy(
                    {k: v.cpu() for k, v in self.model.state_dict().items()})

                if "best" in self.save_strategy:
                    logger.info(f"Saving best model at epoch {self.epoch}")
                    self.save(f"{self.result_dir}/best_model.pt")

        if "epoch" in self.save_strategy:
            logger.info(f"Saving model at epoch {self.epoch}")
            self.save(f"{self.result_dir}/{self.epoch}_model.pt")

        if "current" in self.save_strategy:
            logger.info(f"Saving model at epoch {self.epoch}")
            self.save(f"{self.result_dir}/current_model.pt")

        # logic to load the best model when the learning rate is reduced
        if self.scheduler is not None and not all(a == b for (a, b) in zip(prev_lr, new_lr)):
            if getattr(self.scheduler, 'load_on_reduce', None) == "best":
                logger.info(f"Loading best model at epoch {self.epoch}")
                # we want to preserve the scheduler
                old_lrs = list(map(lambda x: x['lr'], self.optimizer.param_groups))
                old_scheduler_dict = copy.deepcopy(self.scheduler.state_dict())

                best_model_path = None
                if os.path.exists(f"{self.result_dir}/best_model.pt"):
                    best_model_path = f"{self.result_dir}/best_model.pt"
                else:
                    d = "/".join(self.result_dir.split("/")[:-1])
                    for directory in os.listdir(d):
                        if os.path.exists(f"{d}/{directory}/best_model.pt"):
                            best_model_path = f"{d}/{directory}/best_model.pt"

                if best_model_path is None:
                    raise FileNotFoundError(
                        f"Best model not found in {self.result_dir}; please copy it here if it "
                        f"exists in another folder")

                self.load(best_model_path)
                # override the scheduler to keep the old one and also keep the reduced learning rates
                self.scheduler.load_state_dict(old_scheduler_dict)
                for idx, lr in enumerate(old_lrs):
                    self.optimizer.param_groups[idx]['lr'] = lr
                logger.info(f"Loaded best model and restarting from the end of epoch {self.epoch}")

    def on_batch_end(self, epoch_step, batch, *args, **kwargs):
        # called after a batch is trained
        pass

    def train(self, train_loader, valid_loader, *args, **kwargs):

        self.on_train_begin(train_loader, valid_loader, *args, **kwargs)
        while self.epoch < self.max_epoch:
            # NOTE: incrementing the epoch here (before the epoch runs) is more convenient:
            # we don't need to add 1 just before saving a model, and if we load a model saved
            # at the end of epoch e, we resume cleanly at epoch e + 1.
            self.epoch += 1
            self.on_epoch_begin(train_loader, valid_loader, *args, **kwargs)
            logger.info(f"Starting epoch {self.epoch}")
            self.train_epoch(train_loader, *args, **kwargs)
            self.on_epoch_end(train_loader, valid_loader, *args, **kwargs)

            if self.epoch - self.best_epoch > self.patience:
                logger.info(f"Patience reached; stopping training after {self.epoch} epochs")
                break

        self.on_train_end(train_loader, valid_loader, *args, **kwargs)

    def validate(self, train_loader, valid_loader, *args, **kwargs):
        """
        we expect validate to return the mean loss and other aux losses that we want to log
        """
        losses = []
        aux_losses = {}

        self.model.eval()
        with torch.no_grad():
            for i, batch in enumerate(valid_loader):
                loss, aux_loss = self.run_iteration(batch, training=False, reduce=False)
                losses.extend(loss.cpu().tolist())

                if i == 0:
                    for k, v in aux_loss.items():
                        # when we can't return sample-wise statistics, we need to do this
                        if len(v.shape) == 0:
                            aux_losses[k] = [v.cpu().tolist()]
                        else:
                            aux_losses[k] = v.cpu().tolist()
                else:
                    for k, v in aux_loss.items():
                        if len(v.shape) == 0:
                            aux_losses[k].append(v.cpu().tolist())
                        else:
                            aux_losses[k].extend(v.cpu().tolist())
        return np.mean(losses), {k: np.mean(v) for (k, v) in aux_losses.items()}

    def test(self, train_loader, test_loader, *args, **kwargs):
        return self.validate(train_loader, test_loader, *args, **kwargs)
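
For context, here is a minimal sketch of how this Trainer might be wired up. It assumes the Trainer above (and its lib.utils.logging.loss_logger_helper dependency) is importable, and a model that follows the interface the trainer expects: forward(batch) returns predictions, loss(pred, batch, reduce=...) returns a loss plus a dict of auxiliary metrics, and the model exposes a device attribute (used by load()). The ToyModel class, the random data, and the "accuracy" key are purely illustrative assumptions, not part of the trainer itself.

# Hypothetical usage sketch; ToyModel, the data, and the "accuracy" key are assumptions.
# Assumes the Trainer class above is defined in (or imported into) this module.
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


class ToyModel(torch.nn.Module):
    # Minimal model matching the interface the Trainer assumes:
    # forward(batch) -> predictions, loss(pred, batch, reduce) -> (loss, aux_loss dict),
    # plus a `device` attribute used by Trainer.load().
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(10, 2)
        self.device = "cpu"

    def forward(self, batch):
        x, _ = batch
        return self.fc(x.to(self.device))

    def loss(self, pred, batch, reduce=True):
        _, y = batch
        y = y.to(self.device)
        loss = F.cross_entropy(pred, y, reduction="mean" if reduce else "none")
        accuracy = (pred.argmax(dim=-1) == y).float().mean()
        return loss, {"accuracy": accuracy}


model = ToyModel()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

data = TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
train_loader = DataLoader(data, batch_size=16)
valid_loader = DataLoader(data, batch_size=16)

trainer = Trainer(
    model, optimizer, scheduler,
    result_dir="toy_run_results",        # SummaryWriter will create this directory
    save_strategy=["best", "last"],
    stopping_criteria="accuracy",        # must be "loss" or a key returned in aux_loss
    stopping_criteria_direction="bigger",
    max_epoch=5,
)
trainer.train(train_loader, valid_loader)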