Showing 11 changed files with 1028 additions and 381 deletions
... | @@ -68,6 +68,7 @@ def main(args): | ... | @@ -68,6 +68,7 @@ def main(args): |
68 | ) | 68 | ) |
69 | print(commit_message) | 69 | print(commit_message) |
70 | 70 | ||
71 | + | ||
71 | if __name__ == "__main__": | 72 | if __name__ == "__main__": |
72 | parser = argparse.ArgumentParser(description="Code to collect commits on github") | 73 | parser = argparse.ArgumentParser(description="Code to collect commits on github") |
73 | parser.add_argument( | 74 | parser.add_argument( | ... | ... |
1 | # Copyright 2020-present Tae Hwan Jung | 1 | # Copyright 2020-present Tae Hwan Jung |
2 | -# | 2 | +# |
3 | # Licensed under the Apache License, Version 2.0 (the "License"); | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
4 | # you may not use this file except in compliance with the License. | 4 | # you may not use this file except in compliance with the License. |
5 | # You may obtain a copy of the License at | 5 | # You may obtain a copy of the License at |
6 | -# | 6 | +# |
7 | # http://www.apache.org/licenses/LICENSE-2.0 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
8 | -# | 8 | +# |
9 | # Unless required by applicable law or agreed to in writing, software | 9 | # Unless required by applicable law or agreed to in writing, software |
10 | # distributed under the License is distributed on an "AS IS" BASIS, | 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
... | @@ -15,6 +15,6 @@ | ... | @@ -15,6 +15,6 @@ |
15 | from .gitcommit import diff_parse, truncate | 15 | from .gitcommit import diff_parse, truncate |
16 | 16 | ||
17 | __all__ = [ | 17 | __all__ = [ |
18 | - 'diff_parse', | ||
19 | - 'truncate', | ||
20 | -] | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
18 | + "diff_parse", | ||
19 | + "truncate", | ||
20 | +] | ... | ... |
... | @@ -36,9 +36,11 @@ logging.basicConfig( | ... | @@ -36,9 +36,11 @@ logging.basicConfig( |
36 | level=logging.INFO, | 36 | level=logging.INFO, |
37 | ) | 37 | ) |
38 | 38 | ||
39 | + | ||
39 | class PATCH(enum.Enum): | 40 | class PATCH(enum.Enum): |
40 | - PLUS=1 | 41 | + PLUS = 1 |
41 | - MINUS=2 | 42 | + MINUS = 2 |
43 | + | ||
42 | 44 | ||
43 | def truncate(tuple, max_length, value=0): | 45 | def truncate(tuple, max_length, value=0): |
44 | ls = [] | 46 | ls = [] |
... | @@ -46,22 +48,20 @@ def truncate(tuple, max_length, value=0): | ... | @@ -46,22 +48,20 @@ def truncate(tuple, max_length, value=0): |
46 | if isinstance(t, int): | 48 | if isinstance(t, int): |
47 | t = [t] | 49 | t = [t] |
48 | ls.extend(t) | 50 | ls.extend(t) |
49 | - ls = ls[:max_length - 1] | 51 | + ls = ls[: max_length - 1] |
50 | ls.insert(0, value) | 52 | ls.insert(0, value) |
51 | if len(ls) < max_length: | 53 | if len(ls) < max_length: |
52 | ls.extend([0] * (max_length - len(ls))) | 54 | ls.extend([0] * (max_length - len(ls))) |
53 | assert len(ls) == max_length | 55 | assert len(ls) == max_length |
54 | return ls | 56 | return ls |
55 | 57 | ||
58 | + | ||
56 | def encode_line(tokenizer, line, patch): | 59 | def encode_line(tokenizer, line, patch): |
57 | - line = re.sub(r'[\u0100-\uFFFF\U00010000-\U0010FFFF]+', '', line).strip() | 60 | + line = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", line).strip() |
58 | tokens = tokenizer.tokenize(line) | 61 | tokens = tokenizer.tokenize(line) |
59 | tokens = tokenizer.convert_tokens_to_ids(tokens) | 62 | tokens = tokenizer.convert_tokens_to_ids(tokens) |
60 | - return ( | 63 | + return (tokens, [1] * len(tokens), len(tokens) * [patch.value]) |
61 | - tokens, | 64 | + |
62 | - [1] * len(tokens), | ||
63 | - len(tokens) * [patch.value] | ||
64 | - ) | ||
65 | 65 | ||
66 | def diff_parse(diff, tokenizer): | 66 | def diff_parse(diff, tokenizer): |
67 | chunks = [] | 67 | chunks = [] |
... | @@ -78,6 +78,7 @@ def diff_parse(diff, tokenizer): | ... | @@ -78,6 +78,7 @@ def diff_parse(diff, tokenizer): |
78 | chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS)) | 78 | chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS)) |
79 | return chunks | 79 | return chunks |
80 | 80 | ||
81 | + | ||
81 | def sha_parse(sha, tokenizer, max_length=1024): | 82 | def sha_parse(sha, tokenizer, max_length=1024): |
82 | 83 | ||
83 | chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer) | 84 | chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer) |
... | @@ -91,16 +92,18 @@ def sha_parse(sha, tokenizer, max_length=1024): | ... | @@ -91,16 +92,18 @@ def sha_parse(sha, tokenizer, max_length=1024): |
91 | 92 | ||
92 | return (input_ids, attention_masks, patch_ids) | 93 | return (input_ids, attention_masks, patch_ids) |
93 | 94 | ||
95 | + | ||
94 | def message_parse(msg, tokenizer, max_length=56): | 96 | def message_parse(msg, tokenizer, max_length=56): |
95 | - msg = re.sub(r'(\(|)#([0-9])+(\)|)', '', msg) | 97 | + msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg) |
96 | 98 | ||
97 | - msg = re.sub(r'[\u0100-\uFFFF\U00010000-\U0010FFFF]+', '', msg).strip() | 99 | + msg = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", msg).strip() |
98 | msg = tokenizer.tokenize(msg) | 100 | msg = tokenizer.tokenize(msg) |
99 | msg = tokenizer.convert_tokens_to_ids(msg) | 101 | msg = tokenizer.convert_tokens_to_ids(msg) |
100 | msg = truncate(msg, max_length, value=0) | 102 | msg = truncate(msg, max_length, value=0) |
101 | 103 | ||
102 | return msg | 104 | return msg |
103 | 105 | ||
106 | + | ||
104 | def jobs(sha_msgs, args, data_config, train=True): | 107 | def jobs(sha_msgs, args, data_config, train=True): |
105 | 108 | ||
106 | input_ids, attention_masks, patch_ids, targets = [], [], [], [] | 109 | input_ids, attention_masks, patch_ids, targets = [], [], [], [] |
... | @@ -110,9 +113,7 @@ def jobs(sha_msgs, args, data_config, train=True): | ... | @@ -110,9 +113,7 @@ def jobs(sha_msgs, args, data_config, train=True): |
110 | sha, msg = sha_msg | 113 | sha, msg = sha_msg |
111 | 114 | ||
112 | source = sha_parse( | 115 | source = sha_parse( |
113 | - sha, | 116 | + sha, tokenizer=args.tokenizer, max_length=args.max_source_length |
114 | - tokenizer=args.tokenizer, | ||
115 | - max_length=args.max_source_length | ||
116 | ) | 117 | ) |
117 | if not source: | 118 | if not source: |
118 | continue | 119 | continue |
... | @@ -120,7 +121,9 @@ def jobs(sha_msgs, args, data_config, train=True): | ... | @@ -120,7 +121,9 @@ def jobs(sha_msgs, args, data_config, train=True): |
120 | target = message_parse( | 121 | target = message_parse( |
121 | msg, | 122 | msg, |
122 | tokenizer=args.tokenizer, | 123 | tokenizer=args.tokenizer, |
123 | - max_length=(args.max_target_length if train else args.val_max_target_length), | 124 | + max_length=( |
125 | + args.max_target_length if train else args.val_max_target_length | ||
126 | + ), | ||
124 | ) | 127 | ) |
125 | 128 | ||
126 | input_ids.append(input_id) | 129 | input_ids.append(input_id) |
... | @@ -128,14 +131,17 @@ def jobs(sha_msgs, args, data_config, train=True): | ... | @@ -128,14 +131,17 @@ def jobs(sha_msgs, args, data_config, train=True): |
128 | patch_ids.append(patch_id) | 131 | patch_ids.append(patch_id) |
129 | targets.append(target) | 132 | targets.append(target) |
130 | 133 | ||
131 | - data_saver({ | 134 | + data_saver( |
132 | - "input_ids": np.asarray(input_ids), | 135 | + { |
133 | - "attention_masks": np.asarray(attention_masks), | 136 | + "input_ids": np.asarray(input_ids), |
134 | - "patch_ids": np.asarray(patch_ids), | 137 | + "attention_masks": np.asarray(attention_masks), |
135 | - "targets": np.asarray(targets), | 138 | + "patch_ids": np.asarray(patch_ids), |
136 | - }) | 139 | + "targets": np.asarray(targets), |
140 | + } | ||
141 | + ) | ||
137 | data_saver.disconnect() | 142 | data_saver.disconnect() |
138 | 143 | ||
144 | + | ||
139 | def start(chunked_sha_msgs, train=True): | 145 | def start(chunked_sha_msgs, train=True): |
140 | 146 | ||
141 | logger.info(f"Start %s pre-processing" % ("training" if train else "evaluation")) | 147 | logger.info(f"Start %s pre-processing" % ("training" if train else "evaluation")) |
... | @@ -144,22 +150,22 @@ def start(chunked_sha_msgs, train=True): | ... | @@ -144,22 +150,22 @@ def start(chunked_sha_msgs, train=True): |
144 | 150 | ||
145 | data_config = DataConfig( | 151 | data_config = DataConfig( |
146 | endpoint=args.endpoint, | 152 | endpoint=args.endpoint, |
147 | - access_key=os.environ['access_key'], | 153 | + access_key=os.environ["access_key"], |
148 | - secret_key=os.environ['secret_key'], | 154 | + secret_key=os.environ["secret_key"], |
149 | region=args.region, | 155 | region=args.region, |
150 | - dataset_name='commit-autosuggestions', | 156 | + dataset_name="commit-autosuggestions", |
151 | additional={ | 157 | additional={ |
152 | - "mode" : ("training" if train else "evaluation"), | 158 | + "mode": ("training" if train else "evaluation"), |
153 | "max_source_length": args.max_source_length, | 159 | "max_source_length": args.max_source_length, |
154 | "max_target_length": max_target_length, | 160 | "max_target_length": max_target_length, |
155 | - "url" : args.url, | 161 | + "url": args.url, |
156 | }, | 162 | }, |
157 | attributes=[ | 163 | attributes=[ |
158 | - ('input_ids', 'int32', (args.max_source_length,)), | 164 | + ("input_ids", "int32", (args.max_source_length,)), |
159 | - ('attention_masks', 'int32', (args.max_source_length,)), | 165 | + ("attention_masks", "int32", (args.max_source_length,)), |
160 | - ('patch_ids', 'int32', (args.max_source_length,)), | 166 | + ("patch_ids", "int32", (args.max_source_length,)), |
161 | - ('targets', 'int32', (max_target_length,)) | 167 | + ("targets", "int32", (max_target_length,)), |
162 | - ] | 168 | + ], |
163 | ) | 169 | ) |
164 | 170 | ||
165 | func = partial(jobs, args=args, data_config=data_config, train=train) | 171 | func = partial(jobs, args=args, data_config=data_config, train=train) |
... | @@ -168,14 +174,15 @@ def start(chunked_sha_msgs, train=True): | ... | @@ -168,14 +174,15 @@ def start(chunked_sha_msgs, train=True): |
168 | for i, _ in tqdm(enumerate(pool.imap_unordered(func, chunked_sha_msgs))): | 174 | for i, _ in tqdm(enumerate(pool.imap_unordered(func, chunked_sha_msgs))): |
169 | pbar.update() | 175 | pbar.update() |
170 | 176 | ||
177 | + | ||
171 | def main(args): | 178 | def main(args): |
172 | - if 'access_key' not in os.environ or 'secret_key' not in os.environ: | 179 | + if "access_key" not in os.environ or "secret_key" not in os.environ: |
173 | raise OSError("access_key or secret_key are not found.") | 180 | raise OSError("access_key or secret_key are not found.") |
174 | 181 | ||
175 | sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()] | 182 | sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()] |
176 | random.shuffle(sha_msgs) | 183 | random.shuffle(sha_msgs) |
177 | chunked_sha_msgs = [ | 184 | chunked_sha_msgs = [ |
178 | - sha_msgs[x:x + args.matorage_batch] | 185 | + sha_msgs[x : x + args.matorage_batch] |
179 | for x in range(0, len(sha_msgs), args.matorage_batch) | 186 | for x in range(0, len(sha_msgs), args.matorage_batch) |
180 | ] | 187 | ] |
181 | 188 | ||
... | @@ -185,29 +192,25 @@ def main(args): | ... | @@ -185,29 +192,25 @@ def main(args): |
185 | if args.do_predict: | 192 | if args.do_predict: |
186 | start(chunked_sha_msgs[barrier:], train=False) | 193 | start(chunked_sha_msgs[barrier:], train=False) |
187 | 194 | ||
195 | + | ||
188 | if __name__ == "__main__": | 196 | if __name__ == "__main__": |
189 | parser = argparse.ArgumentParser(description="Code to collect commits on github") | 197 | parser = argparse.ArgumentParser(description="Code to collect commits on github") |
190 | - parser.add_argument( | 198 | + parser.add_argument("--url", type=str, required=True, help="github url") |
191 | - "--url", | ||
192 | - type=str, | ||
193 | - required=True, | ||
194 | - help="github url" | ||
195 | - ) | ||
196 | parser.add_argument( | 199 | parser.add_argument( |
197 | "--endpoint", | 200 | "--endpoint", |
198 | type=str, | 201 | type=str, |
199 | required=True, | 202 | required=True, |
200 | - help='matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html' | 203 | + help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", |
201 | ) | 204 | ) |
202 | parser.add_argument( | 205 | parser.add_argument( |
203 | "--region", | 206 | "--region", |
204 | type=str, | 207 | type=str, |
205 | default=None, | 208 | default=None, |
206 | - help='matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html' | 209 | + help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", |
207 | ) | 210 | ) |
208 | parser.add_argument( | 211 | parser.add_argument( |
209 | "--tokenizer_name", | 212 | "--tokenizer_name", |
210 | - default='sshleifer/distilbart-xsum-6-6', | 213 | + default="sshleifer/distilbart-xsum-6-6", |
211 | type=str, | 214 | type=str, |
212 | help="Pretrained tokenizer name or path if not the same as model_name", | 215 | help="Pretrained tokenizer name or path if not the same as model_name", |
213 | ) | 216 | ) |
... | @@ -215,41 +218,40 @@ if __name__ == "__main__": | ... | @@ -215,41 +218,40 @@ if __name__ == "__main__": |
215 | "--matorage_batch", | 218 | "--matorage_batch", |
216 | default=1024, | 219 | default=1024, |
217 | type=int, | 220 | type=int, |
218 | - help='The smallest batch size stored atomically in matorage.' | 221 | + help="The smallest batch size stored atomically in matorage.", |
219 | ) | 222 | ) |
220 | parser.add_argument( | 223 | parser.add_argument( |
221 | - "--num_workers", | 224 | + "--num_workers", default=4, type=int, help="number of process", |
222 | - default=4, | ||
223 | - type=int, | ||
224 | - help="number of process", | ||
225 | ) | 225 | ) |
226 | parser.add_argument( | 226 | parser.add_argument( |
227 | "--max_source_length", | 227 | "--max_source_length", |
228 | default=1024, | 228 | default=1024, |
229 | type=int, | 229 | type=int, |
230 | help="The maximum total input sequence length after tokenization. Sequences longer " | 230 | help="The maximum total input sequence length after tokenization. Sequences longer " |
231 | - "than this will be truncated, sequences shorter will be padded.", | 231 | + "than this will be truncated, sequences shorter will be padded.", |
232 | ) | 232 | ) |
233 | parser.add_argument( | 233 | parser.add_argument( |
234 | "--max_target_length", | 234 | "--max_target_length", |
235 | default=56, | 235 | default=56, |
236 | type=int, | 236 | type=int, |
237 | help="The maximum total input sequence length after tokenization. Sequences longer " | 237 | help="The maximum total input sequence length after tokenization. Sequences longer " |
238 | - "than this will be truncated, sequences shorter will be padded.", | 238 | + "than this will be truncated, sequences shorter will be padded.", |
239 | ) | 239 | ) |
240 | parser.add_argument( | 240 | parser.add_argument( |
241 | "--val_max_target_length", | 241 | "--val_max_target_length", |
242 | default=142, # these defaults are optimized for CNNDM. For xsum, see README.md. | 242 | default=142, # these defaults are optimized for CNNDM. For xsum, see README.md. |
243 | type=int, | 243 | type=int, |
244 | help="The maximum total input sequence length after tokenization. Sequences longer " | 244 | help="The maximum total input sequence length after tokenization. Sequences longer " |
245 | - "than this will be truncated, sequences shorter will be padded.", | 245 | + "than this will be truncated, sequences shorter will be padded.", |
246 | + ) | ||
247 | + parser.add_argument( | ||
248 | + "--p_val", type=float, default=0.25, help="percent of validation dataset" | ||
246 | ) | 249 | ) |
247 | - parser.add_argument("--p_val", type=float, default=0.25, help="percent of validation dataset") | ||
248 | parser.add_argument("--do_train", action="store_true", default=False) | 250 | parser.add_argument("--do_train", action="store_true", default=False) |
249 | parser.add_argument("--do_predict", action="store_true", default=False) | 251 | parser.add_argument("--do_predict", action="store_true", default=False) |
250 | args = parser.parse_args() | 252 | args = parser.parse_args() |
251 | 253 | ||
252 | - args.local_path = args.url.split('/')[-1] | 254 | + args.local_path = args.url.split("/")[-1] |
253 | logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}") | 255 | logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}") |
254 | repo = ( | 256 | repo = ( |
255 | Repo(args.local_path) | 257 | Repo(args.local_path) | ... | ... |
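For orientation, the gitcommit.py helpers reformatted above turn a single commit into fixed-length model inputs: encode_line returns a (token_ids, attention_mask, patch_ids) triple for one changed line, diff_parse collects those triples per commit, and truncate prepends a start value and pads or trims each list to max_length. The sketch below restates truncate as it appears in the hunk (the "for t in tuple:" loop header falls outside the shown context and is assumed) and calls it on hypothetical token ids.

def truncate(tuple, max_length, value=0):  # parameter name shadows the builtin, as in the source
    ls = []
    for t in tuple:  # assumed loop header; each element is an int or a list of ints
        if isinstance(t, int):
            t = [t]
        ls.extend(t)
    ls = ls[: max_length - 1]  # leave room for the leading `value`
    ls.insert(0, value)
    if len(ls) < max_length:  # right-pad with zeros up to max_length
        ls.extend([0] * (max_length - len(ls)))
    assert len(ls) == max_length
    return ls

# Hypothetical token ids: two encoded diff lines plus a stray int.
print(truncate([[101, 7, 8], [9, 10], 11], max_length=8))
# -> [0, 101, 7, 8, 9, 10, 11, 0]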
1 | # Copyright 2020-present Tae Hwan Jung | 1 | # Copyright 2020-present Tae Hwan Jung |
2 | -# | 2 | +# |
3 | # Licensed under the Apache License, Version 2.0 (the "License"); | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
4 | # you may not use this file except in compliance with the License. | 4 | # you may not use this file except in compliance with the License. |
5 | # You may obtain a copy of the License at | 5 | # You may obtain a copy of the License at |
6 | -# | 6 | +# |
7 | # http://www.apache.org/licenses/LICENSE-2.0 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
8 | -# | 8 | +# |
9 | # Unless required by applicable law or agreed to in writing, software | 9 | # Unless required by applicable law or agreed to in writing, software |
10 | # distributed under the License is distributed on an "AS IS" BASIS, | 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
... | @@ -14,6 +14,4 @@ | ... | @@ -14,6 +14,4 @@ |
14 | 14 | ||
15 | from .modeling_bart import BartForConditionalGeneration | 15 | from .modeling_bart import BartForConditionalGeneration |
16 | 16 | ||
17 | -__all__ = [ | ||
18 | - 'BartForConditionalGeneration' | ||
19 | -] | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
17 | +__all__ = ["BartForConditionalGeneration"] | ... | ... |
... | @@ -20,16 +20,31 @@ logger = logging.getLogger(__name__) | ... | @@ -20,16 +20,31 @@ logger = logging.getLogger(__name__) |
20 | 20 | ||
21 | class Seq2SeqLoggingCallback(pl.Callback): | 21 | class Seq2SeqLoggingCallback(pl.Callback): |
22 | def on_batch_end(self, trainer, pl_module): | 22 | def on_batch_end(self, trainer, pl_module): |
23 | - lrs = {f"lr_group_{i}": param["lr"] for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)} | 23 | + lrs = { |
24 | + f"lr_group_{i}": param["lr"] | ||
25 | + for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups) | ||
26 | + } | ||
24 | pl_module.logger.log_metrics(lrs) | 27 | pl_module.logger.log_metrics(lrs) |
25 | 28 | ||
26 | @rank_zero_only | 29 | @rank_zero_only |
27 | def _write_logs( | 30 | def _write_logs( |
28 | - self, trainer: pl.Trainer, pl_module: pl.LightningModule, type_path: str, save_generations=True | 31 | + self, |
32 | + trainer: pl.Trainer, | ||
33 | + pl_module: pl.LightningModule, | ||
34 | + type_path: str, | ||
35 | + save_generations=True, | ||
29 | ) -> None: | 36 | ) -> None: |
30 | - logger.info(f"***** {type_path} results at step {trainer.global_step:05d} *****") | 37 | + logger.info( |
38 | + f"***** {type_path} results at step {trainer.global_step:05d} *****" | ||
39 | + ) | ||
31 | metrics = trainer.callback_metrics | 40 | metrics = trainer.callback_metrics |
32 | - trainer.logger.log_metrics({k: v for k, v in metrics.items() if k not in ["log", "progress_bar", "preds"]}) | 41 | + trainer.logger.log_metrics( |
42 | + { | ||
43 | + k: v | ||
44 | + for k, v in metrics.items() | ||
45 | + if k not in ["log", "progress_bar", "preds"] | ||
46 | + } | ||
47 | + ) | ||
33 | # Log results | 48 | # Log results |
34 | od = Path(pl_module.hparams.output_dir) | 49 | od = Path(pl_module.hparams.output_dir) |
35 | if type_path == "test": | 50 | if type_path == "test": |
... | @@ -39,7 +54,9 @@ class Seq2SeqLoggingCallback(pl.Callback): | ... | @@ -39,7 +54,9 @@ class Seq2SeqLoggingCallback(pl.Callback): |
39 | # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json | 54 | # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json |
40 | # If people want this it will be easy enough to add back. | 55 | # If people want this it will be easy enough to add back. |
41 | results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt" | 56 | results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt" |
42 | - generations_file = od / f"{type_path}_generations/{trainer.global_step:05d}.txt" | 57 | + generations_file = ( |
58 | + od / f"{type_path}_generations/{trainer.global_step:05d}.txt" | ||
59 | + ) | ||
43 | results_file.parent.mkdir(exist_ok=True) | 60 | results_file.parent.mkdir(exist_ok=True) |
44 | generations_file.parent.mkdir(exist_ok=True) | 61 | generations_file.parent.mkdir(exist_ok=True) |
45 | with open(results_file, "a+") as writer: | 62 | with open(results_file, "a+") as writer: |
... | @@ -68,7 +85,9 @@ class Seq2SeqLoggingCallback(pl.Callback): | ... | @@ -68,7 +85,9 @@ class Seq2SeqLoggingCallback(pl.Callback): |
68 | 85 | ||
69 | n_trainable_pars = count_trainable_parameters(pl_module) | 86 | n_trainable_pars = count_trainable_parameters(pl_module) |
70 | # mp stands for million parameters | 87 | # mp stands for million parameters |
71 | - trainer.logger.log_metrics({"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6}) | 88 | + trainer.logger.log_metrics( |
89 | + {"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6} | ||
90 | + ) | ||
72 | 91 | ||
73 | @rank_zero_only | 92 | @rank_zero_only |
74 | def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): | 93 | def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): |
... | @@ -98,8 +117,5 @@ def get_checkpoint_callback(output_dir, metric): | ... | @@ -98,8 +117,5 @@ def get_checkpoint_callback(output_dir, metric): |
98 | 117 | ||
99 | def get_early_stopping_callback(metric, patience): | 118 | def get_early_stopping_callback(metric, patience): |
100 | return EarlyStopping( | 119 | return EarlyStopping( |
101 | - monitor=f"val_{metric}", | 120 | + monitor=f"val_{metric}", mode="max", patience=patience, verbose=True, |
102 | - mode="max", | ||
103 | - patience=patience, | ||
104 | - verbose=True, | ||
105 | ) | 121 | ) | ... | ... |
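The callbacks.py hunks are pure reflow; get_early_stopping_callback still builds a pytorch_lightning EarlyStopping that watches val_{metric} with mode="max". A minimal sketch of the equivalent construction, with a hypothetical metric name; note the monitored key has to match the f"{prefix}_{self.val_metric}" entry that validation_epoch_end returns in train.py below.

from pytorch_lightning.callbacks import EarlyStopping

# Equivalent to get_early_stopping_callback("rouge2", patience=3) in the hunk above;
# "rouge2" is a hypothetical choice of validation metric.
early_stop = EarlyStopping(monitor="val_rouge2", mode="max", patience=3, verbose=True)
print(early_stop.monitor)  # "val_rouge2"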
... | @@ -21,7 +21,11 @@ from matorage.torch import Dataset | ... | @@ -21,7 +21,11 @@ from matorage.torch import Dataset |
21 | 21 | ||
22 | 22 | ||
23 | try: | 23 | try: |
24 | - from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback | 24 | + from .callbacks import ( |
25 | + Seq2SeqLoggingCallback, | ||
26 | + get_checkpoint_callback, | ||
27 | + get_early_stopping_callback, | ||
28 | + ) | ||
25 | from .utils import ( | 29 | from .utils import ( |
26 | ROUGE_KEYS, | 30 | ROUGE_KEYS, |
27 | LegacySeq2SeqDataset, | 31 | LegacySeq2SeqDataset, |
... | @@ -40,7 +44,11 @@ try: | ... | @@ -40,7 +44,11 @@ try: |
40 | use_task_specific_params, | 44 | use_task_specific_params, |
41 | ) | 45 | ) |
42 | except ImportError: | 46 | except ImportError: |
43 | - from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback | 47 | + from callbacks import ( |
48 | + Seq2SeqLoggingCallback, | ||
49 | + get_checkpoint_callback, | ||
50 | + get_early_stopping_callback, | ||
51 | + ) | ||
44 | from utils import ( | 52 | from utils import ( |
45 | ROUGE_KEYS, | 53 | ROUGE_KEYS, |
46 | LegacySeq2SeqDataset, | 54 | LegacySeq2SeqDataset, |
... | @@ -83,8 +91,12 @@ class SummarizationModule(BaseTransformer): | ... | @@ -83,8 +91,12 @@ class SummarizationModule(BaseTransformer): |
83 | "val": self.hparams.val_max_target_length, | 91 | "val": self.hparams.val_max_target_length, |
84 | "test": self.hparams.test_max_target_length, | 92 | "test": self.hparams.test_max_target_length, |
85 | } | 93 | } |
86 | - assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}" | 94 | + assert ( |
87 | - assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}" | 95 | + self.target_lens["train"] <= self.target_lens["val"] |
96 | + ), f"target_lens: {self.target_lens}" | ||
97 | + assert ( | ||
98 | + self.target_lens["train"] <= self.target_lens["test"] | ||
99 | + ), f"target_lens: {self.target_lens}" | ||
88 | 100 | ||
89 | if self.hparams.freeze_embeds: | 101 | if self.hparams.freeze_embeds: |
90 | self.freeze_embeds() | 102 | self.freeze_embeds() |
... | @@ -95,13 +107,27 @@ class SummarizationModule(BaseTransformer): | ... | @@ -95,13 +107,27 @@ class SummarizationModule(BaseTransformer): |
95 | self.hparams.git_sha = get_git_info()["repo_sha"] | 107 | self.hparams.git_sha = get_git_info()["repo_sha"] |
96 | self.num_workers = hparams.num_workers | 108 | self.num_workers = hparams.num_workers |
97 | self.decoder_start_token_id = None # default to config | 109 | self.decoder_start_token_id = None # default to config |
98 | - if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer): | 110 | + if self.model.config.decoder_start_token_id is None and isinstance( |
99 | - self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang] | 111 | + self.tokenizer, MBartTokenizer |
112 | + ): | ||
113 | + self.decoder_start_token_id = self.tokenizer.lang_code_to_id[ | ||
114 | + hparams.tgt_lang | ||
115 | + ] | ||
100 | self.model.config.decoder_start_token_id = self.decoder_start_token_id | 116 | self.model.config.decoder_start_token_id = self.decoder_start_token_id |
101 | 117 | ||
102 | - self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams | 118 | + self.eval_beams = ( |
103 | - assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer > 1" | 119 | + self.model.config.num_beams |
104 | - self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric | 120 | + if self.hparams.eval_beams is None |
121 | + else self.hparams.eval_beams | ||
122 | + ) | ||
123 | + assert ( | ||
124 | + self.eval_beams >= 1 | ||
125 | + ), f"got self.eval_beams={self.eval_beams}. Need an integer > 1" | ||
126 | + self.val_metric = ( | ||
127 | + self.default_val_metric | ||
128 | + if self.hparams.val_metric is None | ||
129 | + else self.hparams.val_metric | ||
130 | + ) | ||
105 | 131 | ||
106 | def freeze_embeds(self): | 132 | def freeze_embeds(self): |
107 | """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5.""" | 133 | """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5.""" |
... | @@ -133,7 +159,13 @@ class SummarizationModule(BaseTransformer): | ... | @@ -133,7 +159,13 @@ class SummarizationModule(BaseTransformer): |
133 | else: | 159 | else: |
134 | decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id) | 160 | decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id) |
135 | 161 | ||
136 | - outputs = self(src_ids, src_patch, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False) | 162 | + outputs = self( |
163 | + src_ids, | ||
164 | + src_patch, | ||
165 | + attention_mask=src_mask, | ||
166 | + decoder_input_ids=decoder_input_ids, | ||
167 | + use_cache=False, | ||
168 | + ) | ||
137 | lm_logits = outputs[0] | 169 | lm_logits = outputs[0] |
138 | if self.hparams.label_smoothing == 0: | 170 | if self.hparams.label_smoothing == 0: |
139 | # Same behavior as modeling_bart.py, besides ignoring pad_token_id | 171 | # Same behavior as modeling_bart.py, besides ignoring pad_token_id |
... | @@ -157,7 +189,9 @@ class SummarizationModule(BaseTransformer): | ... | @@ -157,7 +189,9 @@ class SummarizationModule(BaseTransformer): |
157 | 189 | ||
158 | logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} | 190 | logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} |
159 | # tokens per batch | 191 | # tokens per batch |
160 | - logs["tpb"] = batch[0].long().ne(self.pad).sum() + batch[3].long().ne(self.pad).sum() | 192 | + logs["tpb"] = ( |
193 | + batch[0].long().ne(self.pad).sum() + batch[3].long().ne(self.pad).sum() | ||
194 | + ) | ||
161 | return {"loss": loss_tensors[0], "log": logs} | 195 | return {"loss": loss_tensors[0], "log": logs} |
162 | 196 | ||
163 | def validation_step(self, batch, batch_idx) -> Dict: | 197 | def validation_step(self, batch, batch_idx) -> Dict: |
... | @@ -165,17 +199,29 @@ class SummarizationModule(BaseTransformer): | ... | @@ -165,17 +199,29 @@ class SummarizationModule(BaseTransformer): |
165 | 199 | ||
166 | def validation_epoch_end(self, outputs, prefix="val") -> Dict: | 200 | def validation_epoch_end(self, outputs, prefix="val") -> Dict: |
167 | self.step_count += 1 | 201 | self.step_count += 1 |
168 | - losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names} | 202 | + losses = { |
203 | + k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names | ||
204 | + } | ||
169 | loss = losses["loss"] | 205 | loss = losses["loss"] |
170 | - rouges = {k: np.array([x[k] for x in outputs]).mean() for k in self.metric_names + ["gen_time", "gen_len"]} | 206 | + rouges = { |
171 | - rouge_tensor: torch.FloatTensor = torch.tensor(rouges[self.val_metric]).type_as(loss) | 207 | + k: np.array([x[k] for x in outputs]).mean() |
208 | + for k in self.metric_names + ["gen_time", "gen_len"] | ||
209 | + } | ||
210 | + rouge_tensor: torch.FloatTensor = torch.tensor(rouges[self.val_metric]).type_as( | ||
211 | + loss | ||
212 | + ) | ||
172 | rouges.update({k: v.item() for k, v in losses.items()}) | 213 | rouges.update({k: v.item() for k, v in losses.items()}) |
173 | losses.update(rouges) | 214 | losses.update(rouges) |
174 | metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()} | 215 | metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()} |
175 | metrics["step_count"] = self.step_count | 216 | metrics["step_count"] = self.step_count |
176 | self.save_metrics(metrics, prefix) # writes to self.metrics_save_path | 217 | self.save_metrics(metrics, prefix) # writes to self.metrics_save_path |
177 | preds = flatten_list([x["preds"] for x in outputs]) | 218 | preds = flatten_list([x["preds"] for x in outputs]) |
178 | - return {"log": metrics, "preds": preds, f"{prefix}_loss": loss, f"{prefix}_{self.val_metric}": rouge_tensor} | 219 | + return { |
220 | + "log": metrics, | ||
221 | + "preds": preds, | ||
222 | + f"{prefix}_loss": loss, | ||
223 | + f"{prefix}_{self.val_metric}": rouge_tensor, | ||
224 | + } | ||
179 | 225 | ||
180 | def save_metrics(self, latest_metrics, type_path) -> None: | 226 | def save_metrics(self, latest_metrics, type_path) -> None: |
181 | self.metrics[type_path].append(latest_metrics) | 227 | self.metrics[type_path].append(latest_metrics) |
... | @@ -200,7 +246,9 @@ class SummarizationModule(BaseTransformer): | ... | @@ -200,7 +246,9 @@ class SummarizationModule(BaseTransformer): |
200 | base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} | 246 | base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)} |
201 | rouge: Dict = self.calc_generative_metrics(preds, target) | 247 | rouge: Dict = self.calc_generative_metrics(preds, target) |
202 | summ_len = np.mean(lmap(len, generated_ids)) | 248 | summ_len = np.mean(lmap(len, generated_ids)) |
203 | - base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge) | 249 | + base_metrics.update( |
250 | + gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge | ||
251 | + ) | ||
204 | return base_metrics | 252 | return base_metrics |
205 | 253 | ||
206 | def test_step(self, batch, batch_idx): | 254 | def test_step(self, batch, batch_idx): |
... | @@ -213,10 +261,10 @@ class SummarizationModule(BaseTransformer): | ... | @@ -213,10 +261,10 @@ class SummarizationModule(BaseTransformer): |
213 | max_target_length = self.target_lens[type_path] | 261 | max_target_length = self.target_lens[type_path] |
214 | data_config = DataConfig( | 262 | data_config = DataConfig( |
215 | endpoint=args.endpoint, | 263 | endpoint=args.endpoint, |
216 | - access_key=os.environ['access_key'], | 264 | + access_key=os.environ["access_key"], |
217 | - secret_key=os.environ['secret_key'], | 265 | + secret_key=os.environ["secret_key"], |
218 | region=args.region, | 266 | region=args.region, |
219 | - dataset_name='commit-autosuggestions', | 267 | + dataset_name="commit-autosuggestions", |
220 | additional={ | 268 | additional={ |
221 | "mode": ("training" if type_path == "train" else "evaluation"), | 269 | "mode": ("training" if type_path == "train" else "evaluation"), |
222 | "max_source_length": self.hparams.max_source_length, | 270 | "max_source_length": self.hparams.max_source_length, |
... | @@ -224,15 +272,17 @@ class SummarizationModule(BaseTransformer): | ... | @@ -224,15 +272,17 @@ class SummarizationModule(BaseTransformer): |
224 | "url": args.url, | 272 | "url": args.url, |
225 | }, | 273 | }, |
226 | attributes=[ | 274 | attributes=[ |
227 | - ('input_ids', 'int32', (self.hparams.max_source_length,)), | 275 | + ("input_ids", "int32", (self.hparams.max_source_length,)), |
228 | - ('attention_masks', 'int32', (self.hparams.max_source_length,)), | 276 | + ("attention_masks", "int32", (self.hparams.max_source_length,)), |
229 | - ('patch_ids', 'int32', (self.hparams.max_source_length,)), | 277 | + ("patch_ids", "int32", (self.hparams.max_source_length,)), |
230 | - ('targets', 'int32', (max_target_length,)) | 278 | + ("targets", "int32", (max_target_length,)), |
231 | - ] | 279 | + ], |
232 | ) | 280 | ) |
233 | return Dataset(config=data_config, clear=True) | 281 | return Dataset(config=data_config, clear=True) |
234 | 282 | ||
235 | - def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False) -> DataLoader: | 283 | + def get_dataloader( |
284 | + self, type_path: str, batch_size: int, shuffle: bool = False | ||
285 | + ) -> DataLoader: | ||
236 | dataset = self.get_dataset(type_path) | 286 | dataset = self.get_dataset(type_path) |
237 | sampler = None | 287 | sampler = None |
238 | 288 | ||
... | @@ -246,7 +296,9 @@ class SummarizationModule(BaseTransformer): | ... | @@ -246,7 +296,9 @@ class SummarizationModule(BaseTransformer): |
246 | return dataloader | 296 | return dataloader |
247 | 297 | ||
248 | def train_dataloader(self) -> DataLoader: | 298 | def train_dataloader(self) -> DataLoader: |
249 | - dataloader = self.get_dataloader("train", batch_size=self.hparams.train_batch_size, shuffle=True) | 299 | + dataloader = self.get_dataloader( |
300 | + "train", batch_size=self.hparams.train_batch_size, shuffle=True | ||
301 | + ) | ||
250 | return dataloader | 302 | return dataloader |
251 | 303 | ||
252 | def val_dataloader(self) -> DataLoader: | 304 | def val_dataloader(self) -> DataLoader: |
... | @@ -259,23 +311,18 @@ class SummarizationModule(BaseTransformer): | ... | @@ -259,23 +311,18 @@ class SummarizationModule(BaseTransformer): |
259 | def add_model_specific_args(parser, root_dir): | 311 | def add_model_specific_args(parser, root_dir): |
260 | BaseTransformer.add_model_specific_args(parser, root_dir) | 312 | BaseTransformer.add_model_specific_args(parser, root_dir) |
261 | add_generic_args(parser, root_dir) | 313 | add_generic_args(parser, root_dir) |
262 | - parser.add_argument( | 314 | + parser.add_argument("--url", type=str, required=True, help="github url") |
263 | - "--url", | ||
264 | - type=str, | ||
265 | - required=True, | ||
266 | - help="github url" | ||
267 | - ) | ||
268 | parser.add_argument( | 315 | parser.add_argument( |
269 | "--endpoint", | 316 | "--endpoint", |
270 | type=str, | 317 | type=str, |
271 | required=True, | 318 | required=True, |
272 | - help='matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html' | 319 | + help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", |
273 | ) | 320 | ) |
274 | parser.add_argument( | 321 | parser.add_argument( |
275 | "--region", | 322 | "--region", |
276 | type=str, | 323 | type=str, |
277 | default=None, | 324 | default=None, |
278 | - help='matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html' | 325 | + help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html", |
279 | ) | 326 | ) |
280 | parser.add_argument( | 327 | parser.add_argument( |
281 | "--max_source_length", | 328 | "--max_source_length", |
... | @@ -308,14 +355,43 @@ class SummarizationModule(BaseTransformer): | ... | @@ -308,14 +355,43 @@ class SummarizationModule(BaseTransformer): |
308 | parser.add_argument("--freeze_encoder", action="store_true") | 355 | parser.add_argument("--freeze_encoder", action="store_true") |
309 | parser.add_argument("--freeze_embeds", action="store_true") | 356 | parser.add_argument("--freeze_embeds", action="store_true") |
310 | parser.add_argument("--sortish_sampler", action="store_true", default=False) | 357 | parser.add_argument("--sortish_sampler", action="store_true", default=False) |
311 | - parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default") | ||
312 | - parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.") | ||
313 | - parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.") | ||
314 | - parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.") | ||
315 | parser.add_argument( | 358 | parser.add_argument( |
316 | - "--task", type=str, default="summarization", required=False, help="# examples. -1 means use all." | 359 | + "--logger_name", |
360 | + type=str, | ||
361 | + choices=["default", "wandb", "wandb_shared"], | ||
362 | + default="default", | ||
363 | + ) | ||
364 | + parser.add_argument( | ||
365 | + "--n_train", | ||
366 | + type=int, | ||
367 | + default=-1, | ||
368 | + required=False, | ||
369 | + help="# examples. -1 means use all.", | ||
370 | + ) | ||
371 | + parser.add_argument( | ||
372 | + "--n_val", | ||
373 | + type=int, | ||
374 | + default=500, | ||
375 | + required=False, | ||
376 | + help="# examples. -1 means use all.", | ||
377 | + ) | ||
378 | + parser.add_argument( | ||
379 | + "--n_test", | ||
380 | + type=int, | ||
381 | + default=-1, | ||
382 | + required=False, | ||
383 | + help="# examples. -1 means use all.", | ||
384 | + ) | ||
385 | + parser.add_argument( | ||
386 | + "--task", | ||
387 | + type=str, | ||
388 | + default="summarization", | ||
389 | + required=False, | ||
390 | + help="# examples. -1 means use all.", | ||
391 | + ) | ||
392 | + parser.add_argument( | ||
393 | + "--label_smoothing", type=float, default=0.0, required=False | ||
317 | ) | 394 | ) |
318 | - parser.add_argument("--label_smoothing", type=float, default=0.0, required=False) | ||
319 | parser.add_argument("--src_lang", type=str, default="", required=False) | 395 | parser.add_argument("--src_lang", type=str, default="", required=False) |
320 | parser.add_argument("--tgt_lang", type=str, default="", required=False) | 396 | parser.add_argument("--tgt_lang", type=str, default="", required=False) |
321 | parser.add_argument("--eval_beams", type=int, default=None, required=False) | 397 | parser.add_argument("--eval_beams", type=int, default=None, required=False) |
... | @@ -348,7 +424,11 @@ class TranslationModule(SummarizationModule): | ... | @@ -348,7 +424,11 @@ class TranslationModule(SummarizationModule): |
348 | def main(args, model=None) -> SummarizationModule: | 424 | def main(args, model=None) -> SummarizationModule: |
349 | Path(args.output_dir).mkdir(exist_ok=True) | 425 | Path(args.output_dir).mkdir(exist_ok=True) |
350 | if len(os.listdir(args.output_dir)) > 3 and args.do_train: | 426 | if len(os.listdir(args.output_dir)) > 3 and args.do_train: |
351 | - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) | 427 | + raise ValueError( |
428 | + "Output directory ({}) already exists and is not empty.".format( | ||
429 | + args.output_dir | ||
430 | + ) | ||
431 | + ) | ||
352 | if model is None: | 432 | if model is None: |
353 | if args.task == "summarization": | 433 | if args.task == "summarization": |
354 | model: SummarizationModule = SummarizationModule(args) | 434 | model: SummarizationModule = SummarizationModule(args) |
... | @@ -371,7 +451,9 @@ def main(args, model=None) -> SummarizationModule: | ... | @@ -371,7 +451,9 @@ def main(args, model=None) -> SummarizationModule: |
371 | return model | 451 | return model |
372 | 452 | ||
373 | model.hparams.test_checkpoint = "" | 453 | model.hparams.test_checkpoint = "" |
374 | - checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True))) | 454 | + checkpoints = list( |
455 | + sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)) | ||
456 | + ) | ||
375 | if checkpoints: | 457 | if checkpoints: |
376 | model.hparams.test_checkpoint = checkpoints[-1] | 458 | model.hparams.test_checkpoint = checkpoints[-1] |
377 | trainer.resume_from_checkpoint = checkpoints[-1] | 459 | trainer.resume_from_checkpoint = checkpoints[-1] | ... | ... |
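Most of the train.py changes are line-length reflow with no behavioral change. As a reference point, the aggregation in validation_epoch_end reduces the list of per-batch result dicts to one averaged metrics dict keyed {prefix}_avg_{name}; a toy, self-contained sketch with hypothetical values:

import numpy as np
import torch

# Per-batch results, as produced by _generative_step; values are made up.
outputs = [
    {"loss": torch.tensor(2.0), "rouge2": 0.21, "gen_len": 12.0},
    {"loss": torch.tensor(1.6), "rouge2": 0.25, "gen_len": 14.0},
]
loss_names = ["loss"]
metric_names = ["rouge2"]

losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in loss_names}
rouges = {
    k: np.array([x[k] for x in outputs]).mean() for k in metric_names + ["gen_len"]
}
metrics = {f"val_avg_{k}": v for k, v in {**losses, **rouges}.items()}
print(metrics)  # averaged loss 1.8, rouge2 about 0.23, gen_len 13.0, keys prefixed val_avg_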
... | @@ -30,6 +30,7 @@ logging.basicConfig( | ... | @@ -30,6 +30,7 @@ logging.basicConfig( |
30 | level=logging.INFO, | 30 | level=logging.INFO, |
31 | ) | 31 | ) |
32 | 32 | ||
33 | + | ||
33 | class GenerationMixin: | 34 | class GenerationMixin: |
34 | """ | 35 | """ |
35 | A class contraining all of the functions supporting generation, to be used as a mixin in | 36 | A class contraining all of the functions supporting generation, to be used as a mixin in |
... | @@ -50,7 +51,9 @@ class GenerationMixin: | ... | @@ -50,7 +51,9 @@ class GenerationMixin: |
50 | """ | 51 | """ |
51 | return logits | 52 | return logits |
52 | 53 | ||
53 | - def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): | 54 | + def enforce_repetition_penalty_( |
55 | + self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty | ||
56 | + ): | ||
54 | """ | 57 | """ |
55 | Enforce the repetition penalty (from the `CTRL paper <https://arxiv.org/abs/1909.05858>`__). | 58 | Enforce the repetition penalty (from the `CTRL paper <https://arxiv.org/abs/1909.05858>`__). |
56 | """ | 59 | """ |
... | @@ -79,11 +82,7 @@ class GenerationMixin: | ... | @@ -79,11 +82,7 @@ class GenerationMixin: |
79 | # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) | 82 | # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) |
80 | if repetition_penalty != 1.0: | 83 | if repetition_penalty != 1.0: |
81 | self.enforce_repetition_penalty_( | 84 | self.enforce_repetition_penalty_( |
82 | - scores, | 85 | + scores, batch_size, num_beams, input_ids, repetition_penalty, |
83 | - batch_size, | ||
84 | - num_beams, | ||
85 | - input_ids, | ||
86 | - repetition_penalty, | ||
87 | ) | 86 | ) |
88 | 87 | ||
89 | # set eos token prob to zero if min_length is not reached | 88 | # set eos token prob to zero if min_length is not reached |
... | @@ -102,7 +101,11 @@ class GenerationMixin: | ... | @@ -102,7 +101,11 @@ class GenerationMixin: |
102 | 101 | ||
103 | if bad_words_ids is not None: | 102 | if bad_words_ids is not None: |
104 | # Exclude EOS token (already processed) | 103 | # Exclude EOS token (already processed) |
105 | - bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) | 104 | + bad_words_ids = list( |
105 | + filter( | ||
106 | + lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids | ||
107 | + ) | ||
108 | + ) | ||
106 | # calculate a list of banned tokens according to bad words | 109 | # calculate a list of banned tokens according to bad words |
107 | banned_tokens = calc_banned_bad_words_ids(input_ids.tolist(), bad_words_ids) | 110 | banned_tokens = calc_banned_bad_words_ids(input_ids.tolist(), bad_words_ids) |
108 | # Modify the scores in place by setting the banned tokens logits to `-inf` | 111 | # Modify the scores in place by setting the banned tokens logits to `-inf` |
... | @@ -134,7 +137,7 @@ class GenerationMixin: | ... | @@ -134,7 +137,7 @@ class GenerationMixin: |
134 | attention_mask: Optional[torch.LongTensor] = None, | 137 | attention_mask: Optional[torch.LongTensor] = None, |
135 | decoder_start_token_id: Optional[int] = None, | 138 | decoder_start_token_id: Optional[int] = None, |
136 | use_cache: Optional[bool] = None, | 139 | use_cache: Optional[bool] = None, |
137 | - **model_kwargs | 140 | + **model_kwargs, |
138 | ) -> torch.LongTensor: | 141 | ) -> torch.LongTensor: |
139 | r""" | 142 | r""" |
140 | Generates sequences for models with a language modeling head. The method currently supports greedy decoding, | 143 | Generates sequences for models with a language modeling head. The method currently supports greedy decoding, |
... | @@ -262,26 +265,50 @@ class GenerationMixin: | ... | @@ -262,26 +265,50 @@ class GenerationMixin: |
262 | max_length = max_length if max_length is not None else self.config.max_length | 265 | max_length = max_length if max_length is not None else self.config.max_length |
263 | min_length = min_length if min_length is not None else self.config.min_length | 266 | min_length = min_length if min_length is not None else self.config.min_length |
264 | do_sample = do_sample if do_sample is not None else self.config.do_sample | 267 | do_sample = do_sample if do_sample is not None else self.config.do_sample |
265 | - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping | 268 | + early_stopping = ( |
269 | + early_stopping if early_stopping is not None else self.config.early_stopping | ||
270 | + ) | ||
266 | use_cache = use_cache if use_cache is not None else self.config.use_cache | 271 | use_cache = use_cache if use_cache is not None else self.config.use_cache |
267 | num_beams = num_beams if num_beams is not None else self.config.num_beams | 272 | num_beams = num_beams if num_beams is not None else self.config.num_beams |
268 | - temperature = temperature if temperature is not None else self.config.temperature | 273 | + temperature = ( |
274 | + temperature if temperature is not None else self.config.temperature | ||
275 | + ) | ||
269 | top_k = top_k if top_k is not None else self.config.top_k | 276 | top_k = top_k if top_k is not None else self.config.top_k |
270 | top_p = top_p if top_p is not None else self.config.top_p | 277 | top_p = top_p if top_p is not None else self.config.top_p |
271 | - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty | 278 | + repetition_penalty = ( |
272 | - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id | 279 | + repetition_penalty |
273 | - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id | 280 | + if repetition_penalty is not None |
274 | - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id | 281 | + else self.config.repetition_penalty |
275 | - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty | 282 | + ) |
283 | + bos_token_id = ( | ||
284 | + bos_token_id if bos_token_id is not None else self.config.bos_token_id | ||
285 | + ) | ||
286 | + pad_token_id = ( | ||
287 | + pad_token_id if pad_token_id is not None else self.config.pad_token_id | ||
288 | + ) | ||
289 | + eos_token_id = ( | ||
290 | + eos_token_id if eos_token_id is not None else self.config.eos_token_id | ||
291 | + ) | ||
292 | + length_penalty = ( | ||
293 | + length_penalty if length_penalty is not None else self.config.length_penalty | ||
294 | + ) | ||
276 | no_repeat_ngram_size = ( | 295 | no_repeat_ngram_size = ( |
277 | - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size | 296 | + no_repeat_ngram_size |
297 | + if no_repeat_ngram_size is not None | ||
298 | + else self.config.no_repeat_ngram_size | ||
299 | + ) | ||
300 | + bad_words_ids = ( | ||
301 | + bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids | ||
278 | ) | 302 | ) |
279 | - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids | ||
280 | num_return_sequences = ( | 303 | num_return_sequences = ( |
281 | - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences | 304 | + num_return_sequences |
305 | + if num_return_sequences is not None | ||
306 | + else self.config.num_return_sequences | ||
282 | ) | 307 | ) |
283 | decoder_start_token_id = ( | 308 | decoder_start_token_id = ( |
284 | - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id | 309 | + decoder_start_token_id |
310 | + if decoder_start_token_id is not None | ||
311 | + else self.config.decoder_start_token_id | ||
285 | ) | 312 | ) |
286 | 313 | ||
287 | if input_ids is not None: | 314 | if input_ids is not None: |
... | @@ -289,14 +316,22 @@ class GenerationMixin: | ... | @@ -289,14 +316,22 @@ class GenerationMixin: |
289 | else: | 316 | else: |
290 | batch_size = 1 | 317 | batch_size = 1 |
291 | 318 | ||
292 | - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." | 319 | + assert ( |
293 | - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." | 320 | + isinstance(max_length, int) and max_length > 0 |
321 | + ), "`max_length` should be a strictly positive integer." | ||
322 | + assert ( | ||
323 | + isinstance(min_length, int) and min_length >= 0 | ||
324 | + ), "`min_length` should be a positive integer." | ||
294 | assert isinstance(do_sample, bool), "`do_sample` should be a boolean." | 325 | assert isinstance(do_sample, bool), "`do_sample` should be a boolean." |
295 | assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." | 326 | assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." |
296 | assert isinstance(use_cache, bool), "`use_cache` should be a boolean." | 327 | assert isinstance(use_cache, bool), "`use_cache` should be a boolean." |
297 | - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." | 328 | + assert ( |
329 | + isinstance(num_beams, int) and num_beams > 0 | ||
330 | + ), "`num_beams` should be a strictly positive integer." | ||
298 | assert temperature > 0, "`temperature` should be strictly positive." | 331 | assert temperature > 0, "`temperature` should be strictly positive." |
299 | - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." | 332 | + assert ( |
333 | + isinstance(top_k, int) and top_k >= 0 | ||
334 | + ), "`top_k` should be a positive integer." | ||
300 | assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." | 335 | assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." |
301 | assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." | 336 | assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." |
302 | assert input_ids is not None or ( | 337 | assert input_ids is not None or ( |
... | @@ -316,7 +351,9 @@ class GenerationMixin: | ... | @@ -316,7 +351,9 @@ class GenerationMixin: |
316 | isinstance(num_return_sequences, int) and num_return_sequences > 0 | 351 | isinstance(num_return_sequences, int) and num_return_sequences > 0 |
317 | ), "`num_return_sequences` should be a strictly positive integer." | 352 | ), "`num_return_sequences` should be a strictly positive integer." |
318 | assert ( | 353 | assert ( |
319 | - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) | 354 | + bad_words_ids is None |
355 | + or isinstance(bad_words_ids, list) | ||
356 | + and isinstance(bad_words_ids[0], list) | ||
320 | ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" | 357 | ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" |
321 | 358 | ||
322 | if input_ids is None: | 359 | if input_ids is None: |
... | @@ -331,7 +368,9 @@ class GenerationMixin: | ... | @@ -331,7 +368,9 @@ class GenerationMixin: |
331 | device=next(self.parameters()).device, | 368 | device=next(self.parameters()).device, |
332 | ) | 369 | ) |
333 | else: | 370 | else: |
334 | - assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." | 371 | + assert ( |
372 | + input_ids.dim() == 2 | ||
373 | + ), "Input prompt should be of shape (batch_size, sequence length)." | ||
335 | 374 | ||
336 | # not allow to duplicate outputs when greedy decoding | 375 | # not allow to duplicate outputs when greedy decoding |
337 | if do_sample is False: | 376 | if do_sample is False: |
... | @@ -349,7 +388,11 @@ class GenerationMixin: | ... | @@ -349,7 +388,11 @@ class GenerationMixin: |
349 | 388 | ||
350 | # create attention mask if necessary | 389 | # create attention mask if necessary |
351 | # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 | 390 | # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 |
352 | - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): | 391 | + if ( |
392 | + (attention_mask is None) | ||
393 | + and (pad_token_id is not None) | ||
394 | + and (pad_token_id in input_ids) | ||
395 | + ): | ||
353 | attention_mask = input_ids.ne(pad_token_id).long() | 396 | attention_mask = input_ids.ne(pad_token_id).long() |
354 | elif attention_mask is None: | 397 | elif attention_mask is None: |
355 | attention_mask = input_ids.new_ones(input_ids.shape) | 398 | attention_mask = input_ids.new_ones(input_ids.shape) |
... | @@ -358,7 +401,9 @@ class GenerationMixin: | ... | @@ -358,7 +401,9 @@ class GenerationMixin: |
358 | # attention_mask is created | 401 | # attention_mask is created |
359 | if pad_token_id is None and eos_token_id is not None: | 402 | if pad_token_id is None and eos_token_id is not None: |
360 | logger.warning( | 403 | logger.warning( |
361 | - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) | 404 | + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format( |
405 | + eos_token_id | ||
406 | + ) | ||
362 | ) | 407 | ) |
363 | pad_token_id = eos_token_id | 408 | pad_token_id = eos_token_id |
364 | 409 | ||
... | @@ -385,25 +430,37 @@ class GenerationMixin: | ... | @@ -385,25 +430,37 @@ class GenerationMixin: |
385 | # see if BOS token can be used for decoder_start_token_id | 430 | # see if BOS token can be used for decoder_start_token_id |
386 | if bos_token_id is not None: | 431 | if bos_token_id is not None: |
387 | decoder_start_token_id = bos_token_id | 432 | decoder_start_token_id = bos_token_id |
388 | - elif hasattr(self.config, "decoder") and hasattr(self.config.decoder, "bos_token_id"): | 433 | + elif hasattr(self.config, "decoder") and hasattr( |
434 | + self.config.decoder, "bos_token_id" | ||
435 | + ): | ||
389 | decoder_start_token_id = self.config.decoder.bos_token_id | 436 | decoder_start_token_id = self.config.decoder.bos_token_id |
390 | else: | 437 | else: |
391 | raise ValueError( | 438 | raise ValueError( |
392 | "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" | 439 | "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" |
393 | ) | 440 | ) |
394 | 441 | ||
395 | - assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) | 442 | + assert hasattr( |
396 | - assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) | 443 | + self, "get_encoder" |
444 | + ), "{} should have a 'get_encoder' function defined".format(self) | ||
445 | + assert callable(self.get_encoder), "{} should be a method".format( | ||
446 | + self.get_encoder | ||
447 | + ) | ||
397 | 448 | ||
398 | # get encoder and store encoder outputs | 449 | # get encoder and store encoder outputs |
399 | encoder = self.get_encoder() | 450 | encoder = self.get_encoder() |
400 | - encoder_outputs: ModelOutput = encoder(input_ids, patch_ids, attention_mask=attention_mask, return_dict=True) | 451 | + encoder_outputs: ModelOutput = encoder( |
452 | + input_ids, patch_ids, attention_mask=attention_mask, return_dict=True | ||
453 | + ) | ||
401 | 454 | ||
402 | # Expand input ids if num_beams > 1 or num_return_sequences > 1 | 455 | # Expand input ids if num_beams > 1 or num_return_sequences > 1 |
403 | if num_return_sequences > 1 or num_beams > 1: | 456 | if num_return_sequences > 1 or num_beams > 1: |
404 | input_ids_len = input_ids.shape[-1] | 457 | input_ids_len = input_ids.shape[-1] |
405 | - input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) | 458 | + input_ids = input_ids.unsqueeze(1).expand( |
406 | - patch_ids = patch_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) | 459 | + batch_size, effective_batch_mult * num_beams, input_ids_len |
460 | + ) | ||
461 | + patch_ids = patch_ids.unsqueeze(1).expand( | ||
462 | + batch_size, effective_batch_mult * num_beams, input_ids_len | ||
463 | + ) | ||
407 | attention_mask = attention_mask.unsqueeze(1).expand( | 464 | attention_mask = attention_mask.unsqueeze(1).expand( |
408 | batch_size, effective_batch_mult * num_beams, input_ids_len | 465 | batch_size, effective_batch_mult * num_beams, input_ids_len |
409 | ) | 466 | ) |
... | @@ -442,9 +499,9 @@ class GenerationMixin: | ... | @@ -442,9 +499,9 @@ class GenerationMixin: |
442 | ) | 499 | ) |
443 | 500 | ||
444 | # expand encoder_outputs | 501 | # expand encoder_outputs |
445 | - encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( | 502 | + encoder_outputs[ |
446 | - 0, expanded_batch_idxs | 503 | + "last_hidden_state" |
447 | - ) | 504 | + ] = encoder_outputs.last_hidden_state.index_select(0, expanded_batch_idxs) |
448 | 505 | ||
449 | # save encoder_outputs in `model_kwargs` | 506 | # save encoder_outputs in `model_kwargs` |
450 | model_kwargs["encoder_outputs"] = encoder_outputs | 507 | model_kwargs["encoder_outputs"] = encoder_outputs |
... | @@ -534,7 +591,11 @@ class GenerationMixin: | ... | @@ -534,7 +591,11 @@ class GenerationMixin: |
534 | past = None | 591 | past = None |
535 | while cur_len < max_length: | 592 | while cur_len < max_length: |
536 | model_inputs = self.prepare_inputs_for_generation( | 593 | model_inputs = self.prepare_inputs_for_generation( |
537 | - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs | 594 | + input_ids, |
595 | + past=past, | ||
596 | + attention_mask=attention_mask, | ||
597 | + use_cache=use_cache, | ||
598 | + **model_kwargs, | ||
538 | ) | 599 | ) |
539 | 600 | ||
540 | outputs = self(**model_inputs, return_dict=True) | 601 | outputs = self(**model_inputs, return_dict=True) |
... | @@ -565,7 +626,9 @@ class GenerationMixin: | ... | @@ -565,7 +626,9 @@ class GenerationMixin: |
565 | if temperature != 1.0: | 626 | if temperature != 1.0: |
566 | scores = scores / temperature | 627 | scores = scores / temperature |
567 | # Top-p/top-k filtering | 628 | # Top-p/top-k filtering |
568 | - next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) | 629 | + next_token_logscores = top_k_top_p_filtering( |
630 | + scores, top_k=top_k, top_p=top_p | ||
631 | + ) | ||
569 | # Sample | 632 | # Sample |
570 | probs = F.softmax(next_token_logscores, dim=-1) | 633 | probs = F.softmax(next_token_logscores, dim=-1) |
571 | next_token = torch.multinomial(probs, num_samples=1).squeeze(1) | 634 | next_token = torch.multinomial(probs, num_samples=1).squeeze(1) |
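The hunk above only re-wraps the sampling path; the underlying flow (temperature scaling, then top-k/top-p filtering, then a multinomial draw) is easier to see in isolation. A minimal sketch using the module's own top_k_top_p_filtering (defined further down in this file); the helper name and default sizes are illustrative assumptions, not part of the diff:

import torch
import torch.nn.functional as F

def sample_next_token(logits, temperature=1.0, top_k=50, top_p=0.95):
    # logits: (batch_size, vocab_size) scores for the next position
    if temperature != 1.0:
        logits = logits / temperature
    # push everything outside the top-k / nucleus to -inf
    filtered = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    probs = F.softmax(filtered, dim=-1)
    # one sampled token id per sequence in the batch
    return torch.multinomial(probs, num_samples=1).squeeze(1)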
... | @@ -576,7 +639,9 @@ class GenerationMixin: | ... | @@ -576,7 +639,9 @@ class GenerationMixin: |
576 | # update generations and finished sentences | 639 | # update generations and finished sentences |
577 | if eos_token_id is not None: | 640 | if eos_token_id is not None: |
578 | # pad finished sentences if eos_token_id exists | 641 | # pad finished sentences if eos_token_id exists |
579 | - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) | 642 | + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * ( |
643 | + 1 - unfinished_sents | ||
644 | + ) | ||
580 | else: | 645 | else: |
581 | tokens_to_add = next_token | 646 | tokens_to_add = next_token |
582 | 647 | ||
... | @@ -587,8 +652,12 @@ class GenerationMixin: | ... | @@ -587,8 +652,12 @@ class GenerationMixin: |
587 | if eos_token_id is not None: | 652 | if eos_token_id is not None: |
588 | eos_in_sents = tokens_to_add == eos_token_id | 653 | eos_in_sents = tokens_to_add == eos_token_id |
589 | # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length | 654 | # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length |
590 | - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() | 655 | + is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul( |
591 | - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) | 656 | + eos_in_sents.long() |
657 | + ).bool() | ||
658 | + sent_lengths.masked_fill_( | ||
659 | + is_sents_unfinished_and_token_to_add_is_eos, cur_len | ||
660 | + ) | ||
592 | # unfinished_sents is set to zero if eos in sentence | 661 | # unfinished_sents is set to zero if eos in sentence |
593 | unfinished_sents.mul_((~eos_in_sents).long()) | 662 | unfinished_sents.mul_((~eos_in_sents).long()) |
594 | 663 | ||
... | @@ -599,7 +668,11 @@ class GenerationMixin: | ... | @@ -599,7 +668,11 @@ class GenerationMixin: |
599 | # extend attention_mask for the newly generated token when the model is decoder-only | 668 | # extend attention_mask for the newly generated token when the model is decoder-only |
600 | if self.config.is_encoder_decoder is False: | 669 | if self.config.is_encoder_decoder is False: |
601 | attention_mask = torch.cat( | 670 | attention_mask = torch.cat( |
602 | - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 | 671 | + [ |
672 | + attention_mask, | ||
673 | + attention_mask.new_ones((attention_mask.shape[0], 1)), | ||
674 | + ], | ||
675 | + dim=-1, | ||
603 | ) | 676 | ) |
604 | 677 | ||
605 | return input_ids | 678 | return input_ids |
... | @@ -633,12 +706,16 @@ class GenerationMixin: | ... | @@ -633,12 +706,16 @@ class GenerationMixin: |
633 | 706 | ||
634 | # generated hypotheses | 707 | # generated hypotheses |
635 | generated_hyps = [ | 708 | generated_hyps = [ |
636 | - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) | 709 | + BeamHypotheses( |
710 | + num_beams, max_length, length_penalty, early_stopping=early_stopping | ||
711 | + ) | ||
637 | for _ in range(batch_size) | 712 | for _ in range(batch_size) |
638 | ] | 713 | ] |
639 | 714 | ||
640 | # scores for each sentence in the beam | 715 | # scores for each sentence in the beam |
641 | - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) | 716 | + beam_scores = torch.zeros( |
717 | + (batch_size, num_beams), dtype=torch.float, device=input_ids.device | ||
718 | + ) | ||
642 | 719 | ||
643 | # for greedy decoding, make sure that only tokens of the first beam are considered, to avoid sampling the exact same tokens three times | 720 | # for greedy decoding, make sure that only tokens of the first beam are considered, to avoid sampling the exact same tokens three times |
644 | if do_sample is False: | 721 | if do_sample is False: |
... | @@ -653,10 +730,18 @@ class GenerationMixin: | ... | @@ -653,10 +730,18 @@ class GenerationMixin: |
653 | 730 | ||
654 | while cur_len < max_length: | 731 | while cur_len < max_length: |
655 | model_inputs = self.prepare_inputs_for_generation( | 732 | model_inputs = self.prepare_inputs_for_generation( |
656 | - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs | 733 | + input_ids, |
734 | + past=past, | ||
735 | + attention_mask=attention_mask, | ||
736 | + use_cache=use_cache, | ||
737 | + **model_kwargs, | ||
657 | ) | 738 | ) |
658 | - outputs = self(**model_inputs, return_dict=True) # (batch_size * num_beams, cur_len, vocab_size) | 739 | + outputs = self( |
659 | - next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) | 740 | + **model_inputs, return_dict=True |
741 | + ) # (batch_size * num_beams, cur_len, vocab_size) | ||
742 | + next_token_logits = outputs.logits[ | ||
743 | + :, -1, : | ||
744 | + ] # (batch_size * num_beams, vocab_size) | ||
660 | 745 | ||
661 | # if model has past, then set the past variable to speed up decoding | 746 | # if model has past, then set the past variable to speed up decoding |
662 | if "past_key_values" in outputs: | 747 | if "past_key_values" in outputs: |
... | @@ -670,7 +755,9 @@ class GenerationMixin: | ... | @@ -670,7 +755,9 @@ class GenerationMixin: |
670 | next_token_logits, cur_len=cur_len, max_length=max_length | 755 | next_token_logits, cur_len=cur_len, max_length=max_length |
671 | ) | 756 | ) |
672 | 757 | ||
673 | - scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) | 758 | + scores = F.log_softmax( |
759 | + next_token_logits, dim=-1 | ||
760 | + ) # (batch_size * num_beams, vocab_size) | ||
674 | 761 | ||
675 | scores = self.postprocess_next_token_scores( | 762 | scores = self.postprocess_next_token_scores( |
676 | scores=scores, | 763 | scores=scores, |
... | @@ -686,12 +773,17 @@ class GenerationMixin: | ... | @@ -686,12 +773,17 @@ class GenerationMixin: |
686 | num_beams=num_beams, | 773 | num_beams=num_beams, |
687 | ) | 774 | ) |
688 | 775 | ||
689 | - assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( | 776 | + assert scores.shape == ( |
777 | + batch_size * num_beams, | ||
778 | + vocab_size, | ||
779 | + ), "Shapes of scores: {} != {}".format( | ||
690 | scores.shape, (batch_size * num_beams, vocab_size) | 780 | scores.shape, (batch_size * num_beams, vocab_size) |
691 | ) | 781 | ) |
692 | 782 | ||
693 | if do_sample: | 783 | if do_sample: |
694 | - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) | 784 | + _scores = scores + beam_scores[:, None].expand_as( |
785 | + scores | ||
786 | + ) # (batch_size * num_beams, vocab_size) | ||
695 | # Temperature | 787 | # Temperature |
696 | if temperature != 1.0: | 788 | if temperature != 1.0: |
697 | _scores = _scores / temperature | 789 | _scores = _scores / temperature |
... | @@ -706,24 +798,38 @@ class GenerationMixin: | ... | @@ -706,24 +798,38 @@ class GenerationMixin: |
706 | 798 | ||
707 | # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) | 799 | # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) |
708 | probs = F.softmax(_scores, dim=-1) | 800 | probs = F.softmax(_scores, dim=-1) |
709 | - next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) | 801 | + next_tokens = torch.multinomial( |
802 | + probs, num_samples=2 * num_beams | ||
803 | + ) # (batch_size, num_beams * 2) | ||
710 | # Compute next scores | 804 | # Compute next scores |
711 | - next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) | 805 | + next_scores = torch.gather( |
806 | + _scores, -1, next_tokens | ||
807 | + ) # (batch_size, num_beams * 2) | ||
712 | # sort the sampled vector to make sure that the first num_beams samples are the best | 808 | # sort the sampled vector to make sure that the first num_beams samples are the best |
713 | - next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) | 809 | + next_scores, next_scores_indices = torch.sort( |
714 | - next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) | 810 | + next_scores, descending=True, dim=1 |
811 | + ) | ||
812 | + next_tokens = torch.gather( | ||
813 | + next_tokens, -1, next_scores_indices | ||
814 | + ) # (batch_size, num_beams * 2) | ||
715 | 815 | ||
716 | else: | 816 | else: |
717 | - next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) | 817 | + next_scores = scores + beam_scores[:, None].expand_as( |
818 | + scores | ||
819 | + ) # (batch_size * num_beams, vocab_size) | ||
718 | 820 | ||
719 | # re-organize to group the beams together (we are keeping top hypotheses across beams) | 821 | # re-organize to group the beams together (we are keeping top hypotheses across beams) |
720 | next_scores = next_scores.view( | 822 | next_scores = next_scores.view( |
721 | batch_size, num_beams * vocab_size | 823 | batch_size, num_beams * vocab_size |
722 | ) # (batch_size, num_beams * vocab_size) | 824 | ) # (batch_size, num_beams * vocab_size) |
723 | 825 | ||
724 | - next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) | 826 | + next_scores, next_tokens = torch.topk( |
827 | + next_scores, 2 * num_beams, dim=1, largest=True, sorted=True | ||
828 | + ) | ||
725 | 829 | ||
726 | - assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) | 830 | + assert ( |
831 | + next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) | ||
832 | + ) | ||
727 | 833 | ||
728 | # next batch beam content | 834 | # next batch beam content |
729 | next_batch_beam = [] | 835 | next_batch_beam = [] |
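The reshapes above are the core of the beam re-ranking step: per-beam log-probabilities are added to the accumulated beam scores, flattened to one row per batch element, and the 2 * num_beams best candidates are kept. A toy shape walk-through (sizes are arbitrary illustrations, not values from the PR):

import torch

batch_size, num_beams, vocab_size = 2, 3, 5
scores = torch.randn(batch_size * num_beams, vocab_size)            # next-token log-probs per beam
beam_scores = torch.zeros(batch_size * num_beams)                   # accumulated score of each beam
next_scores = scores + beam_scores[:, None].expand_as(scores)       # (6, 5)
next_scores = next_scores.view(batch_size, num_beams * vocab_size)  # (2, 15)
next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
beam_id = next_tokens // vocab_size   # which of the 3 beams each candidate extends
token_id = next_tokens % vocab_size   # which vocabulary token it proposes
print(next_scores.shape, beam_id.shape, token_id.shape)  # all torch.Size([2, 6])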
... | @@ -735,11 +841,15 @@ class GenerationMixin: | ... | @@ -735,11 +841,15 @@ class GenerationMixin: |
735 | if done[batch_idx]: | 841 | if done[batch_idx]: |
736 | assert ( | 842 | assert ( |
737 | len(generated_hyps[batch_idx]) >= num_beams | 843 | len(generated_hyps[batch_idx]) >= num_beams |
738 | - ), "Batch can only be done if at least {} beams have been generated".format(num_beams) | 844 | + ), "Batch can only be done if at least {} beams have been generated".format( |
845 | + num_beams | ||
846 | + ) | ||
739 | assert ( | 847 | assert ( |
740 | eos_token_id is not None and pad_token_id is not None | 848 | eos_token_id is not None and pad_token_id is not None |
741 | ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" | 849 | ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" |
742 | - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch | 850 | + next_batch_beam.extend( |
851 | + [(0, pad_token_id, 0)] * num_beams | ||
852 | + ) # pad the batch | ||
743 | continue | 853 | continue |
744 | 854 | ||
745 | # next sentence beam content, this will get added to next_batch_beam | 855 | # next sentence beam content, this will get added to next_batch_beam |
... | @@ -757,7 +867,9 @@ class GenerationMixin: | ... | @@ -757,7 +867,9 @@ class GenerationMixin: |
757 | # add to generated hypotheses if end of sentence | 867 | # add to generated hypotheses if end of sentence |
758 | if (eos_token_id is not None) and (token_id.item() == eos_token_id): | 868 | if (eos_token_id is not None) and (token_id.item() == eos_token_id): |
759 | # if beam_token does not belong to top num_beams tokens, it should not be added | 869 | # if beam_token does not belong to top num_beams tokens, it should not be added |
760 | - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams | 870 | + is_beam_token_worse_than_top_num_beams = ( |
871 | + beam_token_rank >= num_beams | ||
872 | + ) | ||
761 | if is_beam_token_worse_than_top_num_beams: | 873 | if is_beam_token_worse_than_top_num_beams: |
762 | continue | 874 | continue |
763 | generated_hyps[batch_idx].add( | 875 | generated_hyps[batch_idx].add( |
... | @@ -766,7 +878,9 @@ class GenerationMixin: | ... | @@ -766,7 +878,9 @@ class GenerationMixin: |
766 | ) | 878 | ) |
767 | else: | 879 | else: |
768 | # add next predicted token since it is not eos_token | 880 | # add next predicted token since it is not eos_token |
769 | - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) | 881 | + next_sent_beam.append( |
882 | + (beam_token_score, token_id, effective_beam_id) | ||
883 | + ) | ||
770 | 884 | ||
771 | # once the beam for next step is full, don't add more tokens to it. | 885 | # once the beam for next step is full, don't add more tokens to it. |
772 | if len(next_sent_beam) == num_beams: | 886 | if len(next_sent_beam) == num_beams: |
... | @@ -780,7 +894,9 @@ class GenerationMixin: | ... | @@ -780,7 +894,9 @@ class GenerationMixin: |
780 | # update next beam content | 894 | # update next beam content |
781 | assert len(next_sent_beam) == num_beams, "Beam should always be full" | 895 | assert len(next_sent_beam) == num_beams, "Beam should always be full" |
782 | next_batch_beam.extend(next_sent_beam) | 896 | next_batch_beam.extend(next_sent_beam) |
783 | - assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step" | 897 | + assert len(next_batch_beam) == num_beams * ( |
898 | + batch_idx + 1 | ||
899 | + ), "We should have added num_beams each step" | ||
784 | 900 | ||
785 | # stop when we are done with each sentence | 901 | # stop when we are done with each sentence |
786 | if all(done): | 902 | if all(done): |
... | @@ -804,7 +920,11 @@ class GenerationMixin: | ... | @@ -804,7 +920,11 @@ class GenerationMixin: |
804 | # extend attention_mask for the newly generated token when the model is decoder-only | 920 | # extend attention_mask for the newly generated token when the model is decoder-only |
805 | if self.config.is_encoder_decoder is False: | 921 | if self.config.is_encoder_decoder is False: |
806 | attention_mask = torch.cat( | 922 | attention_mask = torch.cat( |
807 | - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 | 923 | + [ |
924 | + attention_mask, | ||
925 | + attention_mask.new_ones((attention_mask.shape[0], 1)), | ||
926 | + ], | ||
927 | + dim=-1, | ||
808 | ) | 928 | ) |
809 | 929 | ||
810 | # finalize all open beam hypotheses and add to generated hypotheses | 930 | # finalize all open beam hypotheses and add to generated hypotheses |
... | @@ -814,10 +934,12 @@ class GenerationMixin: | ... | @@ -814,10 +934,12 @@ class GenerationMixin: |
814 | 934 | ||
815 | # test that beam scores match previously calculated scores if not eos and batch_idx not done | 935 | # test that beam scores match previously calculated scores if not eos and batch_idx not done |
816 | if eos_token_id is not None and all( | 936 | if eos_token_id is not None and all( |
817 | - (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx] | 937 | + (token_id % vocab_size).item() != eos_token_id |
938 | + for token_id in next_tokens[batch_idx] | ||
818 | ): | 939 | ): |
819 | assert torch.all( | 940 | assert torch.all( |
820 | - next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] | 941 | + next_scores[batch_idx, :num_beams] |
942 | + == beam_scores.view(batch_size, num_beams)[batch_idx] | ||
821 | ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( | 943 | ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( |
822 | next_scores[:, :num_beams][batch_idx], | 944 | next_scores[:, :num_beams][batch_idx], |
823 | beam_scores.view(batch_size, num_beams)[batch_idx], | 945 | beam_scores.view(batch_size, num_beams)[batch_idx], |
... | @@ -831,7 +953,9 @@ class GenerationMixin: | ... | @@ -831,7 +953,9 @@ class GenerationMixin: |
831 | generated_hyps[batch_idx].add(final_tokens, final_score) | 953 | generated_hyps[batch_idx].add(final_tokens, final_score) |
832 | 954 | ||
833 | # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch | 955 | # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch |
834 | - output_batch_size = batch_size if do_sample else batch_size * num_return_sequences | 956 | + output_batch_size = ( |
957 | + batch_size if do_sample else batch_size * num_return_sequences | ||
958 | + ) | ||
835 | output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences | 959 | output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences |
836 | 960 | ||
837 | # select the best hypotheses | 961 | # select the best hypotheses |
... | @@ -861,7 +985,9 @@ class GenerationMixin: | ... | @@ -861,7 +985,9 @@ class GenerationMixin: |
861 | else: | 985 | else: |
862 | # none of the hypotheses have an eos_token | 986 | # none of the hypotheses have an eos_token |
863 | assert (len(hypo) == max_length for hypo in best) | 987 | assert (len(hypo) == max_length for hypo in best) |
864 | - decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) | 988 | + decoded = ( |
989 | + torch.stack(best).type(torch.long).to(next(self.parameters()).device) | ||
990 | + ) | ||
865 | 991 | ||
866 | return decoded | 992 | return decoded |
867 | 993 | ||
... | @@ -870,7 +996,9 @@ class GenerationMixin: | ... | @@ -870,7 +996,9 @@ class GenerationMixin: |
870 | return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) | 996 | return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) |
871 | 997 | ||
872 | 998 | ||
873 | -def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: | 999 | +def calc_banned_ngram_tokens( |
1000 | + prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int | ||
1001 | +) -> None: | ||
874 | """Copied from fairseq for no_repeat_ngram in beam_search""" | 1002 | """Copied from fairseq for no_repeat_ngram in beam_search""" |
875 | if cur_len + 1 < no_repeat_ngram_size: | 1003 | if cur_len + 1 < no_repeat_ngram_size: |
876 | # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet | 1004 | # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet |
... | @@ -881,7 +1009,9 @@ def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_n | ... | @@ -881,7 +1009,9 @@ def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_n |
881 | generated_ngram = generated_ngrams[idx] | 1009 | generated_ngram = generated_ngrams[idx] |
882 | for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): | 1010 | for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): |
883 | prev_ngram_tuple = tuple(ngram[:-1]) | 1011 | prev_ngram_tuple = tuple(ngram[:-1]) |
884 | - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] | 1012 | + generated_ngram[prev_ngram_tuple] = generated_ngram.get( |
1013 | + prev_ngram_tuple, [] | ||
1014 | + ) + [ngram[-1]] | ||
885 | 1015 | ||
886 | def _get_generated_ngrams(hypo_idx): | 1016 | def _get_generated_ngrams(hypo_idx): |
887 | # Before decoding the next token, prevent decoding of ngrams that have already appeared | 1017 | # Before decoding the next token, prevent decoding of ngrams that have already appeared |
... | @@ -893,7 +1023,9 @@ def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_n | ... | @@ -893,7 +1023,9 @@ def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_n |
893 | return banned_tokens | 1023 | return banned_tokens |
894 | 1024 | ||
895 | 1025 | ||
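The reformatted block above builds, for every hypothesis, a dict from (n-1)-gram prefixes to the tokens that followed them, and _get_generated_ngrams then bans whichever tokens would complete an already-seen n-gram. A self-contained sketch of the same idea (a hypothetical helper, not the exact implementation above):

def banned_next_tokens(generated, n):
    """Tokens that would complete an n-gram already present in `generated`."""
    if len(generated) + 1 < n:
        return []
    banned = set()
    # the last n-1 generated tokens form the prefix the next token would extend
    prefix = tuple(generated[len(generated) - n + 1:]) if n > 1 else tuple()
    for i in range(len(generated) - n + 1):
        ngram = tuple(generated[i:i + n])
        if ngram[:-1] == prefix:
            banned.add(ngram[-1])
    return sorted(banned)

# with no_repeat_ngram_size=2 and generated tokens [5, 7, 5], proposing 7 again is banned
assert banned_next_tokens([5, 7, 5], 2) == [7]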
896 | -def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]: | 1026 | +def calc_banned_bad_words_ids( |
1027 | + prev_input_ids: Iterable[int], bad_words_ids: Iterable[int] | ||
1028 | +) -> Iterable[int]: | ||
897 | banned_tokens = [] | 1029 | banned_tokens = [] |
898 | 1030 | ||
899 | def _tokens_match(prev_tokens, tokens): | 1031 | def _tokens_match(prev_tokens, tokens): |
... | @@ -914,7 +1046,9 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter | ... | @@ -914,7 +1046,9 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter |
914 | banned_tokens_slice = [] | 1046 | banned_tokens_slice = [] |
915 | 1047 | ||
916 | for banned_token_seq in bad_words_ids: | 1048 | for banned_token_seq in bad_words_ids: |
917 | - assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( | 1049 | + assert ( |
1050 | + len(banned_token_seq) > 0 | ||
1051 | + ), "Banned words token sequences {} cannot have an empty list".format( | ||
918 | bad_words_ids | 1052 | bad_words_ids |
919 | ) | 1053 | ) |
920 | 1054 | ||
... | @@ -929,7 +1063,9 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter | ... | @@ -929,7 +1063,9 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter |
929 | return banned_tokens | 1063 | return banned_tokens |
930 | 1064 | ||
931 | 1065 | ||
932 | -def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None: | 1066 | +def set_scores_to_inf_for_banned_tokens( |
1067 | + scores: torch.Tensor, banned_tokens: List[List[int]] | ||
1068 | +) -> None: | ||
933 | """Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be | 1069 | """Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be |
934 | a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...] | 1070 | a list of list of banned tokens to ban in the format [[batch index, vocabulary position],...] |
935 | Args: | 1071 | Args: |
... | @@ -949,7 +1085,12 @@ def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: Lis | ... | @@ -949,7 +1085,12 @@ def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: Lis |
949 | # [ 0 0 0 ] | 1085 | # [ 0 0 0 ] |
950 | # [ 1 0 0 ] | 1086 | # [ 1 0 0 ] |
951 | 1087 | ||
952 | - banned_mask = torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()).to(scores.device).to_dense().bool() | 1088 | + banned_mask = ( |
1089 | + torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) | ||
1090 | + .to(scores.device) | ||
1091 | + .to_dense() | ||
1092 | + .bool() | ||
1093 | + ) | ||
953 | scores.masked_fill_(banned_mask, -float("inf")) | 1094 | scores.masked_fill_(banned_mask, -float("inf")) |
954 | 1095 | ||
955 | 1096 | ||
... | @@ -989,7 +1130,9 @@ def top_k_top_p_filtering( | ... | @@ -989,7 +1130,9 @@ def top_k_top_p_filtering( |
989 | sorted_indices_to_remove[..., 0] = 0 | 1130 | sorted_indices_to_remove[..., 0] = 0 |
990 | 1131 | ||
991 | # scatter sorted tensors to original indexing | 1132 | # scatter sorted tensors to original indexing |
992 | - indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) | 1133 | + indices_to_remove = sorted_indices_to_remove.scatter( |
1134 | + 1, sorted_indices, sorted_indices_to_remove | ||
1135 | + ) | ||
993 | logits[indices_to_remove] = filter_value | 1136 | logits[indices_to_remove] = filter_value |
994 | return logits | 1137 | return logits |
995 | 1138 | ||
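As a quick sanity check of what the function above does, a toy call (values are illustrative, and filter_value is assumed to default to -inf as in the transformers version this file follows):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
# keep only the 2 highest-scoring tokens; the rest are pushed to filter_value (-inf)
filtered = top_k_top_p_filtering(logits.clone(), top_k=2, top_p=1.0)
print(filtered)  # expected: tensor([[2., 1., -inf, -inf]])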
... | @@ -1020,7 +1163,9 @@ class BeamHypotheses(object): | ... | @@ -1020,7 +1163,9 @@ class BeamHypotheses(object): |
1020 | if len(self) < self.num_beams or score > self.worst_score: | 1163 | if len(self) < self.num_beams or score > self.worst_score: |
1021 | self.beams.append((score, hyp)) | 1164 | self.beams.append((score, hyp)) |
1022 | if len(self) > self.num_beams: | 1165 | if len(self) > self.num_beams: |
1023 | - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) | 1166 | + sorted_scores = sorted( |
1167 | + [(s, idx) for idx, (s, _) in enumerate(self.beams)] | ||
1168 | + ) | ||
1024 | del self.beams[sorted_scores[0][1]] | 1169 | del self.beams[sorted_scores[0][1]] |
1025 | self.worst_score = sorted_scores[1][0] | 1170 | self.worst_score = sorted_scores[1][0] |
1026 | else: | 1171 | else: | ... | ... |
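Beyond the black reformatting, the substantive change threaded through this file is the extra patch_ids tensor that travels with input_ids into the encoder and is expanded alongside it for beam search. A hedged usage sketch, assuming the modified generate() accepts patch_ids (as the encoder call above suggests); the token ids, the marker values, and the `model` object are placeholders, with 0 taken as padding because embed_patches is built with 3 rows and padding_idx=0:

import torch

# Hypothetical inputs: token ids from a tokenized diff and matching patch markers
# (patch_ids share input_ids' shape; the non-zero values are assumed to distinguish
# added from removed diff lines).
input_ids = torch.tensor([[0, 713, 16, 10, 1296, 2]])
patch_ids = torch.tensor([[0, 1, 1, 1, 1, 0]])

# `model` stands in for the BART-based commit-message model built in this PR.
generated = model.generate(
    input_ids=input_ids,
    patch_ids=patch_ids,
    num_beams=5,
    max_length=32,
    early_stopping=True,
)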
... | @@ -69,7 +69,7 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -69,7 +69,7 @@ class BaseTransformer(pl.LightningModule): |
69 | config=None, | 69 | config=None, |
70 | tokenizer=None, | 70 | tokenizer=None, |
71 | model=None, | 71 | model=None, |
72 | - **config_kwargs | 72 | + **config_kwargs, |
73 | ): | 73 | ): |
74 | """Initialize a model, tokenizer and config.""" | 74 | """Initialize a model, tokenizer and config.""" |
75 | super().__init__() | 75 | super().__init__() |
... | @@ -83,7 +83,9 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -83,7 +83,9 @@ class BaseTransformer(pl.LightningModule): |
83 | cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None | 83 | cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None |
84 | if config is None: | 84 | if config is None: |
85 | self.config = AutoConfig.from_pretrained( | 85 | self.config = AutoConfig.from_pretrained( |
86 | - self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path, | 86 | + self.hparams.config_name |
87 | + if self.hparams.config_name | ||
88 | + else self.hparams.model_name_or_path, | ||
87 | **({"num_labels": num_labels} if num_labels is not None else {}), | 89 | **({"num_labels": num_labels} if num_labels is not None else {}), |
88 | cache_dir=cache_dir, | 90 | cache_dir=cache_dir, |
89 | **config_kwargs, | 91 | **config_kwargs, |
... | @@ -91,15 +93,24 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -91,15 +93,24 @@ class BaseTransformer(pl.LightningModule): |
91 | else: | 93 | else: |
92 | self.config: PretrainedConfig = config | 94 | self.config: PretrainedConfig = config |
93 | 95 | ||
94 | - extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") | 96 | + extra_model_params = ( |
97 | + "encoder_layerdrop", | ||
98 | + "decoder_layerdrop", | ||
99 | + "dropout", | ||
100 | + "attention_dropout", | ||
101 | + ) | ||
95 | for p in extra_model_params: | 102 | for p in extra_model_params: |
96 | if getattr(self.hparams, p, None): | 103 | if getattr(self.hparams, p, None): |
97 | - assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute" | 104 | + assert hasattr( |
105 | + self.config, p | ||
106 | + ), f"model config doesn't have a `{p}` attribute" | ||
98 | setattr(self.config, p, getattr(self.hparams, p)) | 107 | setattr(self.config, p, getattr(self.hparams, p)) |
99 | 108 | ||
100 | if tokenizer is None: | 109 | if tokenizer is None: |
101 | self.tokenizer = AutoTokenizer.from_pretrained( | 110 | self.tokenizer = AutoTokenizer.from_pretrained( |
102 | - self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path, | 111 | + self.hparams.tokenizer_name |
112 | + if self.hparams.tokenizer_name | ||
113 | + else self.hparams.model_name_or_path, | ||
103 | cache_dir=cache_dir, | 114 | cache_dir=cache_dir, |
104 | ) | 115 | ) |
105 | else: | 116 | else: |
... | @@ -121,7 +132,9 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -121,7 +132,9 @@ class BaseTransformer(pl.LightningModule): |
121 | def get_lr_scheduler(self): | 132 | def get_lr_scheduler(self): |
122 | get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler] | 133 | get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler] |
123 | scheduler = get_schedule_func( | 134 | scheduler = get_schedule_func( |
124 | - self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps | 135 | + self.opt, |
136 | + num_warmup_steps=self.hparams.warmup_steps, | ||
137 | + num_training_steps=self.total_steps, | ||
125 | ) | 138 | ) |
126 | scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} | 139 | scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} |
127 | return scheduler | 140 | return scheduler |
... | @@ -132,22 +145,35 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -132,22 +145,35 @@ class BaseTransformer(pl.LightningModule): |
132 | no_decay = ["bias", "LayerNorm.weight"] | 145 | no_decay = ["bias", "LayerNorm.weight"] |
133 | optimizer_grouped_parameters = [ | 146 | optimizer_grouped_parameters = [ |
134 | { | 147 | { |
135 | - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], | 148 | + "params": [ |
149 | + p | ||
150 | + for n, p in model.named_parameters() | ||
151 | + if not any(nd in n for nd in no_decay) | ||
152 | + ], | ||
136 | "weight_decay": self.hparams.weight_decay, | 153 | "weight_decay": self.hparams.weight_decay, |
137 | }, | 154 | }, |
138 | { | 155 | { |
139 | - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], | 156 | + "params": [ |
157 | + p | ||
158 | + for n, p in model.named_parameters() | ||
159 | + if any(nd in n for nd in no_decay) | ||
160 | + ], | ||
140 | "weight_decay": 0.0, | 161 | "weight_decay": 0.0, |
141 | }, | 162 | }, |
142 | ] | 163 | ] |
143 | if self.hparams.adafactor: | 164 | if self.hparams.adafactor: |
144 | optimizer = Adafactor( | 165 | optimizer = Adafactor( |
145 | - optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False | 166 | + optimizer_grouped_parameters, |
167 | + lr=self.hparams.learning_rate, | ||
168 | + scale_parameter=False, | ||
169 | + relative_step=False, | ||
146 | ) | 170 | ) |
147 | 171 | ||
148 | else: | 172 | else: |
149 | optimizer = AdamW( | 173 | optimizer = AdamW( |
150 | - optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon | 174 | + optimizer_grouped_parameters, |
175 | + lr=self.hparams.learning_rate, | ||
176 | + eps=self.hparams.adam_epsilon, | ||
151 | ) | 177 | ) |
152 | self.opt = optimizer | 178 | self.opt = optimizer |
153 | 179 | ||
... | @@ -165,13 +191,19 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -165,13 +191,19 @@ class BaseTransformer(pl.LightningModule): |
165 | def total_steps(self) -> int: | 191 | def total_steps(self) -> int: |
166 | """The number of total training steps that will be run. Used for lr scheduler purposes.""" | 192 | """The number of total training steps that will be run. Used for lr scheduler purposes.""" |
167 | num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores | 193 | num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores |
168 | - effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices | 194 | + effective_batch_size = ( |
195 | + self.hparams.train_batch_size | ||
196 | + * self.hparams.accumulate_grad_batches | ||
197 | + * num_devices | ||
198 | + ) | ||
169 | dataset_size = len(self.train_loader.dataset) | 199 | dataset_size = len(self.train_loader.dataset) |
170 | return (dataset_size / effective_batch_size) * self.hparams.max_epochs | 200 | return (dataset_size / effective_batch_size) * self.hparams.max_epochs |
171 | 201 | ||
172 | def setup(self, mode): | 202 | def setup(self, mode): |
173 | if mode == "fit": | 203 | if mode == "fit": |
174 | - self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True) | 204 | + self.train_loader = self.get_dataloader( |
205 | + "train", self.hparams.train_batch_size, shuffle=True | ||
206 | + ) | ||
175 | 207 | ||
176 | def get_dataloader(self, type_path, batch_size, shuffle=False): | 208 | def get_dataloader(self, type_path, batch_size, shuffle=False): |
177 | raise NotImplementedError("You must implement this for your task") | 209 | raise NotImplementedError("You must implement this for your task") |
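The total_steps property above only gained line breaks, but its arithmetic is what drives the warmup scheduler, so a worked example may help (all numbers are made up):

effective_batch_size = 32 * 4 * 2                    # train_batch_size * accumulate_grad_batches * num_devices
total_steps = (25_600 / effective_batch_size) * 3    # dataset_size / effective batch size * max_epochs
print(total_steps)  # 300.0 optimizer steps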
... | @@ -212,7 +244,10 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -212,7 +244,10 @@ class BaseTransformer(pl.LightningModule): |
212 | help="Path to pretrained model or model identifier from huggingface.co/models", | 244 | help="Path to pretrained model or model identifier from huggingface.co/models", |
213 | ) | 245 | ) |
214 | parser.add_argument( | 246 | parser.add_argument( |
215 | - "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" | 247 | + "--config_name", |
248 | + default="", | ||
249 | + type=str, | ||
250 | + help="Pretrained config name or path if not the same as model_name", | ||
216 | ) | 251 | ) |
217 | parser.add_argument( | 252 | parser.add_argument( |
218 | "--tokenizer_name", | 253 | "--tokenizer_name", |
... | @@ -246,7 +281,12 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -246,7 +281,12 @@ class BaseTransformer(pl.LightningModule): |
246 | type=float, | 281 | type=float, |
247 | help="Attention dropout probability (Optional). Goes into model.config", | 282 | help="Attention dropout probability (Optional). Goes into model.config", |
248 | ) | 283 | ) |
249 | - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") | 284 | + parser.add_argument( |
285 | + "--learning_rate", | ||
286 | + default=5e-5, | ||
287 | + type=float, | ||
288 | + help="The initial learning rate for Adam.", | ||
289 | + ) | ||
250 | parser.add_argument( | 290 | parser.add_argument( |
251 | "--lr_scheduler", | 291 | "--lr_scheduler", |
252 | default="linear", | 292 | default="linear", |
... | @@ -255,11 +295,30 @@ class BaseTransformer(pl.LightningModule): | ... | @@ -255,11 +295,30 @@ class BaseTransformer(pl.LightningModule): |
255 | type=str, | 295 | type=str, |
256 | help="Learning rate scheduler", | 296 | help="Learning rate scheduler", |
257 | ) | 297 | ) |
258 | - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") | 298 | + parser.add_argument( |
259 | - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") | 299 | + "--weight_decay", |
260 | - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") | 300 | + default=0.0, |
261 | - parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader") | 301 | + type=float, |
262 | - parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int) | 302 | + help="Weight decay if we apply some.", |
303 | + ) | ||
304 | + parser.add_argument( | ||
305 | + "--adam_epsilon", | ||
306 | + default=1e-8, | ||
307 | + type=float, | ||
308 | + help="Epsilon for Adam optimizer.", | ||
309 | + ) | ||
310 | + parser.add_argument( | ||
311 | + "--warmup_steps", | ||
312 | + default=0, | ||
313 | + type=int, | ||
314 | + help="Linear warmup over warmup_steps.", | ||
315 | + ) | ||
316 | + parser.add_argument( | ||
317 | + "--num_workers", default=4, type=int, help="kwarg passed to DataLoader" | ||
318 | + ) | ||
319 | + parser.add_argument( | ||
320 | + "--num_train_epochs", dest="max_epochs", default=3, type=int | ||
321 | + ) | ||
263 | parser.add_argument("--train_batch_size", default=32, type=int) | 322 | parser.add_argument("--train_batch_size", default=32, type=int) |
264 | parser.add_argument("--eval_batch_size", default=32, type=int) | 323 | parser.add_argument("--eval_batch_size", default=32, type=int) |
265 | parser.add_argument("--adafactor", action="store_true") | 324 | parser.add_argument("--adafactor", action="store_true") |
... | @@ -283,7 +342,9 @@ class LoggingCallback(pl.Callback): | ... | @@ -283,7 +342,9 @@ class LoggingCallback(pl.Callback): |
283 | rank_zero_info("***** Test results *****") | 342 | rank_zero_info("***** Test results *****") |
284 | metrics = trainer.callback_metrics | 343 | metrics = trainer.callback_metrics |
285 | # Log and save results to file | 344 | # Log and save results to file |
286 | - output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt") | 345 | + output_test_results_file = os.path.join( |
346 | + pl_module.hparams.output_dir, "test_results.txt" | ||
347 | + ) | ||
287 | with open(output_test_results_file, "w") as writer: | 348 | with open(output_test_results_file, "w") as writer: |
288 | for key in sorted(metrics): | 349 | for key in sorted(metrics): |
289 | if key not in ["log", "progress_bar"]: | 350 | if key not in ["log", "progress_bar"]: |
... | @@ -314,9 +375,21 @@ def add_generic_args(parser, root_dir) -> None: | ... | @@ -314,9 +375,21 @@ def add_generic_args(parser, root_dir) -> None: |
314 | "See details at https://nvidia.github.io/apex/amp.html", | 375 | "See details at https://nvidia.github.io/apex/amp.html", |
315 | ) | 376 | ) |
316 | parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int) | 377 | parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int) |
317 | - parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm") | 378 | + parser.add_argument( |
318 | - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") | 379 | + "--max_grad_norm", |
319 | - parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") | 380 | + dest="gradient_clip_val", |
381 | + default=1.0, | ||
382 | + type=float, | ||
383 | + help="Max gradient norm", | ||
384 | + ) | ||
385 | + parser.add_argument( | ||
386 | + "--do_train", action="store_true", help="Whether to run training." | ||
387 | + ) | ||
388 | + parser.add_argument( | ||
389 | + "--do_predict", | ||
390 | + action="store_true", | ||
391 | + help="Whether to run predictions on the test set.", | ||
392 | + ) | ||
320 | parser.add_argument( | 393 | parser.add_argument( |
321 | "--gradient_accumulation_steps", | 394 | "--gradient_accumulation_steps", |
322 | dest="accumulate_grad_batches", | 395 | dest="accumulate_grad_batches", |
... | @@ -324,7 +397,9 @@ def add_generic_args(parser, root_dir) -> None: | ... | @@ -324,7 +397,9 @@ def add_generic_args(parser, root_dir) -> None: |
324 | default=1, | 397 | default=1, |
325 | help="Number of updates steps to accumulate before performing a backward/update pass.", | 398 | help="Number of updates steps to accumulate before performing a backward/update pass.", |
326 | ) | 399 | ) |
327 | - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") | 400 | + parser.add_argument( |
401 | + "--seed", type=int, default=42, help="random seed for initialization" | ||
402 | + ) | ||
328 | 403 | ||
329 | 404 | ||
330 | def generic_train( | 405 | def generic_train( |
... | @@ -335,7 +410,7 @@ def generic_train( | ... | @@ -335,7 +410,7 @@ def generic_train( |
335 | extra_callbacks=[], | 410 | extra_callbacks=[], |
336 | checkpoint_callback=None, | 411 | checkpoint_callback=None, |
337 | logging_callback=None, | 412 | logging_callback=None, |
338 | - **extra_train_kwargs | 413 | + **extra_train_kwargs, |
339 | ): | 414 | ): |
340 | pl.seed_everything(args.seed) | 415 | pl.seed_everything(args.seed) |
341 | 416 | ||
... | @@ -346,7 +421,11 @@ def generic_train( | ... | @@ -346,7 +421,11 @@ def generic_train( |
346 | # add custom checkpoints | 421 | # add custom checkpoints |
347 | if checkpoint_callback is None: | 422 | if checkpoint_callback is None: |
348 | checkpoint_callback = pl.callbacks.ModelCheckpoint( | 423 | checkpoint_callback = pl.callbacks.ModelCheckpoint( |
349 | - filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1 | 424 | + filepath=args.output_dir, |
425 | + prefix="checkpoint", | ||
426 | + monitor="val_loss", | ||
427 | + mode="min", | ||
428 | + save_top_k=1, | ||
350 | ) | 429 | ) |
351 | if logging_callback is None: | 430 | if logging_callback is None: |
352 | logging_callback = LoggingCallback() | 431 | logging_callback = LoggingCallback() | ... | ... |
... | @@ -141,7 +141,11 @@ def invert_mask(attention_mask): | ... | @@ -141,7 +141,11 @@ def invert_mask(attention_mask): |
141 | 141 | ||
142 | 142 | ||
143 | def _prepare_bart_decoder_inputs( | 143 | def _prepare_bart_decoder_inputs( |
144 | - config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32 | 144 | + config, |
145 | + input_ids, | ||
146 | + decoder_input_ids=None, | ||
147 | + decoder_padding_mask=None, | ||
148 | + causal_mask_dtype=torch.float32, | ||
145 | ): | 149 | ): |
146 | """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if | 150 | """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if |
147 | none are provided. This mimics the default behavior in fairseq. To override it pass in masks. | 151 | none are provided. This mimics the default behavior in fairseq. To override it pass in masks. |
... | @@ -184,7 +188,9 @@ class PretrainedBartModel(PreTrainedModel): | ... | @@ -184,7 +188,9 @@ class PretrainedBartModel(PreTrainedModel): |
184 | @property | 188 | @property |
185 | def dummy_inputs(self): | 189 | def dummy_inputs(self): |
186 | pad_token = self.config.pad_token_id | 190 | pad_token = self.config.pad_token_id |
187 | - input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) | 191 | + input_ids = torch.tensor( |
192 | + [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device | ||
193 | + ) | ||
188 | dummy_inputs = { | 194 | dummy_inputs = { |
189 | "attention_mask": input_ids.ne(pad_token), | 195 | "attention_mask": input_ids.ne(pad_token), |
190 | "input_ids": input_ids, | 196 | "input_ids": input_ids, |
... | @@ -229,7 +235,11 @@ class EncoderLayer(nn.Module): | ... | @@ -229,7 +235,11 @@ class EncoderLayer(nn.Module): |
229 | def __init__(self, config: BartConfig): | 235 | def __init__(self, config: BartConfig): |
230 | super().__init__() | 236 | super().__init__() |
231 | self.embed_dim = config.d_model | 237 | self.embed_dim = config.d_model |
232 | - self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout) | 238 | + self.self_attn = Attention( |
239 | + self.embed_dim, | ||
240 | + config.encoder_attention_heads, | ||
241 | + dropout=config.attention_dropout, | ||
242 | + ) | ||
233 | self.normalize_before = config.normalize_before | 243 | self.normalize_before = config.normalize_before |
234 | self.self_attn_layer_norm = LayerNorm(self.embed_dim) | 244 | self.self_attn_layer_norm = LayerNorm(self.embed_dim) |
235 | self.dropout = config.dropout | 245 | self.dropout = config.dropout |
... | @@ -255,7 +265,10 @@ class EncoderLayer(nn.Module): | ... | @@ -255,7 +265,10 @@ class EncoderLayer(nn.Module): |
255 | if self.normalize_before: | 265 | if self.normalize_before: |
256 | x = self.self_attn_layer_norm(x) | 266 | x = self.self_attn_layer_norm(x) |
257 | x, attn_weights = self.self_attn( | 267 | x, attn_weights = self.self_attn( |
258 | - query=x, key=x, key_padding_mask=encoder_padding_mask, output_attentions=output_attentions | 268 | + query=x, |
269 | + key=x, | ||
270 | + key_padding_mask=encoder_padding_mask, | ||
271 | + output_attentions=output_attentions, | ||
259 | ) | 272 | ) |
260 | x = F.dropout(x, p=self.dropout, training=self.training) | 273 | x = F.dropout(x, p=self.dropout, training=self.training) |
261 | x = residual + x | 274 | x = residual + x |
... | @@ -308,13 +321,23 @@ class BartEncoder(nn.Module): | ... | @@ -308,13 +321,23 @@ class BartEncoder(nn.Module): |
308 | config.extra_pos_embeddings, | 321 | config.extra_pos_embeddings, |
309 | ) | 322 | ) |
310 | self.embed_patches = nn.Embedding(3, config.d_model, padding_idx=0) | 323 | self.embed_patches = nn.Embedding(3, config.d_model, padding_idx=0) |
311 | - self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)]) | 324 | + self.layers = nn.ModuleList( |
312 | - self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() | 325 | + [EncoderLayer(config) for _ in range(config.encoder_layers)] |
326 | + ) | ||
327 | + self.layernorm_embedding = ( | ||
328 | + LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() | ||
329 | + ) | ||
313 | # mbart has one extra layer_norm | 330 | # mbart has one extra layer_norm |
314 | self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None | 331 | self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None |
315 | 332 | ||
316 | def forward( | 333 | def forward( |
317 | - self, input_ids, patch_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False | 334 | + self, |
335 | + input_ids, | ||
336 | + patch_ids, | ||
337 | + attention_mask=None, | ||
338 | + output_attentions=False, | ||
339 | + output_hidden_states=False, | ||
340 | + return_dict=False, | ||
318 | ): | 341 | ): |
319 | """ | 342 | """ |
320 | Args: | 343 | Args: |
... | @@ -352,10 +375,14 @@ class BartEncoder(nn.Module): | ... | @@ -352,10 +375,14 @@ class BartEncoder(nn.Module): |
352 | encoder_states.append(x) | 375 | encoder_states.append(x) |
353 | # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) | 376 | # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) |
354 | dropout_probability = random.uniform(0, 1) | 377 | dropout_probability = random.uniform(0, 1) |
355 | - if self.training and (dropout_probability < self.layerdrop): # skip the layer | 378 | + if self.training and ( |
379 | + dropout_probability < self.layerdrop | ||
380 | + ): # skip the layer | ||
356 | attn = None | 381 | attn = None |
357 | else: | 382 | else: |
358 | - x, attn = encoder_layer(x, attention_mask, output_attentions=output_attentions) | 383 | + x, attn = encoder_layer( |
384 | + x, attention_mask, output_attentions=output_attentions | ||
385 | + ) | ||
359 | 386 | ||
360 | if output_attentions: | 387 | if output_attentions: |
361 | all_attentions = all_attentions + (attn,) | 388 | all_attentions = all_attentions + (attn,) |
... | @@ -365,14 +392,20 @@ class BartEncoder(nn.Module): | ... | @@ -365,14 +392,20 @@ class BartEncoder(nn.Module): |
365 | if output_hidden_states: | 392 | if output_hidden_states: |
366 | encoder_states.append(x) | 393 | encoder_states.append(x) |
367 | # T x B x C -> B x T x C | 394 | # T x B x C -> B x T x C |
368 | - encoder_states = tuple(hidden_state.transpose(0, 1) for hidden_state in encoder_states) | 395 | + encoder_states = tuple( |
396 | + hidden_state.transpose(0, 1) for hidden_state in encoder_states | ||
397 | + ) | ||
369 | 398 | ||
370 | # T x B x C -> B x T x C | 399 | # T x B x C -> B x T x C |
371 | x = x.transpose(0, 1) | 400 | x = x.transpose(0, 1) |
372 | 401 | ||
373 | if not return_dict: | 402 | if not return_dict: |
374 | - return tuple(v for v in [x, encoder_states, all_attentions] if v is not None) | 403 | + return tuple( |
375 | - return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions) | 404 | + v for v in [x, encoder_states, all_attentions] if v is not None |
405 | + ) | ||
406 | + return BaseModelOutput( | ||
407 | + last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions | ||
408 | + ) | ||
376 | 409 | ||
377 | 410 | ||
378 | class DecoderLayer(nn.Module): | 411 | class DecoderLayer(nn.Module): |
... | @@ -498,8 +531,12 @@ class BartDecoder(nn.Module): | ... | @@ -498,8 +531,12 @@ class BartDecoder(nn.Module): |
498 | self.layers = nn.ModuleList( | 531 | self.layers = nn.ModuleList( |
499 | [DecoderLayer(config) for _ in range(config.decoder_layers)] | 532 | [DecoderLayer(config) for _ in range(config.decoder_layers)] |
500 | ) # type: List[DecoderLayer] | 533 | ) # type: List[DecoderLayer] |
501 | - self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() | 534 | + self.layernorm_embedding = ( |
502 | - self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None | 535 | + LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() |
536 | + ) | ||
537 | + self.layer_norm = ( | ||
538 | + LayerNorm(config.d_model) if config.add_final_layer_norm else None | ||
539 | + ) | ||
503 | 540 | ||
504 | def forward( | 541 | def forward( |
505 | self, | 542 | self, |
... | @@ -595,23 +632,34 @@ class BartDecoder(nn.Module): | ... | @@ -595,23 +632,34 @@ class BartDecoder(nn.Module): |
595 | if use_cache: | 632 | if use_cache: |
596 | next_decoder_cache.append(layer_past.copy()) | 633 | next_decoder_cache.append(layer_past.copy()) |
597 | 634 | ||
598 | - if self.layer_norm and (idx == len(self.layers) - 1): # if config.add_final_layer_norm (mBART) | 635 | + if self.layer_norm and ( |
636 | + idx == len(self.layers) - 1 | ||
637 | + ): # if config.add_final_layer_norm (mBART) | ||
599 | x = self.layer_norm(x) | 638 | x = self.layer_norm(x) |
600 | if output_attentions: | 639 | if output_attentions: |
601 | all_self_attns += (layer_self_attn,) | 640 | all_self_attns += (layer_self_attn,) |
602 | 641 | ||
603 | # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) | 642 | # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) |
604 | if output_hidden_states: | 643 | if output_hidden_states: |
605 | - all_hidden_states = tuple(hidden_state.transpose(0, 1) for hidden_state in all_hidden_states) | 644 | + all_hidden_states = tuple( |
645 | + hidden_state.transpose(0, 1) for hidden_state in all_hidden_states | ||
646 | + ) | ||
606 | x = x.transpose(0, 1) | 647 | x = x.transpose(0, 1) |
607 | encoder_hidden_states = encoder_hidden_states.transpose(0, 1) | 648 | encoder_hidden_states = encoder_hidden_states.transpose(0, 1) |
608 | 649 | ||
609 | next_cache = next_decoder_cache if use_cache else None | 650 | next_cache = next_decoder_cache if use_cache else None |
610 | 651 | ||
611 | if not return_dict: | 652 | if not return_dict: |
612 | - return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None) | 653 | + return tuple( |
654 | + v | ||
655 | + for v in [x, next_cache, all_hidden_states, all_self_attns] | ||
656 | + if v is not None | ||
657 | + ) | ||
613 | return BaseModelOutputWithPast( | 658 | return BaseModelOutputWithPast( |
614 | - last_hidden_state=x, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns | 659 | + last_hidden_state=x, |
660 | + past_key_values=next_cache, | ||
661 | + hidden_states=all_hidden_states, | ||
662 | + attentions=all_self_attns, | ||
615 | ) | 663 | ) |
616 | 664 | ||
617 | 665 | ||
... | @@ -638,7 +686,9 @@ class Attention(nn.Module): | ... | @@ -638,7 +686,9 @@ class Attention(nn.Module): |
638 | self.num_heads = num_heads | 686 | self.num_heads = num_heads |
639 | self.dropout = dropout | 687 | self.dropout = dropout |
640 | self.head_dim = embed_dim // num_heads | 688 | self.head_dim = embed_dim // num_heads |
641 | - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" | 689 | + assert ( |
690 | + self.head_dim * num_heads == self.embed_dim | ||
691 | + ), "embed_dim must be divisible by num_heads" | ||
642 | self.scaling = self.head_dim ** -0.5 | 692 | self.scaling = self.head_dim ** -0.5 |
643 | 693 | ||
644 | self.encoder_decoder_attention = encoder_decoder_attention | 694 | self.encoder_decoder_attention = encoder_decoder_attention |
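The assert above guards the head split that the _shape helper (reformatted in the next hunk) performs. A small shape walk-through with illustrative numbers:

import torch

seq_len, bsz, embed_dim, num_heads = 7, 2, 16, 4
head_dim = embed_dim // num_heads            # 4; must divide evenly, hence the assert
scaling = head_dim ** -0.5                   # queries are scaled by this before the bmm
x = torch.randn(seq_len, bsz, embed_dim)     # (time, batch, model_dim) layout used in this file
heads = x.contiguous().view(seq_len, bsz * num_heads, head_dim).transpose(0, 1)
print(heads.shape)  # torch.Size([8, 7, 4]) == (bsz * num_heads, seq_len, head_dim)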
... | @@ -649,7 +699,11 @@ class Attention(nn.Module): | ... | @@ -649,7 +699,11 @@ class Attention(nn.Module): |
649 | self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self" | 699 | self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self" |
650 | 700 | ||
651 | def _shape(self, tensor, seq_len, bsz): | 701 | def _shape(self, tensor, seq_len, bsz): |
652 | - return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1) | 702 | + return ( |
703 | + tensor.contiguous() | ||
704 | + .view(seq_len, bsz * self.num_heads, self.head_dim) | ||
705 | + .transpose(0, 1) | ||
706 | + ) | ||
653 | 707 | ||
654 | def forward( | 708 | def forward( |
655 | self, | 709 | self, |
... | @@ -693,7 +747,9 @@ class Attention(nn.Module): | ... | @@ -693,7 +747,9 @@ class Attention(nn.Module): |
693 | v = self._shape(v, -1, bsz) | 747 | v = self._shape(v, -1, bsz) |
694 | 748 | ||
695 | if saved_state is not None: | 749 | if saved_state is not None: |
696 | - k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz) | 750 | + k, v, key_padding_mask = self._use_saved_state( |
751 | + k, v, saved_state, key_padding_mask, static_kv, bsz | ||
752 | + ) | ||
697 | 753 | ||
698 | # Update cache | 754 | # Update cache |
699 | layer_state[self.cache_key] = { | 755 | layer_state[self.cache_key] = { |
... | @@ -708,7 +764,9 @@ class Attention(nn.Module): | ... | @@ -708,7 +764,9 @@ class Attention(nn.Module): |
708 | assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len) | 764 | assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len) |
709 | 765 | ||
710 | if attn_mask is not None: | 766 | if attn_mask is not None: |
711 | - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask | 767 | + attn_weights = ( |
768 | + attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask | ||
769 | + ) | ||
712 | attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) | 770 | attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) |
713 | 771 | ||
714 | # This is part of a workaround to get around fork/join parallelism not supporting Optional types. | 772 | # This is part of a workaround to get around fork/join parallelism not supporting Optional types. |
... | @@ -725,16 +783,14 @@ class Attention(nn.Module): | ... | @@ -725,16 +783,14 @@ class Attention(nn.Module): |
725 | attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) | 783 | attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) |
726 | attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) | 784 | attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) |
727 | attn_weights = F.softmax(attn_weights, dim=-1) | 785 | attn_weights = F.softmax(attn_weights, dim=-1) |
728 | - attn_probs = F.dropout( | 786 | + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,) |
729 | - attn_weights, | ||
730 | - p=self.dropout, | ||
731 | - training=self.training, | ||
732 | - ) | ||
733 | 787 | ||
734 | assert v is not None | 788 | assert v is not None |
735 | attn_output = torch.bmm(attn_probs, v) | 789 | attn_output = torch.bmm(attn_probs, v) |
736 | assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim) | 790 | assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim) |
737 | - attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) | 791 | + attn_output = ( |
792 | + attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) | ||
793 | + ) | ||
738 | attn_output = self.out_proj(attn_output) | 794 | attn_output = self.out_proj(attn_output) |
739 | if output_attentions: | 795 | if output_attentions: |
740 | attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) | 796 | attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) |
... | @@ -763,12 +819,16 @@ class Attention(nn.Module): | ... | @@ -763,12 +819,16 @@ class Attention(nn.Module): |
763 | assert v is not None | 819 | assert v is not None |
764 | v = torch.cat([prev_value, v], dim=1) | 820 | v = torch.cat([prev_value, v], dim=1) |
765 | assert k is not None and v is not None | 821 | assert k is not None and v is not None |
766 | - prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None) | 822 | + prev_key_padding_mask: Optional[Tensor] = saved_state.get( |
823 | + "prev_key_padding_mask", None | ||
824 | + ) | ||
767 | if prev_key_padding_mask is not None: | 825 | if prev_key_padding_mask is not None: |
768 | if static_kv: | 826 | if static_kv: |
769 | new_key_padding_mask = prev_key_padding_mask | 827 | new_key_padding_mask = prev_key_padding_mask |
770 | else: | 828 | else: |
771 | - new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1) | 829 | + new_key_padding_mask = torch.cat( |
830 | + [prev_key_padding_mask, key_padding_mask], dim=1 | ||
831 | + ) | ||
772 | else: | 832 | else: |
773 | new_key_padding_mask = key_padding_mask | 833 | new_key_padding_mask = key_padding_mask |
774 | return k, v, new_key_padding_mask | 834 | return k, v, new_key_padding_mask |
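A note on the `_use_saved_state` hunk above: during incremental decoding the layer keeps the keys and values projected at earlier steps and appends the projection for the newest token along the time axis. A minimal standalone sketch of that append (hypothetical shapes, plain tensors rather than the module's cache dict):

import torch

bsz_times_heads, head_dim = 2 * 16, 64                # hypothetical: batch * num_heads, per-head dim
prev_key = torch.randn(bsz_times_heads, 5, head_dim)  # keys cached from the first five decoded tokens
new_key = torch.randn(bsz_times_heads, 1, head_dim)   # key projected for the token decoded this step
k = torch.cat([prev_key, new_key], dim=1)             # time axis grows by one, mirroring torch.cat([prev_key, k], dim=1) above
print(k.shape)                                        # torch.Size([32, 6, 64])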
... | @@ -780,11 +840,7 @@ class BartClassificationHead(nn.Module): | ... | @@ -780,11 +840,7 @@ class BartClassificationHead(nn.Module): |
780 | # This can trivially be shared with RobertaClassificationHead | 840 | # This can trivially be shared with RobertaClassificationHead |
781 | 841 | ||
782 | def __init__( | 842 | def __init__( |
783 | - self, | 843 | + self, input_dim, inner_dim, num_classes, pooler_dropout, |
784 | - input_dim, | ||
785 | - inner_dim, | ||
786 | - num_classes, | ||
787 | - pooler_dropout, | ||
788 | ): | 844 | ): |
789 | super().__init__() | 845 | super().__init__() |
790 | self.dense = nn.Linear(input_dim, inner_dim) | 846 | self.dense = nn.Linear(input_dim, inner_dim) |
... | @@ -808,7 +864,9 @@ class LearnedPositionalEmbedding(nn.Embedding): | ... | @@ -808,7 +864,9 @@ class LearnedPositionalEmbedding(nn.Embedding): |
808 | position ids are passed to the forward function. | 864 | position ids are passed to the forward function. |
809 | """ | 865 | """ |
810 | 866 | ||
811 | - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset): | 867 | + def __init__( |
868 | + self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset | ||
869 | + ): | ||
812 | # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 | 870 | # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 |
813 | # and adjust num_embeddings appropriately. Other models dont have this hack | 871 | # and adjust num_embeddings appropriately. Other models dont have this hack |
814 | self.offset = offset | 872 | self.offset = offset |
... | @@ -820,10 +878,14 @@ class LearnedPositionalEmbedding(nn.Embedding): | ... | @@ -820,10 +878,14 @@ class LearnedPositionalEmbedding(nn.Embedding): |
820 | """Input is expected to be of size [bsz x seqlen].""" | 878 | """Input is expected to be of size [bsz x seqlen].""" |
821 | bsz, seq_len = input_ids.shape[:2] | 879 | bsz, seq_len = input_ids.shape[:2] |
822 | if use_cache: | 880 | if use_cache: |
823 | - positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing | 881 | + positions = input_ids.data.new(1, 1).fill_( |
882 | + seq_len - 1 | ||
883 | + ) # called before slicing | ||
824 | else: | 884 | else: |
825 | # starts at 0, ends at 1-seq_len | 885 | # starts at 0, ends at 1-seq_len |
826 | - positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device) | 886 | + positions = torch.arange( |
887 | + seq_len, dtype=torch.long, device=self.weight.device | ||
888 | + ) | ||
827 | return super().forward(positions + self.offset) | 889 | return super().forward(positions + self.offset) |
828 | 890 | ||
829 | 891 | ||
... | @@ -896,16 +958,28 @@ class BartModel(PretrainedBartModel): | ... | @@ -896,16 +958,28 @@ class BartModel(PretrainedBartModel): |
896 | if decoder_input_ids is None: | 958 | if decoder_input_ids is None: |
897 | use_cache = False | 959 | use_cache = False |
898 | 960 | ||
899 | - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | 961 | + output_attentions = ( |
962 | + output_attentions | ||
963 | + if output_attentions is not None | ||
964 | + else self.config.output_attentions | ||
965 | + ) | ||
900 | output_hidden_states = ( | 966 | output_hidden_states = ( |
901 | - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | 967 | + output_hidden_states |
968 | + if output_hidden_states is not None | ||
969 | + else self.config.output_hidden_states | ||
902 | ) | 970 | ) |
903 | use_cache = use_cache if use_cache is not None else self.config.use_cache | 971 | use_cache = use_cache if use_cache is not None else self.config.use_cache |
904 | - return_dict = return_dict if return_dict is not None else self.config.use_return_dict | 972 | + return_dict = ( |
973 | + return_dict if return_dict is not None else self.config.use_return_dict | ||
974 | + ) | ||
905 | 975 | ||
906 | # make masks if user doesn't supply | 976 | # make masks if user doesn't supply |
907 | if not use_cache: | 977 | if not use_cache: |
908 | - decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_bart_decoder_inputs( | 978 | + ( |
979 | + decoder_input_ids, | ||
980 | + decoder_padding_mask, | ||
981 | + causal_mask, | ||
982 | + ) = _prepare_bart_decoder_inputs( | ||
909 | self.config, | 983 | self.config, |
910 | input_ids, | 984 | input_ids, |
911 | decoder_input_ids=decoder_input_ids, | 985 | decoder_input_ids=decoder_input_ids, |
... | @@ -974,17 +1048,24 @@ class BartModel(PretrainedBartModel): | ... | @@ -974,17 +1048,24 @@ class BartModel(PretrainedBartModel): |
974 | 1048 | ||
975 | 1049 | ||
976 | @add_start_docstrings( | 1050 | @add_start_docstrings( |
977 | - "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING | 1051 | + "The BART Model with a language modeling head. Can be used for summarization.", |
1052 | + BART_START_DOCSTRING, | ||
978 | ) | 1053 | ) |
979 | class BartForConditionalGeneration(PretrainedBartModel): | 1054 | class BartForConditionalGeneration(PretrainedBartModel): |
980 | base_model_prefix = "model" | 1055 | base_model_prefix = "model" |
981 | - authorized_missing_keys = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"] | 1056 | + authorized_missing_keys = [ |
1057 | + r"final_logits_bias", | ||
1058 | + r"encoder\.version", | ||
1059 | + r"decoder\.version", | ||
1060 | + ] | ||
982 | 1061 | ||
983 | def __init__(self, config: BartConfig): | 1062 | def __init__(self, config: BartConfig): |
984 | super().__init__(config) | 1063 | super().__init__(config) |
985 | base_model = BartModel(config) | 1064 | base_model = BartModel(config) |
986 | self.model = base_model | 1065 | self.model = base_model |
987 | - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) | 1066 | + self.register_buffer( |
1067 | + "final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)) | ||
1068 | + ) | ||
988 | 1069 | ||
989 | def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: | 1070 | def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: |
990 | old_num_tokens = self.model.shared.num_embeddings | 1071 | old_num_tokens = self.model.shared.num_embeddings |
... | @@ -993,16 +1074,23 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -993,16 +1074,23 @@ class BartForConditionalGeneration(PretrainedBartModel): |
993 | self._resize_final_logits_bias(new_num_tokens, old_num_tokens) | 1074 | self._resize_final_logits_bias(new_num_tokens, old_num_tokens) |
994 | return new_embeddings | 1075 | return new_embeddings |
995 | 1076 | ||
996 | - def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None: | 1077 | + def _resize_final_logits_bias( |
1078 | + self, new_num_tokens: int, old_num_tokens: int | ||
1079 | + ) -> None: | ||
997 | if new_num_tokens <= old_num_tokens: | 1080 | if new_num_tokens <= old_num_tokens: |
998 | new_bias = self.final_logits_bias[:, :new_num_tokens] | 1081 | new_bias = self.final_logits_bias[:, :new_num_tokens] |
999 | else: | 1082 | else: |
1000 | - extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) | 1083 | + extra_bias = torch.zeros( |
1084 | + (1, new_num_tokens - old_num_tokens), | ||
1085 | + device=self.final_logits_bias.device, | ||
1086 | + ) | ||
1001 | new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) | 1087 | new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) |
1002 | self.register_buffer("final_logits_bias", new_bias) | 1088 | self.register_buffer("final_logits_bias", new_bias) |
1003 | 1089 | ||
1004 | @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) | 1090 | @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) |
1005 | - @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) | 1091 | + @replace_return_docstrings( |
1092 | + output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC | ||
1093 | + ) | ||
1006 | @add_end_docstrings(BART_GENERATION_EXAMPLE) | 1094 | @add_end_docstrings(BART_GENERATION_EXAMPLE) |
1007 | def forward( | 1095 | def forward( |
1008 | self, | 1096 | self, |
... | @@ -1065,7 +1153,9 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -1065,7 +1153,9 @@ class BartForConditionalGeneration(PretrainedBartModel): |
1065 | FutureWarning, | 1153 | FutureWarning, |
1066 | ) | 1154 | ) |
1067 | past_key_values = unused.pop("decoder_past_key_values") | 1155 | past_key_values = unused.pop("decoder_past_key_values") |
1068 | - return_dict = return_dict if return_dict is not None else self.config.use_return_dict | 1156 | + return_dict = ( |
1157 | + return_dict if return_dict is not None else self.config.use_return_dict | ||
1158 | + ) | ||
1069 | 1159 | ||
1070 | if labels is not None: | 1160 | if labels is not None: |
1071 | use_cache = False | 1161 | use_cache = False |
... | @@ -1085,17 +1175,23 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -1085,17 +1175,23 @@ class BartForConditionalGeneration(PretrainedBartModel): |
1085 | output_hidden_states=output_hidden_states, | 1175 | output_hidden_states=output_hidden_states, |
1086 | return_dict=return_dict, | 1176 | return_dict=return_dict, |
1087 | ) | 1177 | ) |
1088 | - lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) | 1178 | + lm_logits = F.linear( |
1179 | + outputs[0], self.model.shared.weight, bias=self.final_logits_bias | ||
1180 | + ) | ||
1089 | 1181 | ||
1090 | masked_lm_loss = None | 1182 | masked_lm_loss = None |
1091 | if labels is not None: | 1183 | if labels is not None: |
1092 | loss_fct = CrossEntropyLoss() | 1184 | loss_fct = CrossEntropyLoss() |
1093 | # TODO(SS): do we need to ignore pad tokens in labels? | 1185 | # TODO(SS): do we need to ignore pad tokens in labels? |
1094 | - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) | 1186 | + masked_lm_loss = loss_fct( |
1187 | + lm_logits.view(-1, self.config.vocab_size), labels.view(-1) | ||
1188 | + ) | ||
1095 | 1189 | ||
1096 | if not return_dict: | 1190 | if not return_dict: |
1097 | output = (lm_logits,) + outputs[1:] | 1191 | output = (lm_logits,) + outputs[1:] |
1098 | - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output | 1192 | + return ( |
1193 | + ((masked_lm_loss,) + output) if masked_lm_loss is not None else output | ||
1194 | + ) | ||
1099 | 1195 | ||
1100 | return Seq2SeqLMOutput( | 1196 | return Seq2SeqLMOutput( |
1101 | loss=masked_lm_loss, | 1197 | loss=masked_lm_loss, |
... | @@ -1109,7 +1205,13 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -1109,7 +1205,13 @@ class BartForConditionalGeneration(PretrainedBartModel): |
1109 | ) | 1205 | ) |
1110 | 1206 | ||
1111 | def prepare_inputs_for_generation( | 1207 | def prepare_inputs_for_generation( |
1112 | - self, decoder_input_ids, past, attention_mask, use_cache, encoder_outputs, **kwargs | 1208 | + self, |
1209 | + decoder_input_ids, | ||
1210 | + past, | ||
1211 | + attention_mask, | ||
1212 | + use_cache, | ||
1213 | + encoder_outputs, | ||
1214 | + **kwargs, | ||
1113 | ): | 1215 | ): |
1114 | return { | 1216 | return { |
1115 | "input_ids": None, # encoder_outputs is defined. input_ids not needed | 1217 | "input_ids": None, # encoder_outputs is defined. input_ids not needed |
... | @@ -1130,7 +1232,9 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -1130,7 +1232,9 @@ class BartForConditionalGeneration(PretrainedBartModel): |
1130 | 1232 | ||
1131 | def _force_token_ids_generation(self, scores, token_id) -> None: | 1233 | def _force_token_ids_generation(self, scores, token_id) -> None: |
1132 | """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" | 1234 | """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" |
1133 | - scores[:, [x for x in range(self.config.vocab_size) if x != token_id]] = -float("inf") | 1235 | + scores[:, [x for x in range(self.config.vocab_size) if x != token_id]] = -float( |
1236 | + "inf" | ||
1237 | + ) | ||
1134 | 1238 | ||
1135 | @staticmethod | 1239 | @staticmethod |
1136 | def _reorder_cache(past, beam_idx): | 1240 | def _reorder_cache(past, beam_idx): |
... | @@ -1138,7 +1242,8 @@ class BartForConditionalGeneration(PretrainedBartModel): | ... | @@ -1138,7 +1242,8 @@ class BartForConditionalGeneration(PretrainedBartModel): |
1138 | for layer_past in past: | 1242 | for layer_past in past: |
1139 | # get the correct batch idx from decoder layer's batch dim for cross and self-attn | 1243 | # get the correct batch idx from decoder layer's batch dim for cross and self-attn |
1140 | layer_past_new = { | 1244 | layer_past_new = { |
1141 | - attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() | 1245 | + attn_key: _reorder_buffer(attn_cache, beam_idx) |
1246 | + for attn_key, attn_cache in layer_past.items() | ||
1142 | } | 1247 | } |
1143 | reordered_past.append(layer_past_new) | 1248 | reordered_past.append(layer_past_new) |
1144 | return reordered_past | 1249 | return reordered_past |
... | @@ -1159,10 +1264,7 @@ class BartForSequenceClassification(PretrainedBartModel): | ... | @@ -1159,10 +1264,7 @@ class BartForSequenceClassification(PretrainedBartModel): |
1159 | super().__init__(config, **kwargs) | 1264 | super().__init__(config, **kwargs) |
1160 | self.model = BartModel(config) | 1265 | self.model = BartModel(config) |
1161 | self.classification_head = BartClassificationHead( | 1266 | self.classification_head = BartClassificationHead( |
1162 | - config.d_model, | 1267 | + config.d_model, config.d_model, config.num_labels, config.classif_dropout, |
1163 | - config.d_model, | ||
1164 | - config.num_labels, | ||
1165 | - config.classif_dropout, | ||
1166 | ) | 1268 | ) |
1167 | self.model._init_weights(self.classification_head.dense) | 1269 | self.model._init_weights(self.classification_head.dense) |
1168 | self.model._init_weights(self.classification_head.out_proj) | 1270 | self.model._init_weights(self.classification_head.out_proj) |
... | @@ -1193,7 +1295,9 @@ class BartForSequenceClassification(PretrainedBartModel): | ... | @@ -1193,7 +1295,9 @@ class BartForSequenceClassification(PretrainedBartModel): |
1193 | Indices should be in :obj:`[0, ..., config.num_labels - 1]`. | 1295 | Indices should be in :obj:`[0, ..., config.num_labels - 1]`. |
1194 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). | 1296 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). |
1195 | """ | 1297 | """ |
1196 | - return_dict = return_dict if return_dict is not None else self.config.use_return_dict | 1298 | + return_dict = ( |
1299 | + return_dict if return_dict is not None else self.config.use_return_dict | ||
1300 | + ) | ||
1197 | if labels is not None: | 1301 | if labels is not None: |
1198 | use_cache = False | 1302 | use_cache = False |
1199 | 1303 | ||
... | @@ -1212,7 +1316,9 @@ class BartForSequenceClassification(PretrainedBartModel): | ... | @@ -1212,7 +1316,9 @@ class BartForSequenceClassification(PretrainedBartModel): |
1212 | eos_mask = input_ids.eq(self.config.eos_token_id) | 1316 | eos_mask = input_ids.eq(self.config.eos_token_id) |
1213 | if len(torch.unique(eos_mask.sum(1))) > 1: | 1317 | if len(torch.unique(eos_mask.sum(1))) > 1: |
1214 | raise ValueError("All examples must have the same number of <eos> tokens.") | 1318 | raise ValueError("All examples must have the same number of <eos> tokens.") |
1215 | - sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] | 1319 | + sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[ |
1320 | + :, -1, : | ||
1321 | + ] | ||
1216 | logits = self.classification_head(sentence_representation) | 1322 | logits = self.classification_head(sentence_representation) |
1217 | 1323 | ||
1218 | loss = None | 1324 | loss = None |
... | @@ -1284,7 +1390,9 @@ class BartForQuestionAnswering(PretrainedBartModel): | ... | @@ -1284,7 +1390,9 @@ class BartForQuestionAnswering(PretrainedBartModel): |
1284 | Positions are clamped to the length of the sequence (`sequence_length`). | 1390 | Positions are clamped to the length of the sequence (`sequence_length`). |
1285 | Position outside of the sequence are not taken into account for computing the loss. | 1391 | Position outside of the sequence are not taken into account for computing the loss. |
1286 | """ | 1392 | """ |
1287 | - return_dict = return_dict if return_dict is not None else self.config.use_return_dict | 1393 | + return_dict = ( |
1394 | + return_dict if return_dict is not None else self.config.use_return_dict | ||
1395 | + ) | ||
1288 | if start_positions is not None and end_positions is not None: | 1396 | if start_positions is not None and end_positions is not None: |
1289 | use_cache = False | 1397 | use_cache = False |
1290 | 1398 | ||
... | @@ -1325,10 +1433,7 @@ class BartForQuestionAnswering(PretrainedBartModel): | ... | @@ -1325,10 +1433,7 @@ class BartForQuestionAnswering(PretrainedBartModel): |
1325 | total_loss = (start_loss + end_loss) / 2 | 1433 | total_loss = (start_loss + end_loss) / 2 |
1326 | 1434 | ||
1327 | if not return_dict: | 1435 | if not return_dict: |
1328 | - output = ( | 1436 | + output = (start_logits, end_logits,) + outputs[1:] |
1329 | - start_logits, | ||
1330 | - end_logits, | ||
1331 | - ) + outputs[1:] | ||
1332 | return ((total_loss,) + output) if total_loss is not None else output | 1437 | return ((total_loss,) + output) if total_loss is not None else output |
1333 | 1438 | ||
1334 | return Seq2SeqQuestionAnsweringModelOutput( | 1439 | return Seq2SeqQuestionAnsweringModelOutput( |
... | @@ -1350,7 +1455,9 @@ class SinusoidalPositionalEmbedding(nn.Embedding): | ... | @@ -1350,7 +1455,9 @@ class SinusoidalPositionalEmbedding(nn.Embedding): |
1350 | def __init__(self, num_positions, embedding_dim, padding_idx=None): | 1455 | def __init__(self, num_positions, embedding_dim, padding_idx=None): |
1351 | super().__init__(num_positions, embedding_dim) | 1456 | super().__init__(num_positions, embedding_dim) |
1352 | if embedding_dim % 2 != 0: | 1457 | if embedding_dim % 2 != 0: |
1353 | - raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") | 1458 | + raise NotImplementedError( |
1459 | + f"odd embedding_dim {embedding_dim} not supported" | ||
1460 | + ) | ||
1354 | self.weight = self._init_weight(self.weight) | 1461 | self.weight = self._init_weight(self.weight) |
1355 | 1462 | ||
1356 | @staticmethod | 1463 | @staticmethod |
... | @@ -1360,9 +1467,14 @@ class SinusoidalPositionalEmbedding(nn.Embedding): | ... | @@ -1360,9 +1467,14 @@ class SinusoidalPositionalEmbedding(nn.Embedding): |
1360 | """ | 1467 | """ |
1361 | n_pos, dim = out.shape | 1468 | n_pos, dim = out.shape |
1362 | position_enc = np.array( | 1469 | position_enc = np.array( |
1363 | - [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] | 1470 | + [ |
1471 | + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] | ||
1472 | + for pos in range(n_pos) | ||
1473 | + ] | ||
1364 | ) | 1474 | ) |
1365 | - out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) # This line breaks for odd n_pos | 1475 | + out[:, 0 : dim // 2] = torch.FloatTensor( |
1476 | + np.sin(position_enc[:, 0::2]) | ||
1477 | + ) # This line breaks for odd n_pos | ||
1366 | out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) | 1478 | out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) |
1367 | out.detach_() | 1479 | out.detach_() |
1368 | out.requires_grad = False | 1480 | out.requires_grad = False |
... | @@ -1373,8 +1485,12 @@ class SinusoidalPositionalEmbedding(nn.Embedding): | ... | @@ -1373,8 +1485,12 @@ class SinusoidalPositionalEmbedding(nn.Embedding): |
1373 | """Input is expected to be of size [bsz x seqlen].""" | 1485 | """Input is expected to be of size [bsz x seqlen].""" |
1374 | bsz, seq_len = input_ids.shape[:2] | 1486 | bsz, seq_len = input_ids.shape[:2] |
1375 | if use_cache: | 1487 | if use_cache: |
1376 | - positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing | 1488 | + positions = input_ids.data.new(1, 1).fill_( |
1489 | + seq_len - 1 | ||
1490 | + ) # called before slicing | ||
1377 | else: | 1491 | else: |
1378 | # starts at 0, ends at 1-seq_len | 1492 | # starts at 0, ends at 1-seq_len |
1379 | - positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device) | 1493 | + positions = torch.arange( |
1494 | + seq_len, dtype=torch.long, device=self.weight.device | ||
1495 | + ) | ||
1380 | return super().forward(positions) | 1496 | return super().forward(positions) | ... | ... |
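The sinusoidal table that `SinusoidalPositionalEmbedding._init_weight` fills above follows the usual angle rule pos / 10000^(2 * (j // 2) / dim), with sines written to the first half of the channels and cosines to the second half. A minimal standalone sketch of the same computation (the helper name `build_sinusoidal_table` is hypothetical):

import numpy as np
import torch

def build_sinusoidal_table(n_pos: int, dim: int) -> torch.Tensor:
    # Mirrors _init_weight above; assumes an even `dim`, as the NotImplementedError there enforces.
    position_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    out = torch.zeros(n_pos, dim)
    out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))  # even-indexed angles -> sin, first half
    out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))    # odd-indexed angles -> cos, second half
    return out

print(build_sinusoidal_table(4, 6).shape)  # torch.Size([4, 6])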
... | @@ -80,7 +80,9 @@ def find_pruneable_heads_and_indices( | ... | @@ -80,7 +80,9 @@ def find_pruneable_heads_and_indices( |
80 | :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. | 80 | :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. |
81 | """ | 81 | """ |
82 | mask = torch.ones(n_heads, head_size) | 82 | mask = torch.ones(n_heads, head_size) |
83 | - heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads | 83 | + heads = ( |
84 | + set(heads) - already_pruned_heads | ||
85 | + ) # Convert to set and remove already pruned heads | ||
84 | for head in heads: | 86 | for head in heads: |
85 | # Compute how many pruned heads are before the head and move the index accordingly | 87 | # Compute how many pruned heads are before the head and move the index accordingly |
86 | head = head - sum(1 if h < head else 0 for h in already_pruned_heads) | 88 | head = head - sum(1 if h < head else 0 for h in already_pruned_heads) |
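In `find_pruneable_heads_and_indices` above, each requested head index is shifted left by the number of already-pruned heads that precede it, so it addresses the correct row of the already-shrunken weight matrix. A small illustration with hypothetical numbers:

already_pruned_heads = {1, 3}
heads = {4}
# Two lower-numbered heads are already gone, so head 4 now lives at index 4 - 2 = 2.
adjusted = {h - sum(1 if p < h else 0 for p in already_pruned_heads) for h in heads}
print(adjusted)  # {2}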
... | @@ -106,7 +108,11 @@ class ModuleUtilsMixin: | ... | @@ -106,7 +108,11 @@ class ModuleUtilsMixin: |
106 | Returns: | 108 | Returns: |
107 | :obj:`int`: The number of parameters. | 109 | :obj:`int`: The number of parameters. |
108 | """ | 110 | """ |
109 | - params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() | 111 | + params = ( |
112 | + filter(lambda x: x.requires_grad, self.parameters()) | ||
113 | + if only_trainable | ||
114 | + else self.parameters() | ||
115 | + ) | ||
110 | return sum(p.numel() for p in params) | 116 | return sum(p.numel() for p in params) |
111 | 117 | ||
112 | @staticmethod | 118 | @staticmethod |
... | @@ -114,7 +120,9 @@ class ModuleUtilsMixin: | ... | @@ -114,7 +120,9 @@ class ModuleUtilsMixin: |
114 | try: | 120 | try: |
115 | import psutil | 121 | import psutil |
116 | except (ImportError): | 122 | except (ImportError): |
117 | - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") | 123 | + raise ImportError( |
124 | + "You need to install psutil (pip install psutil) to use memory tracing." | ||
125 | + ) | ||
118 | 126 | ||
119 | process = psutil.Process(os.getpid()) | 127 | process = psutil.Process(os.getpid()) |
120 | mem = process.memory_info() | 128 | mem = process.memory_info() |
... | @@ -126,13 +134,17 @@ class ModuleUtilsMixin: | ... | @@ -126,13 +134,17 @@ class ModuleUtilsMixin: |
126 | try: | 134 | try: |
127 | import psutil | 135 | import psutil |
128 | except (ImportError): | 136 | except (ImportError): |
129 | - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") | 137 | + raise ImportError( |
138 | + "You need to install psutil (pip install psutil) to use memory tracing." | ||
139 | + ) | ||
130 | 140 | ||
131 | process = psutil.Process(os.getpid()) | 141 | process = psutil.Process(os.getpid()) |
132 | mem = process.memory_info() | 142 | mem = process.memory_info() |
133 | module.mem_rss_post_forward = mem.rss | 143 | module.mem_rss_post_forward = mem.rss |
134 | mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward | 144 | mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward |
135 | - module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) | 145 | + module.mem_rss_diff = mem_rss_diff + ( |
146 | + module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0 | ||
147 | + ) | ||
136 | return None | 148 | return None |
137 | 149 | ||
138 | def add_memory_hooks(self): | 150 | def add_memory_hooks(self): |
... | @@ -169,7 +181,9 @@ class ModuleUtilsMixin: | ... | @@ -169,7 +181,9 @@ class ModuleUtilsMixin: |
169 | # For nn.DataParallel compatibility in PyTorch 1.5 | 181 | # For nn.DataParallel compatibility in PyTorch 1.5 |
170 | 182 | ||
171 | def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: | 183 | def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: |
172 | - tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] | 184 | + tuples = [ |
185 | + (k, v) for k, v in module.__dict__.items() if torch.is_tensor(v) | ||
186 | + ] | ||
173 | return tuples | 187 | return tuples |
174 | 188 | ||
175 | gen = self._named_members(get_members_fn=find_tensor_attributes) | 189 | gen = self._named_members(get_members_fn=find_tensor_attributes) |
... | @@ -187,7 +201,9 @@ class ModuleUtilsMixin: | ... | @@ -187,7 +201,9 @@ class ModuleUtilsMixin: |
187 | # For nn.DataParallel compatibility in PyTorch 1.5 | 201 | # For nn.DataParallel compatibility in PyTorch 1.5 |
188 | 202 | ||
189 | def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: | 203 | def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: |
190 | - tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] | 204 | + tuples = [ |
205 | + (k, v) for k, v in module.__dict__.items() if torch.is_tensor(v) | ||
206 | + ] | ||
191 | return tuples | 207 | return tuples |
192 | 208 | ||
193 | gen = self._named_members(get_members_fn=find_tensor_attributes) | 209 | gen = self._named_members(get_members_fn=find_tensor_attributes) |
... | @@ -213,12 +229,18 @@ class ModuleUtilsMixin: | ... | @@ -213,12 +229,18 @@ class ModuleUtilsMixin: |
213 | # /transformer/transformer_layers.py#L270 | 229 | # /transformer/transformer_layers.py#L270 |
214 | # encoder_extended_attention_mask = (encoder_extended_attention_mask == | 230 | # encoder_extended_attention_mask = (encoder_extended_attention_mask == |
215 | # encoder_extended_attention_mask.transpose(-1, -2)) | 231 | # encoder_extended_attention_mask.transpose(-1, -2)) |
216 | - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility | 232 | + encoder_extended_attention_mask = encoder_extended_attention_mask.to( |
233 | + dtype=self.dtype | ||
234 | + ) # fp16 compatibility | ||
217 | 235 | ||
218 | if self.dtype == torch.float16: | 236 | if self.dtype == torch.float16: |
219 | - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 | 237 | + encoder_extended_attention_mask = ( |
238 | + 1.0 - encoder_extended_attention_mask | ||
239 | + ) * -1e4 | ||
220 | elif self.dtype == torch.float32: | 240 | elif self.dtype == torch.float32: |
221 | - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 | 241 | + encoder_extended_attention_mask = ( |
242 | + 1.0 - encoder_extended_attention_mask | ||
243 | + ) * -1e9 | ||
222 | else: | 244 | else: |
223 | raise ValueError( | 245 | raise ValueError( |
224 | "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( | 246 | "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format( |
... | @@ -228,7 +250,9 @@ class ModuleUtilsMixin: | ... | @@ -228,7 +250,9 @@ class ModuleUtilsMixin: |
228 | 250 | ||
229 | return encoder_extended_attention_mask | 251 | return encoder_extended_attention_mask |
230 | 252 | ||
231 | - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor: | 253 | + def get_extended_attention_mask( |
254 | + self, attention_mask: Tensor, input_shape: Tuple[int], device: device | ||
255 | + ) -> Tensor: | ||
232 | """ | 256 | """ |
233 | Makes broadcastable attention and causal masks so that future and masked tokens are ignored. | 257 | Makes broadcastable attention and causal masks so that future and masked tokens are ignored. |
234 | 258 | ||
... | @@ -254,10 +278,15 @@ class ModuleUtilsMixin: | ... | @@ -254,10 +278,15 @@ class ModuleUtilsMixin: |
254 | if self.config.is_decoder: | 278 | if self.config.is_decoder: |
255 | batch_size, seq_length = input_shape | 279 | batch_size, seq_length = input_shape |
256 | seq_ids = torch.arange(seq_length, device=device) | 280 | seq_ids = torch.arange(seq_length, device=device) |
257 | - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] | 281 | + causal_mask = ( |
282 | + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) | ||
283 | + <= seq_ids[None, :, None] | ||
284 | + ) | ||
258 | # causal and attention masks must have same type with pytorch version < 1.3 | 285 | # causal and attention masks must have same type with pytorch version < 1.3 |
259 | causal_mask = causal_mask.to(attention_mask.dtype) | 286 | causal_mask = causal_mask.to(attention_mask.dtype) |
260 | - extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] | 287 | + extended_attention_mask = ( |
288 | + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] | ||
289 | + ) | ||
261 | else: | 290 | else: |
262 | extended_attention_mask = attention_mask[:, None, None, :] | 291 | extended_attention_mask = attention_mask[:, None, None, :] |
263 | else: | 292 | else: |
... | @@ -272,12 +301,17 @@ class ModuleUtilsMixin: | ... | @@ -272,12 +301,17 @@ class ModuleUtilsMixin: |
272 | # positions we want to attend and -10000.0 for masked positions. | 301 | # positions we want to attend and -10000.0 for masked positions. |
273 | # Since we are adding it to the raw scores before the softmax, this is | 302 | # Since we are adding it to the raw scores before the softmax, this is |
274 | # effectively the same as removing these entirely. | 303 | # effectively the same as removing these entirely. |
275 | - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility | 304 | + extended_attention_mask = extended_attention_mask.to( |
305 | + dtype=self.dtype | ||
306 | + ) # fp16 compatibility | ||
276 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | 307 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 |
277 | return extended_attention_mask | 308 | return extended_attention_mask |
278 | 309 | ||
279 | def get_head_mask( | 310 | def get_head_mask( |
280 | - self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False | 311 | + self, |
312 | + head_mask: Optional[Tensor], | ||
313 | + num_hidden_layers: int, | ||
314 | + is_attention_chunked: bool = False, | ||
281 | ) -> Tensor: | 315 | ) -> Tensor: |
282 | """ | 316 | """ |
283 | Prepare the head mask if needed. | 317 | Prepare the head mask if needed. |
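The `get_extended_attention_mask` code above turns a 0/1 padding mask into an additive bias on the attention logits: attended positions contribute 0 and padded positions a large negative value that vanishes after the softmax. A minimal standalone sketch (hypothetical function name, float32 only):

import torch

def make_additive_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    # attention_mask: (batch, seq_len), 1 for real tokens, 0 for padding.
    extended = attention_mask[:, None, None, :].to(dtype=torch.float32)  # broadcastable over heads and query positions
    return (1.0 - extended) * -10000.0

mask = torch.tensor([[1, 1, 1, 0]])
print(make_additive_mask(mask)[0, 0, 0])  # ~0 for the three real tokens, -10000 at the padded slot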
... | @@ -309,9 +343,13 @@ class ModuleUtilsMixin: | ... | @@ -309,9 +343,13 @@ class ModuleUtilsMixin: |
309 | head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) | 343 | head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) |
310 | head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) | 344 | head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) |
311 | elif head_mask.dim() == 2: | 345 | elif head_mask.dim() == 2: |
312 | - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer | 346 | + head_mask = ( |
347 | + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) | ||
348 | + ) # We can specify head_mask for each layer | ||
313 | assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" | 349 | assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" |
314 | - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility | 350 | + head_mask = head_mask.to( |
351 | + dtype=self.dtype | ||
352 | + ) # switch to fload if need + fp16 compatibility | ||
315 | return head_mask | 353 | return head_mask |
316 | 354 | ||
317 | 355 | ||
... | @@ -420,12 +458,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -420,12 +458,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
420 | self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) | 458 | self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) |
421 | 459 | ||
422 | if self.config.is_encoder_decoder and self.config.tie_encoder_decoder: | 460 | if self.config.is_encoder_decoder and self.config.tie_encoder_decoder: |
423 | - self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) | 461 | + self._tie_encoder_decoder_weights( |
462 | + self.encoder, self.decoder, self.base_model_prefix | ||
463 | + ) | ||
424 | 464 | ||
425 | @staticmethod | 465 | @staticmethod |
426 | - def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str): | 466 | + def _tie_encoder_decoder_weights( |
467 | + encoder: nn.Module, decoder: nn.Module, base_model_prefix: str | ||
468 | + ): | ||
427 | uninitialized_encoder_weights: List[str] = [] | 469 | uninitialized_encoder_weights: List[str] = [] |
428 | - assert decoder.__class__ == encoder.__class__, f"{decoder.__class__} and {encoder.__class__} have to be equal." | 470 | + assert ( |
471 | + decoder.__class__ == encoder.__class__ | ||
472 | + ), f"{decoder.__class__} and {encoder.__class__} have to be equal." | ||
429 | 473 | ||
430 | def tie_encoder_to_decoder_recursively( | 474 | def tie_encoder_to_decoder_recursively( |
431 | decoder_pointer: nn.Module, | 475 | decoder_pointer: nn.Module, |
... | @@ -452,13 +496,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -452,13 +496,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
452 | len(encoder_modules) > 0 | 496 | len(encoder_modules) > 0 |
453 | ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" | 497 | ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" |
454 | 498 | ||
455 | - all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()]) | 499 | + all_encoder_weights = set( |
500 | + [ | ||
501 | + module_name + "/" + sub_name | ||
502 | + for sub_name in encoder_modules.keys() | ||
503 | + ] | ||
504 | + ) | ||
456 | encoder_layer_pos = 0 | 505 | encoder_layer_pos = 0 |
457 | for name, module in decoder_modules.items(): | 506 | for name, module in decoder_modules.items(): |
458 | if name.isdigit(): | 507 | if name.isdigit(): |
459 | encoder_name = str(int(name) + encoder_layer_pos) | 508 | encoder_name = str(int(name) + encoder_layer_pos) |
460 | decoder_name = name | 509 | decoder_name = name |
461 | - if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])): | 510 | + if not isinstance( |
511 | + decoder_modules[decoder_name], | ||
512 | + type(encoder_modules[encoder_name]), | ||
513 | + ): | ||
462 | # this can happen if the name corresponds to the position in a list module list of layers | 514 | # this can happen if the name corresponds to the position in a list module list of layers |
463 | # in this case the decoder has added a cross-attention that the encoder does not have | 515 | # in this case the decoder has added a cross-attention that the encoder does not have |
464 | # thus skip this step and substract one layer pos from encoder | 516 | # thus skip this step and substract one layer pos from encoder |
... | @@ -484,7 +536,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -484,7 +536,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
484 | uninitialized_encoder_weights += list(all_encoder_weights) | 536 | uninitialized_encoder_weights += list(all_encoder_weights) |
485 | 537 | ||
486 | # tie weights recursively | 538 | # tie weights recursively |
487 | - tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights) | 539 | + tie_encoder_to_decoder_recursively( |
540 | + decoder, encoder, base_model_prefix, uninitialized_encoder_weights | ||
541 | + ) | ||
488 | if len(uninitialized_encoder_weights) > 0: | 542 | if len(uninitialized_encoder_weights) > 0: |
489 | logger.warning( | 543 | logger.warning( |
490 | f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" | 544 | f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" |
... | @@ -507,10 +561,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -507,10 +561,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
507 | "constant", | 561 | "constant", |
508 | 0, | 562 | 0, |
509 | ) | 563 | ) |
510 | - if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): | 564 | + if hasattr(output_embeddings, "out_features") and hasattr( |
565 | + input_embeddings, "num_embeddings" | ||
566 | + ): | ||
511 | output_embeddings.out_features = input_embeddings.num_embeddings | 567 | output_embeddings.out_features = input_embeddings.num_embeddings |
512 | 568 | ||
513 | - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding: | 569 | + def resize_token_embeddings( |
570 | + self, new_num_tokens: Optional[int] = None | ||
571 | + ) -> torch.nn.Embedding: | ||
514 | """ | 572 | """ |
515 | Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. | 573 | Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. |
516 | 574 | ||
... | @@ -526,7 +584,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -526,7 +584,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
526 | Return: | 584 | Return: |
527 | :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. | 585 | :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. |
528 | """ | 586 | """ |
529 | - base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed | 587 | + base_model = getattr( |
588 | + self, self.base_model_prefix, self | ||
589 | + ) # get the base model if needed | ||
530 | model_embeds = base_model._resize_token_embeddings(new_num_tokens) | 590 | model_embeds = base_model._resize_token_embeddings(new_num_tokens) |
531 | if new_num_tokens is None: | 591 | if new_num_tokens is None: |
532 | return model_embeds | 592 | return model_embeds |
... | @@ -583,7 +643,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -583,7 +643,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
583 | 643 | ||
584 | # Copy token embeddings from the previous weights | 644 | # Copy token embeddings from the previous weights |
585 | num_tokens_to_copy = min(old_num_tokens, new_num_tokens) | 645 | num_tokens_to_copy = min(old_num_tokens, new_num_tokens) |
586 | - new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] | 646 | + new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[ |
647 | + :num_tokens_to_copy, : | ||
648 | + ] | ||
587 | 649 | ||
588 | return new_embeddings | 650 | return new_embeddings |
589 | 651 | ||
... | @@ -614,7 +676,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -614,7 +676,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
614 | # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads | 676 | # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads |
615 | for layer, heads in heads_to_prune.items(): | 677 | for layer, heads in heads_to_prune.items(): |
616 | union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) | 678 | union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) |
617 | - self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON | 679 | + self.config.pruned_heads[layer] = list( |
680 | + union_heads | ||
681 | + ) # Unfortunately we have to store it as list for JSON | ||
618 | 682 | ||
619 | self.base_model._prune_heads(heads_to_prune) | 683 | self.base_model._prune_heads(heads_to_prune) |
620 | 684 | ||
... | @@ -628,7 +692,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -628,7 +692,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
628 | Directory to which to save. Will be created if it doesn't exist. | 692 | Directory to which to save. Will be created if it doesn't exist. |
629 | """ | 693 | """ |
630 | if os.path.isfile(save_directory): | 694 | if os.path.isfile(save_directory): |
631 | - logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) | 695 | + logger.error( |
696 | + "Provided path ({}) should be a directory, not a file".format( | ||
697 | + save_directory | ||
698 | + ) | ||
699 | + ) | ||
632 | return | 700 | return |
633 | os.makedirs(save_directory, exist_ok=True) | 701 | os.makedirs(save_directory, exist_ok=True) |
634 | 702 | ||
... | @@ -775,7 +843,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -775,7 +843,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
775 | 843 | ||
776 | # Load config if we don't provide a configuration | 844 | # Load config if we don't provide a configuration |
777 | if not isinstance(config, PretrainedConfig): | 845 | if not isinstance(config, PretrainedConfig): |
778 | - config_path = config if config is not None else pretrained_model_name_or_path | 846 | + config_path = ( |
847 | + config if config is not None else pretrained_model_name_or_path | ||
848 | + ) | ||
779 | config, model_kwargs = cls.config_class.from_pretrained( | 849 | config, model_kwargs = cls.config_class.from_pretrained( |
780 | config_path, | 850 | config_path, |
781 | *model_args, | 851 | *model_args, |
... | @@ -793,23 +863,43 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -793,23 +863,43 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
793 | # Load model | 863 | # Load model |
794 | if pretrained_model_name_or_path is not None: | 864 | if pretrained_model_name_or_path is not None: |
795 | if os.path.isdir(pretrained_model_name_or_path): | 865 | if os.path.isdir(pretrained_model_name_or_path): |
796 | - if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): | 866 | + if from_tf and os.path.isfile( |
867 | + os.path.join( | ||
868 | + pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index" | ||
869 | + ) | ||
870 | + ): | ||
797 | # Load from a TF 1.0 checkpoint | 871 | # Load from a TF 1.0 checkpoint |
798 | - archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") | 872 | + archive_file = os.path.join( |
799 | - elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): | 873 | + pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index" |
874 | + ) | ||
875 | + elif from_tf and os.path.isfile( | ||
876 | + os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) | ||
877 | + ): | ||
800 | # Load from a TF 2.0 checkpoint | 878 | # Load from a TF 2.0 checkpoint |
801 | - archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) | 879 | + archive_file = os.path.join( |
802 | - elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): | 880 | + pretrained_model_name_or_path, TF2_WEIGHTS_NAME |
881 | + ) | ||
882 | + elif os.path.isfile( | ||
883 | + os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) | ||
884 | + ): | ||
803 | # Load from a PyTorch checkpoint | 885 | # Load from a PyTorch checkpoint |
804 | - archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) | 886 | + archive_file = os.path.join( |
887 | + pretrained_model_name_or_path, WEIGHTS_NAME | ||
888 | + ) | ||
805 | else: | 889 | else: |
806 | raise EnvironmentError( | 890 | raise EnvironmentError( |
807 | "Error no file named {} found in directory {} or `from_tf` set to False".format( | 891 | "Error no file named {} found in directory {} or `from_tf` set to False".format( |
808 | - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], | 892 | + [ |
893 | + WEIGHTS_NAME, | ||
894 | + TF2_WEIGHTS_NAME, | ||
895 | + TF_WEIGHTS_NAME + ".index", | ||
896 | + ], | ||
809 | pretrained_model_name_or_path, | 897 | pretrained_model_name_or_path, |
810 | ) | 898 | ) |
811 | ) | 899 | ) |
812 | - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): | 900 | + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url( |
901 | + pretrained_model_name_or_path | ||
902 | + ): | ||
813 | archive_file = pretrained_model_name_or_path | 903 | archive_file = pretrained_model_name_or_path |
814 | elif os.path.isfile(pretrained_model_name_or_path + ".index"): | 904 | elif os.path.isfile(pretrained_model_name_or_path + ".index"): |
815 | assert ( | 905 | assert ( |
... | @@ -848,7 +938,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -848,7 +938,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
848 | if resolved_archive_file == archive_file: | 938 | if resolved_archive_file == archive_file: |
849 | logger.info("loading weights file {}".format(archive_file)) | 939 | logger.info("loading weights file {}".format(archive_file)) |
850 | else: | 940 | else: |
851 | - logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) | 941 | + logger.info( |
942 | + "loading weights file {} from cache at {}".format( | ||
943 | + archive_file, resolved_archive_file | ||
944 | + ) | ||
945 | + ) | ||
852 | else: | 946 | else: |
853 | resolved_archive_file = None | 947 | resolved_archive_file = None |
854 | 948 | ||
... | @@ -871,13 +965,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -871,13 +965,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
871 | if from_tf: | 965 | if from_tf: |
872 | if resolved_archive_file.endswith(".index"): | 966 | if resolved_archive_file.endswith(".index"): |
873 | # Load from a TensorFlow 1.X checkpoint - provided by original authors | 967 | # Load from a TensorFlow 1.X checkpoint - provided by original authors |
874 | - model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' | 968 | + model = cls.load_tf_weights( |
969 | + model, config, resolved_archive_file[:-6] | ||
970 | + ) # Remove the '.index' | ||
875 | else: | 971 | else: |
876 | # Load from our TensorFlow 2.0 checkpoints | 972 | # Load from our TensorFlow 2.0 checkpoints |
877 | try: | 973 | try: |
878 | from transformers import load_tf2_checkpoint_in_pytorch_model | 974 | from transformers import load_tf2_checkpoint_in_pytorch_model |
879 | 975 | ||
880 | - model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) | 976 | + model = load_tf2_checkpoint_in_pytorch_model( |
977 | + model, resolved_archive_file, allow_missing_keys=True | ||
978 | + ) | ||
881 | except ImportError: | 979 | except ImportError: |
882 | logger.error( | 980 | logger.error( |
883 | "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " | 981 | "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " |
... | @@ -909,7 +1007,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -909,7 +1007,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
909 | # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants | 1007 | # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants |
910 | # so we need to apply the function recursively. | 1008 | # so we need to apply the function recursively. |
911 | def load(module: nn.Module, prefix=""): | 1009 | def load(module: nn.Module, prefix=""): |
912 | - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) | 1010 | + local_metadata = ( |
1011 | + {} if metadata is None else metadata.get(prefix[:-1], {}) | ||
1012 | + ) | ||
913 | module._load_from_state_dict( | 1013 | module._load_from_state_dict( |
914 | state_dict, | 1014 | state_dict, |
915 | prefix, | 1015 | prefix, |
... | @@ -926,7 +1026,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -926,7 +1026,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
926 | # Make sure we are able to load base models as well as derived models (with heads) | 1026 | # Make sure we are able to load base models as well as derived models (with heads) |
927 | start_prefix = "" | 1027 | start_prefix = "" |
928 | model_to_load = model | 1028 | model_to_load = model |
929 | - has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()) | 1029 | + has_prefix_module = any( |
1030 | + s.startswith(cls.base_model_prefix) for s in state_dict.keys() | ||
1031 | + ) | ||
930 | if not hasattr(model, cls.base_model_prefix) and has_prefix_module: | 1032 | if not hasattr(model, cls.base_model_prefix) and has_prefix_module: |
931 | start_prefix = cls.base_model_prefix + "." | 1033 | start_prefix = cls.base_model_prefix + "." |
932 | if hasattr(model, cls.base_model_prefix) and not has_prefix_module: | 1034 | if hasattr(model, cls.base_model_prefix) and not has_prefix_module: |
... | @@ -937,15 +1039,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -937,15 +1039,20 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
937 | if model.__class__.__name__ != model_to_load.__class__.__name__: | 1039 | if model.__class__.__name__ != model_to_load.__class__.__name__: |
938 | base_model_state_dict = model_to_load.state_dict().keys() | 1040 | base_model_state_dict = model_to_load.state_dict().keys() |
939 | head_model_state_dict_without_base_prefix = [ | 1041 | head_model_state_dict_without_base_prefix = [ |
940 | - key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys() | 1042 | + key.split(cls.base_model_prefix + ".")[-1] |
1043 | + for key in model.state_dict().keys() | ||
941 | ] | 1044 | ] |
942 | - missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict) | 1045 | + missing_keys.extend( |
1046 | + head_model_state_dict_without_base_prefix - base_model_state_dict | ||
1047 | + ) | ||
943 | 1048 | ||
944 | # Some models may have keys that are not in the state by design, removing them before needlessly warning | 1049 | # Some models may have keys that are not in the state by design, removing them before needlessly warning |
945 | # the user. | 1050 | # the user. |
946 | if cls.authorized_missing_keys is not None: | 1051 | if cls.authorized_missing_keys is not None: |
947 | for pat in cls.authorized_missing_keys: | 1052 | for pat in cls.authorized_missing_keys: |
948 | - missing_keys = [k for k in missing_keys if re.search(pat, k) is None] | 1053 | + missing_keys = [ |
1054 | + k for k in missing_keys if re.search(pat, k) is None | ||
1055 | + ] | ||
949 | 1056 | ||
950 | if len(unexpected_keys) > 0: | 1057 | if len(unexpected_keys) > 0: |
951 | logger.warning( | 1058 | logger.warning( |
... | @@ -957,7 +1064,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -957,7 +1064,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
957 | f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." | 1064 | f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." |
958 | ) | 1065 | ) |
959 | else: | 1066 | else: |
960 | - logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") | 1067 | + logger.info( |
1068 | + f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" | ||
1069 | + ) | ||
961 | if len(missing_keys) > 0: | 1070 | if len(missing_keys) > 0: |
962 | logger.warning( | 1071 | logger.warning( |
963 | f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " | 1072 | f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " |
... | @@ -990,7 +1099,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): | ... | @@ -990,7 +1099,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): |
990 | } | 1099 | } |
991 | return model, loading_info | 1100 | return model, loading_info |
992 | 1101 | ||
993 | - if hasattr(config, "xla_device") and config.xla_device and is_torch_tpu_available(): | 1102 | + if ( |
1103 | + hasattr(config, "xla_device") | ||
1104 | + and config.xla_device | ||
1105 | + and is_torch_tpu_available() | ||
1106 | + ): | ||
994 | import torch_xla.core.xla_model as xm | 1107 | import torch_xla.core.xla_model as xm |
995 | 1108 | ||
996 | model = xm.send_cpu_data_to_device(model, xm.xla_device()) | 1109 | model = xm.send_cpu_data_to_device(model, xm.xla_device()) |
... | @@ -1039,7 +1152,9 @@ class PoolerStartLogits(nn.Module): | ... | @@ -1039,7 +1152,9 @@ class PoolerStartLogits(nn.Module): |
1039 | self.dense = nn.Linear(config.hidden_size, 1) | 1152 | self.dense = nn.Linear(config.hidden_size, 1) |
1040 | 1153 | ||
1041 | def forward( | 1154 | def forward( |
1042 | - self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None | 1155 | + self, |
1156 | + hidden_states: torch.FloatTensor, | ||
1157 | + p_mask: Optional[torch.FloatTensor] = None, | ||
1043 | ) -> torch.FloatTensor: | 1158 | ) -> torch.FloatTensor: |
1044 | """ | 1159 | """ |
1045 | Args: | 1160 | Args: |
... | @@ -1112,8 +1227,12 @@ class PoolerEndLogits(nn.Module): | ... | @@ -1112,8 +1227,12 @@ class PoolerEndLogits(nn.Module): |
1112 | ), "One of start_states, start_positions should be not None" | 1227 | ), "One of start_states, start_positions should be not None" |
1113 | if start_positions is not None: | 1228 | if start_positions is not None: |
1114 | slen, hsz = hidden_states.shape[-2:] | 1229 | slen, hsz = hidden_states.shape[-2:] |
1115 | - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) | 1230 | + start_positions = start_positions[:, None, None].expand( |
1116 | - start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) | 1231 | + -1, -1, hsz |
1232 | + ) # shape (bsz, 1, hsz) | ||
1233 | + start_states = hidden_states.gather( | ||
1234 | + -2, start_positions | ||
1235 | + ) # shape (bsz, 1, hsz) | ||
1117 | start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) | 1236 | start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) |
1118 | 1237 | ||
1119 | x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) | 1238 | x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) |
... | @@ -1177,12 +1296,20 @@ class PoolerAnswerClass(nn.Module): | ... | @@ -1177,12 +1296,20 @@ class PoolerAnswerClass(nn.Module): |
1177 | start_states is not None or start_positions is not None | 1296 | start_states is not None or start_positions is not None |
1178 | ), "One of start_states, start_positions should be not None" | 1297 | ), "One of start_states, start_positions should be not None" |
1179 | if start_positions is not None: | 1298 | if start_positions is not None: |
1180 | - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) | 1299 | + start_positions = start_positions[:, None, None].expand( |
1181 | - start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) | 1300 | + -1, -1, hsz |
1301 | + ) # shape (bsz, 1, hsz) | ||
1302 | + start_states = hidden_states.gather(-2, start_positions).squeeze( | ||
1303 | + -2 | ||
1304 | + ) # shape (bsz, hsz) | ||
1182 | 1305 | ||
1183 | if cls_index is not None: | 1306 | if cls_index is not None: |
1184 | - cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) | 1307 | + cls_index = cls_index[:, None, None].expand( |
1185 | - cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) | 1308 | + -1, -1, hsz |
1309 | + ) # shape (bsz, 1, hsz) | ||
1310 | + cls_token_state = hidden_states.gather(-2, cls_index).squeeze( | ||
1311 | + -2 | ||
1312 | + ) # shape (bsz, hsz) | ||
1186 | else: | 1313 | else: |
1187 | cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) | 1314 | cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) |
1188 | 1315 | ||
... | @@ -1241,7 +1368,9 @@ class SQuADHead(nn.Module): | ... | @@ -1241,7 +1368,9 @@ class SQuADHead(nn.Module): |
1241 | self.end_logits = PoolerEndLogits(config) | 1368 | self.end_logits = PoolerEndLogits(config) |
1242 | self.answer_class = PoolerAnswerClass(config) | 1369 | self.answer_class = PoolerAnswerClass(config) |
1243 | 1370 | ||
1244 | - @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig) | 1371 | + @replace_return_docstrings( |
1372 | + output_type=SquadHeadOutput, config_class=PretrainedConfig | ||
1373 | + ) | ||
1245 | def forward( | 1374 | def forward( |
1246 | self, | 1375 | self, |
1247 | hidden_states: torch.FloatTensor, | 1376 | hidden_states: torch.FloatTensor, |
... | @@ -1281,7 +1410,9 @@ class SQuADHead(nn.Module): | ... | @@ -1281,7 +1410,9 @@ class SQuADHead(nn.Module): |
1281 | x.squeeze_(-1) | 1410 | x.squeeze_(-1) |
1282 | 1411 | ||
1283 | # during training, compute the end logits based on the ground truth of the start position | 1412 | # during training, compute the end logits based on the ground truth of the start position |
1284 | - end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) | 1413 | + end_logits = self.end_logits( |
1414 | + hidden_states, start_positions=start_positions, p_mask=p_mask | ||
1415 | + ) | ||
1285 | 1416 | ||
1286 | loss_fct = CrossEntropyLoss() | 1417 | loss_fct = CrossEntropyLoss() |
1287 | start_loss = loss_fct(start_logits, start_positions) | 1418 | start_loss = loss_fct(start_logits, start_positions) |
... | @@ -1290,7 +1421,9 @@ class SQuADHead(nn.Module): | ... | @@ -1290,7 +1421,9 @@ class SQuADHead(nn.Module): |
1290 | 1421 | ||
1291 | if cls_index is not None and is_impossible is not None: | 1422 | if cls_index is not None and is_impossible is not None: |
1292 | # Predict answerability from the representation of CLS and START | 1423 | # Predict answerability from the representation of CLS and START |
1293 | - cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) | 1424 | + cls_logits = self.answer_class( |
1425 | + hidden_states, start_positions=start_positions, cls_index=cls_index | ||
1426 | + ) | ||
1294 | loss_fct_cls = nn.BCEWithLogitsLoss() | 1427 | loss_fct_cls = nn.BCEWithLogitsLoss() |
1295 | cls_loss = loss_fct_cls(cls_logits, is_impossible) | 1428 | cls_loss = loss_fct_cls(cls_logits, is_impossible) |
1296 | 1429 | ||
... | @@ -1307,28 +1440,48 @@ class SQuADHead(nn.Module): | ... | @@ -1307,28 +1440,48 @@ class SQuADHead(nn.Module): |
1307 | start_top_log_probs, start_top_index = torch.topk( | 1440 | start_top_log_probs, start_top_index = torch.topk( |
1308 | start_log_probs, self.start_n_top, dim=-1 | 1441 | start_log_probs, self.start_n_top, dim=-1 |
1309 | ) # shape (bsz, start_n_top) | 1442 | ) # shape (bsz, start_n_top) |
1310 | - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) | 1443 | + start_top_index_exp = start_top_index.unsqueeze(-1).expand( |
1311 | - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) | 1444 | + -1, -1, hsz |
1312 | - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) | 1445 | + ) # shape (bsz, start_n_top, hsz) |
1446 | + start_states = torch.gather( | ||
1447 | + hidden_states, -2, start_top_index_exp | ||
1448 | + ) # shape (bsz, start_n_top, hsz) | ||
1449 | + start_states = start_states.unsqueeze(1).expand( | ||
1450 | + -1, slen, -1, -1 | ||
1451 | + ) # shape (bsz, slen, start_n_top, hsz) | ||
1313 | 1452 | ||
1314 | hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( | 1453 | hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( |
1315 | start_states | 1454 | start_states |
1316 | ) # shape (bsz, slen, start_n_top, hsz) | 1455 | ) # shape (bsz, slen, start_n_top, hsz) |
1317 | p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None | 1456 | p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None |
1318 | - end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) | 1457 | + end_logits = self.end_logits( |
1319 | - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) | 1458 | + hidden_states_expanded, start_states=start_states, p_mask=p_mask |
1459 | + ) | ||
1460 | + end_log_probs = F.softmax( | ||
1461 | + end_logits, dim=1 | ||
1462 | + ) # shape (bsz, slen, start_n_top) | ||
1320 | 1463 | ||
1321 | end_top_log_probs, end_top_index = torch.topk( | 1464 | end_top_log_probs, end_top_index = torch.topk( |
1322 | end_log_probs, self.end_n_top, dim=1 | 1465 | end_log_probs, self.end_n_top, dim=1 |
1323 | ) # shape (bsz, end_n_top, start_n_top) | 1466 | ) # shape (bsz, end_n_top, start_n_top) |
1324 | - end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) | 1467 | + end_top_log_probs = end_top_log_probs.view( |
1468 | + -1, self.start_n_top * self.end_n_top | ||
1469 | + ) | ||
1325 | end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) | 1470 | end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) |
1326 | 1471 | ||
1327 | start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) | 1472 | start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) |
1328 | - cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) | 1473 | + cls_logits = self.answer_class( |
1474 | + hidden_states, start_states=start_states, cls_index=cls_index | ||
1475 | + ) | ||
1329 | 1476 | ||
1330 | if not return_dict: | 1477 | if not return_dict: |
1331 | - return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) | 1478 | + return ( |
1479 | + start_top_log_probs, | ||
1480 | + start_top_index, | ||
1481 | + end_top_log_probs, | ||
1482 | + end_top_index, | ||
1483 | + cls_logits, | ||
1484 | + ) | ||
1332 | else: | 1485 | else: |
1333 | return SquadHeadOutput( | 1486 | return SquadHeadOutput( |
1334 | start_top_log_probs=start_top_log_probs, | 1487 | start_top_log_probs=start_top_log_probs, |
... | @@ -1379,17 +1532,26 @@ class SequenceSummary(nn.Module): | ... | @@ -1379,17 +1532,26 @@ class SequenceSummary(nn.Module): |
1379 | 1532 | ||
1380 | self.summary = Identity() | 1533 | self.summary = Identity() |
1381 | if hasattr(config, "summary_use_proj") and config.summary_use_proj: | 1534 | if hasattr(config, "summary_use_proj") and config.summary_use_proj: |
1382 | - if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: | 1535 | + if ( |
1536 | + hasattr(config, "summary_proj_to_labels") | ||
1537 | + and config.summary_proj_to_labels | ||
1538 | + and config.num_labels > 0 | ||
1539 | + ): | ||
1383 | num_classes = config.num_labels | 1540 | num_classes = config.num_labels |
1384 | else: | 1541 | else: |
1385 | num_classes = config.hidden_size | 1542 | num_classes = config.hidden_size |
1386 | self.summary = nn.Linear(config.hidden_size, num_classes) | 1543 | self.summary = nn.Linear(config.hidden_size, num_classes) |
1387 | 1544 | ||
1388 | activation_string = getattr(config, "summary_activation", None) | 1545 | activation_string = getattr(config, "summary_activation", None) |
1389 | - self.activation: Callable = get_activation(activation_string) if activation_string else Identity() | 1546 | + self.activation: Callable = get_activation( |
1547 | + activation_string | ||
1548 | + ) if activation_string else Identity() | ||
1390 | 1549 | ||
1391 | self.first_dropout = Identity() | 1550 | self.first_dropout = Identity() |
1392 | - if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: | 1551 | + if ( |
1552 | + hasattr(config, "summary_first_dropout") | ||
1553 | + and config.summary_first_dropout > 0 | ||
1554 | + ): | ||
1393 | self.first_dropout = nn.Dropout(config.summary_first_dropout) | 1555 | self.first_dropout = nn.Dropout(config.summary_first_dropout) |
1394 | 1556 | ||
1395 | self.last_dropout = Identity() | 1557 | self.last_dropout = Identity() |
... | @@ -1397,7 +1559,9 @@ class SequenceSummary(nn.Module): | ... | @@ -1397,7 +1559,9 @@ class SequenceSummary(nn.Module): |
1397 | self.last_dropout = nn.Dropout(config.summary_last_dropout) | 1559 | self.last_dropout = nn.Dropout(config.summary_last_dropout) |
1398 | 1560 | ||
1399 | def forward( | 1561 | def forward( |
1400 | - self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None | 1562 | + self, |
1563 | + hidden_states: torch.FloatTensor, | ||
1564 | + cls_index: Optional[torch.LongTensor] = None, | ||
1401 | ) -> torch.FloatTensor: | 1565 | ) -> torch.FloatTensor: |
1402 | """ | 1566 | """ |
1403 | Compute a single vector summary of a sequence hidden states. | 1567 | Compute a single vector summary of a sequence hidden states. |
... | @@ -1427,9 +1591,13 @@ class SequenceSummary(nn.Module): | ... | @@ -1427,9 +1591,13 @@ class SequenceSummary(nn.Module): |
1427 | ) | 1591 | ) |
1428 | else: | 1592 | else: |
1429 | cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) | 1593 | cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) |
1430 | - cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) | 1594 | + cls_index = cls_index.expand( |
1595 | + (-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),) | ||
1596 | + ) | ||
1431 | # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states | 1597 | # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states |
1432 | - output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) | 1598 | + output = hidden_states.gather(-2, cls_index).squeeze( |
1599 | + -2 | ||
1600 | + ) # shape (bsz, XX, hidden_size) | ||
1433 | elif self.summary_type == "attn": | 1601 | elif self.summary_type == "attn": |
1434 | raise NotImplementedError | 1602 | raise NotImplementedError |
1435 | 1603 | ||
... | @@ -1441,7 +1609,9 @@ class SequenceSummary(nn.Module): | ... | @@ -1441,7 +1609,9 @@ class SequenceSummary(nn.Module): |
1441 | return output | 1609 | return output |
1442 | 1610 | ||
1443 | 1611 | ||
1444 | -def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0) -> torch.nn.Linear: | 1612 | +def prune_linear_layer( |
1613 | + layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0 | ||
1614 | +) -> torch.nn.Linear: | ||
1445 | """ | 1615 | """ |
1446 | Prune a linear layer to keep only entries in index. | 1616 | Prune a linear layer to keep only entries in index. |
1447 | 1617 | ||
... | @@ -1464,7 +1634,9 @@ def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int | ... | @@ -1464,7 +1634,9 @@ def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int |
1464 | b = layer.bias[index].clone().detach() | 1634 | b = layer.bias[index].clone().detach() |
1465 | new_size = list(layer.weight.size()) | 1635 | new_size = list(layer.weight.size()) |
1466 | new_size[dim] = len(index) | 1636 | new_size[dim] = len(index) |
1467 | - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) | 1637 | + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to( |
1638 | + layer.weight.device | ||
1639 | + ) | ||
1468 | new_layer.weight.requires_grad = False | 1640 | new_layer.weight.requires_grad = False |
1469 | new_layer.weight.copy_(W.contiguous()) | 1641 | new_layer.weight.copy_(W.contiguous()) |
1470 | new_layer.weight.requires_grad = True | 1642 | new_layer.weight.requires_grad = True |
... | @@ -1509,7 +1681,9 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> | ... | @@ -1509,7 +1681,9 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> |
1509 | 1681 | ||
1510 | 1682 | ||
1511 | def prune_layer( | 1683 | def prune_layer( |
1512 | - layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None | 1684 | + layer: Union[torch.nn.Linear, Conv1D], |
1685 | + index: torch.LongTensor, | ||
1686 | + dim: Optional[int] = None, | ||
1513 | ) -> Union[torch.nn.Linear, Conv1D]: | 1687 | ) -> Union[torch.nn.Linear, Conv1D]: |
1514 | """ | 1688 | """ |
1515 | Prune a Conv1D or linear layer to keep only entries in index. | 1689 | Prune a Conv1D or linear layer to keep only entries in index. |
... | @@ -1534,7 +1708,10 @@ def prune_layer( | ... | @@ -1534,7 +1708,10 @@ def prune_layer( |
1534 | 1708 | ||
1535 | 1709 | ||
1536 | def apply_chunking_to_forward( | 1710 | def apply_chunking_to_forward( |
1537 | - forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors | 1711 | + forward_fn: Callable[..., torch.Tensor], |
1712 | + chunk_size: int, | ||
1713 | + chunk_dim: int, | ||
1714 | + *input_tensors, | ||
1538 | ) -> torch.Tensor: | 1715 | ) -> torch.Tensor: |
1539 | """ | 1716 | """ |
1540 | This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the | 1717 | This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the |
... | @@ -1568,7 +1745,9 @@ def apply_chunking_to_forward( | ... | @@ -1568,7 +1745,9 @@ def apply_chunking_to_forward( |
1568 | return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) | 1745 | return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) |
1569 | """ | 1746 | """ |
1570 | 1747 | ||
1571 | - assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors) | 1748 | + assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format( |
1749 | + input_tensors | ||
1750 | + ) | ||
1572 | tensor_shape = input_tensors[0].shape | 1751 | tensor_shape = input_tensors[0].shape |
1573 | assert all( | 1752 | assert all( |
1574 | input_tensor.shape == tensor_shape for input_tensor in input_tensors | 1753 | input_tensor.shape == tensor_shape for input_tensor in input_tensors |
... | @@ -1592,9 +1771,15 @@ def apply_chunking_to_forward( | ... | @@ -1592,9 +1771,15 @@ def apply_chunking_to_forward( |
1592 | num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size | 1771 | num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size |
1593 | 1772 | ||
1594 | # chunk input tensor into tuples | 1773 | # chunk input tensor into tuples |
1595 | - input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) | 1774 | + input_tensors_chunks = tuple( |
1775 | + input_tensor.chunk(num_chunks, dim=chunk_dim) | ||
1776 | + for input_tensor in input_tensors | ||
1777 | + ) | ||
1596 | # apply forward fn to every tuple | 1778 | # apply forward fn to every tuple |
1597 | - output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) | 1779 | + output_chunks = tuple( |
1780 | + forward_fn(*input_tensors_chunk) | ||
1781 | + for input_tensors_chunk in zip(*input_tensors_chunks) | ||
1782 | + ) | ||
1598 | # concatenate output at same dimension | 1783 | # concatenate output at same dimension |
1599 | return torch.cat(output_chunks, dim=chunk_dim) | 1784 | return torch.cat(output_chunks, dim=chunk_dim) |
1600 | 1785 | ... | ... |
... | @@ -39,9 +39,13 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): | ... | @@ -39,9 +39,13 @@ def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): |
39 | return loss, nll_loss | 39 | return loss, nll_loss |
40 | 40 | ||
41 | 41 | ||
42 | -def encode_line(tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"): | 42 | +def encode_line( |
43 | + tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt" | ||
44 | +): | ||
43 | """Only used by LegacyDataset""" | 45 | """Only used by LegacyDataset""" |
44 | - extra_kw = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {} | 46 | + extra_kw = ( |
47 | + {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {} | ||
48 | + ) | ||
45 | return tokenizer( | 49 | return tokenizer( |
46 | [line], | 50 | [line], |
47 | max_length=max_length, | 51 | max_length=max_length, |
... | @@ -63,9 +67,7 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict: | ... | @@ -63,9 +67,7 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict: |
63 | 67 | ||
64 | 68 | ||
65 | def trim_batch( | 69 | def trim_batch( |
66 | - input_ids, | 70 | + input_ids, pad_token_id, attention_mask=None, |
67 | - pad_token_id, | ||
68 | - attention_mask=None, | ||
69 | ): | 71 | ): |
70 | """Remove columns that are populated exclusively by pad_token_id""" | 72 | """Remove columns that are populated exclusively by pad_token_id""" |
71 | keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) | 73 | keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) |
... | @@ -125,7 +127,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): | ... | @@ -125,7 +127,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): |
125 | def __getitem__(self, index) -> Dict[str, torch.Tensor]: | 127 | def __getitem__(self, index) -> Dict[str, torch.Tensor]: |
126 | """Call tokenizer on src and tgt_lines""" | 128 | """Call tokenizer on src and tgt_lines""" |
127 | index = index + 1 # linecache starts at 1 | 129 | index = index + 1 # linecache starts at 1 |
128 | - source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n") | 130 | + source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip( |
131 | + "\n" | ||
132 | + ) | ||
129 | tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") | 133 | tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") |
130 | assert source_line, f"empty source line for index {index}" | 134 | assert source_line, f"empty source line for index {index}" |
131 | assert tgt_line, f"empty tgt line for index {index}" | 135 | assert tgt_line, f"empty tgt line for index {index}" |
... | @@ -147,7 +151,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): | ... | @@ -147,7 +151,9 @@ class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): |
147 | target_ids = torch.stack([x["labels"] for x in batch]) | 151 | target_ids = torch.stack([x["labels"] for x in batch]) |
148 | pad_token_id = self.pad_token_id | 152 | pad_token_id = self.pad_token_id |
149 | y = trim_batch(target_ids, pad_token_id) | 153 | y = trim_batch(target_ids, pad_token_id) |
150 | - source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks) | 154 | + source_ids, source_mask = trim_batch( |
155 | + input_ids, pad_token_id, attention_mask=masks | ||
156 | + ) | ||
151 | batch = { | 157 | batch = { |
152 | "input_ids": source_ids, | 158 | "input_ids": source_ids, |
153 | "attention_mask": source_mask, | 159 | "attention_mask": source_mask, |
... | @@ -161,7 +167,9 @@ class Seq2SeqDataset(AbstractSeq2SeqDataset): | ... | @@ -161,7 +167,9 @@ class Seq2SeqDataset(AbstractSeq2SeqDataset): |
161 | 167 | ||
162 | def __getitem__(self, index) -> Dict[str, str]: | 168 | def __getitem__(self, index) -> Dict[str, str]: |
163 | index = index + 1 # linecache starts at 1 | 169 | index = index + 1 # linecache starts at 1 |
164 | - source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n") | 170 | + source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip( |
171 | + "\n" | ||
172 | + ) | ||
165 | tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") | 173 | tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") |
166 | assert source_line, f"empty source line for index {index}" | 174 | assert source_line, f"empty source line for index {index}" |
167 | assert tgt_line, f"empty tgt line for index {index}" | 175 | assert tgt_line, f"empty tgt line for index {index}" |
... | @@ -201,12 +209,23 @@ class SortishSampler(Sampler): | ... | @@ -201,12 +209,23 @@ class SortishSampler(Sampler): |
201 | idxs = np.random.permutation(len(self.data)) | 209 | idxs = np.random.permutation(len(self.data)) |
202 | sz = self.bs * 50 | 210 | sz = self.bs * 50 |
203 | ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)] | 211 | ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)] |
204 | - sort_idx = np.concatenate([sorted(s, key=self.key, reverse=True) for s in ck_idx]) | 212 | + sort_idx = np.concatenate( |
213 | + [sorted(s, key=self.key, reverse=True) for s in ck_idx] | ||
214 | + ) | ||
205 | sz = self.bs | 215 | sz = self.bs |
206 | ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)] | 216 | ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)] |
207 | - max_ck = np.argmax([self.key(ck[0]) for ck in ck_idx]) # find the chunk with the largest key, | 217 | + max_ck = np.argmax( |
208 | - ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0] # then make sure it goes first. | 218 | + [self.key(ck[0]) for ck in ck_idx] |
209 | - sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int) | 219 | + ) # find the chunk with the largest key, |
220 | + ck_idx[0], ck_idx[max_ck] = ( | ||
221 | + ck_idx[max_ck], | ||
222 | + ck_idx[0], | ||
223 | + ) # then make sure it goes first. | ||
224 | + sort_idx = ( | ||
225 | + np.concatenate(np.random.permutation(ck_idx[1:])) | ||
226 | + if len(ck_idx) > 1 | ||
227 | + else np.array([], dtype=np.int) | ||
228 | + ) | ||
210 | sort_idx = np.concatenate((ck_idx[0], sort_idx)) | 229 | sort_idx = np.concatenate((ck_idx[0], sort_idx)) |
211 | return iter(sort_idx) | 230 | return iter(sort_idx) |
212 | 231 | ||
... | @@ -269,7 +288,9 @@ def get_git_info(): | ... | @@ -269,7 +288,9 @@ def get_git_info(): |
269 | ROUGE_KEYS = ["rouge1", "rouge2", "rougeL"] | 288 | ROUGE_KEYS = ["rouge1", "rouge2", "rougeL"] |
270 | 289 | ||
271 | 290 | ||
272 | -def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict: | 291 | +def calculate_rouge( |
292 | + output_lns: List[str], reference_lns: List[str], use_stemmer=True | ||
293 | +) -> Dict: | ||
273 | scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer) | 294 | scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer) |
274 | aggregator = scoring.BootstrapAggregator() | 295 | aggregator = scoring.BootstrapAggregator() |
275 | 296 | ||
... | @@ -302,7 +323,9 @@ def assert_all_frozen(model): | ... | @@ -302,7 +323,9 @@ def assert_all_frozen(model): |
302 | model_grads: List[bool] = list(grad_status(model)) | 323 | model_grads: List[bool] = list(grad_status(model)) |
303 | n_require_grad = sum(lmap(int, model_grads)) | 324 | n_require_grad = sum(lmap(int, model_grads)) |
304 | npars = len(model_grads) | 325 | npars = len(model_grads) |
305 | - assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad" | 326 | + assert not any( |
327 | + model_grads | ||
328 | + ), f"{n_require_grad/npars:.1%} of {npars} weights require grad" | ||
306 | 329 | ||
307 | 330 | ||
308 | def assert_not_all_frozen(model): | 331 | def assert_not_all_frozen(model): | ... | ... |
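One more illustration, since the trim_batch helper in this file is easy to misread: it drops every column in which all rows are padding (via keep_column_mask = input_ids.ne(pad_token_id).any(dim=0), as shown in the hunk above), keeping over-padded batches as short as possible before the forward pass. The toy values below are invented for illustration; the function body simply restates the logic visible in the diff.

import torch

def trim_batch(input_ids, pad_token_id, attention_mask=None):
    # keep a column only if at least one row holds a real (non-pad) token
    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
    if attention_mask is None:
        return input_ids[:, keep_column_mask]
    return input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]

pad = 0
batch = torch.tensor([[5, 7, pad, pad],
                      [6, pad, pad, pad]])
print(trim_batch(batch, pad))
# tensor([[5, 7],
#         [6, 0]])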