(remove) legacy commit suggester

graykode
Commit f9e5ae186331648d56790a8e7088860c1ee2967a f9e5ae18 1 parent 3f92ebb7
Showing 13 changed files with 0 additions and 553 deletions
commit_suggester.py
preprocess/__init__.py
preprocess/gitcommit.py
test.source
train.py
train/__init__.py
train/callbacks.py
train/finetune.py
train/generation_utils.py
train/lightning_base.py
train/modeling_bart.py
train/modeling_utils.py
train/utils.py
--- a/commit_suggester.py deleted 100644 → 0
View file @3f92ebb
+++ b/commit_suggester.py deleted 100644 → 0
View file @3f92ebb
- # Copyright 2020-present Tae Hwan Jung
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- 
- import torch
- import argparse
- import subprocess
- from transformers import AutoTokenizer
- 
- from preprocess import diff_parse, truncate
- from train import BartForConditionalGeneration
- 
- def get_length(chunks):
-     cnt = 0
-     for chunk in chunks:
-         cnt += len(chunk)
-     return cnt
- 
- def suggester(chunks, model, tokenizer, device):
-     max_source_length = get_length(chunks)
- 
-     input_ids, attention_masks, patch_ids = zip(*chunks)
-     input_ids = torch.LongTensor(
-         [truncate(input_ids, max_source_length, value=0)]
-     ).to(device)
-     attention_masks = torch.LongTensor(
-         [truncate(attention_masks, max_source_length, value=1)]
-     ).to(device)
-     patch_ids = torch.LongTensor(
-         [truncate(patch_ids, max_source_length, value=0)]
-     ).to(device)
- 
-     summaries = model.generate(
-         input_ids=input_ids, patch_ids=patch_ids, attention_mask=attention_masks
-     )
-     return tokenizer.batch_decode(
-         summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False
-     )
- 
- 
- def main(args):
-     device = torch.device(
-         "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
-     )
-     model = BartForConditionalGeneration.from_pretrained(args.output_dir).to(device)
- 
-     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
- 
-     if args.unittest:
-         with open("test.source", "r") as f:
-             chunks = diff_parse(f.read(), tokenizer)
-     else:
-         proc = subprocess.Popen(["git", "diff", "--cached"], stdout=subprocess.PIPE)
-         staged_files = proc.stdout.readlines()
-         staged_files = [f.decode("utf-8") for f in staged_files]
-         staged_files = [f.strip() for f in staged_files]
-         chunks = "\n".join(staged_files)
- 
-     chunks = diff_parse(chunks, tokenizer)
-     if not chunks:
-         print('There is no file in staged state.')
-         return
- 
-     commit_message = suggester(
-         chunks,
-         model=model,
-         tokenizer=tokenizer,
-         device=device,
-     )
-     print(commit_message)
- 
- 
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser(description="Code to collect commits on github")
-     parser.add_argument(
-         "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
-     )
-     parser.add_argument(
-         "--unittest", action="store_true", help="Unittest with an one batch git diff"
-     )
-     parser.add_argument(
-         "--output_dir",
-         type=str,
-         required=True,
-         help="The output directory where the model predictions and checkpoints will be written.",
-     )
-     parser.add_argument(
-         "--tokenizer_name",
-         default="sshleifer/distilbart-xsum-6-6",
-         type=str,
-         help="Pretrained tokenizer name or path if not the same as model_name",
-     )
-     args = parser.parse_args()
- 
-     main(args)
--- a/preprocess/__init__.py deleted 100644 → 0
View file @3f92ebb
+++ b/preprocess/__init__.py deleted 100644 → 0
View file @3f92ebb
- # Copyright 2020-present Tae Hwan Jung
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- 
- from .gitcommit import diff_parse, truncate
- 
- __all__ = [
-     "diff_parse",
-     "truncate",
- ]
--- a/preprocess/gitcommit.py deleted 100644 → 0
View file @3f92ebb
+++ b/preprocess/gitcommit.py deleted 100644 → 0
View file @3f92ebb
- # Copyright 2020-present Tae Hwan Jung
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- 
- import os
- import re
- import enum
- import random
- import logging
- import tempfile
- import argparse
- import numpy as np
- from tqdm import *
- import whatthepatch
- from git import Repo
- from functools import partial
- from multiprocessing.pool import Pool
- from transformers import AutoTokenizer
- 
- from matorage import *
- 
- logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
- logging.basicConfig(
-     format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
-     datefmt="%m/%d/%Y %H:%M:%S",
-     level=logging.INFO,
- )
- 
- 
- class PATCH(enum.Enum):
-     """Marks which side of a diff an encoded line belongs to.
- 
-     ``encode_line`` repeats ``.value`` once per token to build the patch-id
-     list of a line.
-     """
- 
-     # diff_parse uses PLUS for added lines and for the new file path.
-     PLUS = 1
-     # diff_parse uses MINUS for removed lines and for the old file path.
-     MINUS = 2
- 
- 
- def truncate(tuple, max_length, value=0):
-     ls = []
-     for t in tuple:
-         if isinstance(t, int):
-             t = [t]
-         ls.extend(t)
-     ls = ls[: max_length - 1]
-     ls.insert(0, value)
-     if len(ls) < max_length:
-         ls.extend([0] * (max_length - len(ls)))
-     assert len(ls) == max_length
-     return ls
- 
- 
- def encode_line(tokenizer, line, patch):
-     line = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", line).strip()
-     tokens = tokenizer.tokenize(line)
-     tokens = tokenizer.convert_tokens_to_ids(tokens)
-     return (tokens, [1] * len(tokens), len(tokens) * [patch.value])
- 
- 
- def diff_parse(diff, tokenizer):
-     chunks = []
-     for diff in whatthepatch.parse_patch(diff):
-         if diff.header.old_path != diff.header.new_path:
-             chunks.append(encode_line(tokenizer, diff.header.old_path, PATCH.MINUS))
-             chunks.append(encode_line(tokenizer, diff.header.new_path, PATCH.PLUS))
-         if not diff.changes:
-             continue
-         for change in diff.changes:
-             if change.old == None and change.new != None:
-                 chunks.append(encode_line(tokenizer, change.line, PATCH.PLUS))
-             elif change.old != None and change.new == None:
-                 chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS))
-     return chunks
- 
- 
- def sha_parse(sha, tokenizer, max_length=1024):
- 
-     chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer)
-     if not chunks:
-         return None
- 
-     input_ids, attention_masks, patch_ids = zip(*chunks)
-     input_ids = truncate(input_ids, max_length, value=0)
-     attention_masks = truncate(attention_masks, max_length, value=1)
-     patch_ids = truncate(patch_ids, max_length, value=0)
- 
-     return (input_ids, attention_masks, patch_ids)
- 
- 
- def message_parse(msg, tokenizer, max_length=56):
-     msg = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
- 
-     msg = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", msg).strip()
-     msg = tokenizer.tokenize(msg)
-     msg = tokenizer.convert_tokens_to_ids(msg)
-     msg = truncate(msg, max_length, value=0)
- 
-     return msg
- 
- 
- def jobs(sha_msgs, args, data_config, train=True):
- 
-     input_ids, attention_masks, patch_ids, targets = [], [], [], []
-     data_saver = DataSaver(config=data_config)
- 
-     for sha_msg in sha_msgs:
-         sha, msg = sha_msg
- 
-         source = sha_parse(
-             sha, tokenizer=args.tokenizer, max_length=args.max_source_length
-         )
-         if not source:
-             continue
-         input_id, attention_mask, patch_id = source
-         target = message_parse(
-             msg,
-             tokenizer=args.tokenizer,
-             max_length=(
-                 args.max_target_length if train else args.val_max_target_length
-             ),
-         )
- 
-         input_ids.append(input_id)
-         attention_masks.append(attention_mask)
-         patch_ids.append(patch_id)
-         targets.append(target)
- 
-     data_saver(
-         {
-             "input_ids": np.asarray(input_ids),
-             "attention_masks": np.asarray(attention_masks),
-             "patch_ids": np.asarray(patch_ids),
-             "targets": np.asarray(targets),
-         }
-     )
-     data_saver.disconnect()
- 
- 
- def start(chunked_sha_msgs, train=True):
- 
-     logger.info(f"Start %s pre-processing" % ("training" if train else "evaluation"))
- 
-     max_target_length = args.max_target_length if train else args.val_max_target_length
- 
-     data_config = DataConfig(
-         endpoint=args.endpoint,
-         access_key=os.environ["access_key"],
-         secret_key=os.environ["secret_key"],
-         region=args.region,
-         dataset_name="commit-autosuggestions",
-         additional={
-             "mode": ("training" if train else "evaluation"),
-             "max_source_length": args.max_source_length,
-             "max_target_length": max_target_length,
-             "url": args.url,
-         },
-         attributes=[
-             ("input_ids", "int32", (args.max_source_length,)),
-             ("attention_masks", "int32", (args.max_source_length,)),
-             ("patch_ids", "int32", (args.max_source_length,)),
-             ("targets", "int32", (max_target_length,)),
-         ],
-     )
- 
-     func = partial(jobs, args=args, data_config=data_config, train=train)
-     with Pool(processes=args.num_workers) as pool:
-         with tqdm(total=len(chunked_sha_msgs)) as pbar:
-             for i, _ in tqdm(enumerate(pool.imap_unordered(func, chunked_sha_msgs))):
-                 pbar.update()
- 
- 
- def main(args):
-     if "access_key" not in os.environ or "secret_key" not in os.environ:
-         raise OSError("access_key or secret_key are not found.")
- 
-     sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()]
-     random.shuffle(sha_msgs)
-     chunked_sha_msgs = [
-         sha_msgs[x : x + args.matorage_batch]
-         for x in range(0, len(sha_msgs), args.matorage_batch)
-     ]
- 
-     barrier = int(len(chunked_sha_msgs) * (1 - args.p_val))
-     if args.do_train:
-         start(chunked_sha_msgs[:barrier], train=True)
-     if args.do_predict:
-         start(chunked_sha_msgs[barrier:], train=False)
- 
- 
- if __name__ == "__main__":
-     # Command-line entry point of the pre-processing script.  It defines the
-     # module-level names ``args`` and ``repo`` that sha_parse(), start() and
-     # main() read as globals, so the statement order below matters.
-     parser = argparse.ArgumentParser(description="Code to collect commits on github")
-     parser.add_argument("--url", type=str, required=True, help="github url")
-     parser.add_argument(
-         "--endpoint",
-         type=str,
-         required=True,
-         help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
-     )
-     parser.add_argument(
-         "--region",
-         type=str,
-         default=None,
-         help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
-     )
-     parser.add_argument(
-         "--tokenizer_name",
-         default="sshleifer/distilbart-xsum-6-6",
-         type=str,
-         help="Pretrained tokenizer name or path if not the same as model_name",
-     )
-     # Number of commits per chunk; main() slices the commit list by this size.
-     parser.add_argument(
-         "--matorage_batch",
-         default=1024,
-         type=int,
-         help="The smallest batch size stored atomically in matorage.",
-     )
-     # Passed to multiprocessing Pool(processes=...) in start().
-     parser.add_argument(
-         "--num_workers", default=4, type=int, help="number of process",
-     )
-     parser.add_argument(
-         "--max_source_length",
-         default=1024,
-         type=int,
-         help="The maximum total input sequence length after tokenization. Sequences longer "
-         "than this will be truncated, sequences shorter will be padded.",
-     )
-     # Target length used when start()/jobs() run with train=True.
-     parser.add_argument(
-         "--max_target_length",
-         default=56,
-         type=int,
-         help="The maximum total input sequence length after tokenization. Sequences longer "
-         "than this will be truncated, sequences shorter will be padded.",
-     )
-     # Target length used when start()/jobs() run with train=False.
-     parser.add_argument(
-         "--val_max_target_length",
-         default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
-         type=int,
-         help="The maximum total input sequence length after tokenization. Sequences longer "
-         "than this will be truncated, sequences shorter will be padded.",
-     )
-     # Fraction of chunks held out; main() computes the split index from it.
-     parser.add_argument(
-         "--p_val", type=float, default=0.25, help="percent of validation dataset"
-     )
-     parser.add_argument("--do_train", action="store_true", default=False)
-     parser.add_argument("--do_predict", action="store_true", default=False)
-     args = parser.parse_args()
- 
-     # The local checkout directory is the last path component of the URL.
-     args.local_path = args.url.split("/")[-1]
-     logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}")
-     # Reuse an existing checkout if the directory exists, otherwise clone the
-     # "master" branch into it.
-     repo = (
-         Repo(args.local_path)
-         if os.path.exists(args.local_path)
-         else Repo.clone_from(args.url, to_path=args.local_path, branch="master")
-     )
-     # The tokenizer travels on ``args`` so that jobs() can read args.tokenizer.
-     args.tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
- 
-     main(args)
--- a/test.source deleted 100644 → 0
View file @3f92ebb
+++ b/test.source deleted 100644 → 0
View file @3f92ebb
--- a/train.py deleted 100644 → 0
View file @3f92ebb
+++ b/train.py deleted 100644 → 0
View file @3f92ebb
- # Copyright 2020-present Tae Hwan Jung
- # 
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- # 
- #     http://www.apache.org/licenses/LICENSE-2.0
- # 
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- 
- import os
- import argparse
- import pytorch_lightning as pl
- from train.finetune import main, SummarizationModule
- 
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser = pl.Trainer.add_argparse_args(parser)
-     parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
- 
-     args = parser.parse_args()
- 
-     main(args)
\ No newline at end of file
--- a/train/__init__.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/__init__.py deleted 100644 → 0
View file @3f92ebb
- # Copyright 2020-present Tae Hwan Jung
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- 
- from train.modeling_bart import BartForConditionalGeneration
- 
- __all__ = ["BartForConditionalGeneration"]
--- a/train/callbacks.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/callbacks.py deleted 100644 → 0
View file @3f92ebb
- import logging
- import os
- from pathlib import Path
- 
- import numpy as np
- import pytorch_lightning as pl
- import torch
- from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
- from pytorch_lightning.utilities import rank_zero_only
- 
- 
- def count_trainable_parameters(model):
-     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
-     params = sum([np.prod(p.size()) for p in model_parameters])
-     return params
- 
- 
- logger = logging.getLogger(__name__)
- 
- 
- class Seq2SeqLoggingCallback(pl.Callback):
-     def on_batch_end(self, trainer, pl_module):
-         lrs = {
-             f"lr_group_{i}": param["lr"]
-             for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)
-         }
-         pl_module.logger.log_metrics(lrs)
- 
-     @rank_zero_only
-     def _write_logs(
-         self,
-         trainer: pl.Trainer,
-         pl_module: pl.LightningModule,
-         type_path: str,
-         save_generations=True,
-     ) -> None:
-         logger.info(
-             f"***** {type_path} results at step {trainer.global_step:05d} *****"
-         )
-         metrics = trainer.callback_metrics
-         trainer.logger.log_metrics(
-             {
-                 k: v
-                 for k, v in metrics.items()
-                 if k not in ["log", "progress_bar", "preds"]
-             }
-         )
-         # Log results
-         od = Path(pl_module.hparams.output_dir)
-         if type_path == "test":
-             results_file = od / "test_results.txt"
-             generations_file = od / "test_generations.txt"
-         else:
-             # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json
-             # If people want this it will be easy enough to add back.
-             results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt"
-             generations_file = (
-                 od / f"{type_path}_generations/{trainer.global_step:05d}.txt"
-             )
-             results_file.parent.mkdir(exist_ok=True)
-             generations_file.parent.mkdir(exist_ok=True)
-         with open(results_file, "a+") as writer:
-             for key in sorted(metrics):
-                 if key in ["log", "progress_bar", "preds"]:
-                     continue
-                 val = metrics[key]
-                 if isinstance(val, torch.Tensor):
-                     val = val.item()
-                 msg = f"{key}: {val:.6f}\n"
-                 writer.write(msg)
- 
-         if not save_generations:
-             return
- 
-         if "preds" in metrics:
-             content = "\n".join(metrics["preds"])
-             generations_file.open("w+").write(content)
- 
-     @rank_zero_only
-     def on_train_start(self, trainer, pl_module):
-         try:
-             npars = pl_module.model.model.num_parameters()
-         except AttributeError:
-             npars = pl_module.model.num_parameters()
- 
-         n_trainable_pars = count_trainable_parameters(pl_module)
-         # mp stands for million parameters
-         trainer.logger.log_metrics(
-             {"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6}
-         )
- 
-     @rank_zero_only
-     def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-         return self._write_logs(trainer, pl_module, "test")
- 
- 
- def get_checkpoint_callback(output_dir, metric):
-     """Saves the best model by validation ROUGE2 score."""
-     if metric == "rouge2":
-         exp = "{val_avg_rouge2:.4f}-{step_count}"
-     elif metric == "bleu":
-         exp = "{val_avg_bleu:.4f}-{step_count}"
-     else:
-         raise NotImplementedError(
-             f"seq2seq callbacks only support rouge2 and bleu, got {metric}, You can make your own by adding to this function."
-         )
- 
-     checkpoint_callback = ModelCheckpoint(
-         filepath=os.path.join(output_dir, exp),
-         monitor=f"val_{metric}",
-         mode="max",
-         save_top_k=1,
-         period=0,  # maybe save a checkpoint every time val is run, not just end of epoch.
-     )
-     return checkpoint_callback
- 
- 
- def get_early_stopping_callback(metric, patience):
-     return EarlyStopping(
-         monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,
-     )
--- a/train/finetune.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/finetune.py deleted 100644 → 0
View file @3f92ebb
--- a/train/generation_utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/generation_utils.py deleted 100644 → 0
View file @3f92ebb
--- a/train/lightning_base.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/lightning_base.py deleted 100644 → 0
View file @3f92ebb
--- a/train/modeling_bart.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/modeling_bart.py deleted 100644 → 0
View file @3f92ebb
--- a/train/modeling_utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/modeling_utils.py deleted 100644 → 0
View file @3f92ebb
--- a/train/utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/utils.py deleted 100644 → 0
View file @3f92ebb