(remove) legacy commit suggester

graykode
Commit f9e5ae186331648d56790a8e7088860c1ee2967a f9e5ae18 1 parent 3f92ebb7
Showing 13 changed files with 0 additions and 553 deletions
commit_suggester.py
preprocess/__init__.py
preprocess/gitcommit.py
test.source
train.py
train/__init__.py
train/callbacks.py
train/finetune.py
train/generation_utils.py
train/lightning_base.py
train/modeling_bart.py
train/modeling_utils.py
train/utils.py
--- a/commit_suggester.py deleted 100644 → 0
View file @3f92ebb
+++ b/commit_suggester.py deleted 100644 → 0
View file @3f92ebb
-# Copyright 2020-present Tae Hwan Jung
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import argparse
-import subprocess
-from transformers import AutoTokenizer
-
-from preprocess import diff_parse, truncate
-from train import BartForConditionalGeneration
-
-def get_length(chunks):
-    """Return the sum of ``len(chunk)`` over every element of ``chunks``."""
-    return sum(len(chunk) for chunk in chunks)
-
-def suggester(chunks, model, tokenizer, device):
-    """Generate commit-message candidates for the encoded diff ``chunks``.
-
-    ``chunks`` is a sequence of ``(input_ids, attention_mask, patch_ids)``
-    triples. The three columns are flattened and padded by ``truncate`` into
-    single-row ``LongTensor`` batches on ``device``, passed to
-    ``model.generate``, and the generated ids are decoded with ``tokenizer``.
-    """
-    # NOTE(review): get_length sums len(chunk); for the 3-tuples built by
-    # encode_line this is 3 * len(chunks), not a token count -- confirm that
-    # this is the intended sequence length.
-    length = get_length(chunks)
-    ids, masks, patches = zip(*chunks)
-
-    def as_batch(parts, first):
-        # One-row batch: ``first`` is prepended, the tail is zero-padded.
-        return torch.LongTensor([truncate(parts, length, value=first)]).to(device)
-
-    generated = model.generate(
-        input_ids=as_batch(ids, 0),
-        patch_ids=as_batch(patches, 0),
-        attention_mask=as_batch(masks, 1),
-    )
-    return tokenizer.batch_decode(
-        generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-
-def main(args):
-    """Load the model and tokenizer, read a diff, and print a suggested message.
-
-    The diff text comes from ``test.source`` when ``args.unittest`` is set,
-    otherwise from ``git diff --cached``. It is parsed exactly once with
-    ``diff_parse``; previously the ``--unittest`` branch parsed the file and
-    then the already-parsed list was fed to ``diff_parse`` a second time.
-    """
-    device = torch.device(
-        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
-    )
-    model = BartForConditionalGeneration.from_pretrained(args.output_dir).to(device)
-
-    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
-
-    if args.unittest:
-        with open("test.source", "r") as f:
-            diff_text = f.read()
-    else:
-        # The context manager closes the pipe and waits for git to exit.
-        with subprocess.Popen(
-            ["git", "diff", "--cached"], stdout=subprocess.PIPE
-        ) as proc:
-            staged_lines = [raw.decode("utf-8").strip() for raw in proc.stdout]
-        diff_text = "\n".join(staged_lines)
-
-    chunks = diff_parse(diff_text, tokenizer)
-    if not chunks:
-        print('There is no file in staged state.')
-        return
-
-    commit_message = suggester(
-        chunks,
-        model=model,
-        tokenizer=tokenizer,
-        device=device,
-    )
-    print(commit_message)
-
-
-if __name__ == "__main__":
-    # Command-line entry point: parse the options and hand them to main().
-    # The description and the --output_dir help were copied from other scripts;
-    # they now describe what this script does (it loads the model from
-    # --output_dir and suggests a commit message).
-    parser = argparse.ArgumentParser(
-        description="Suggest a commit message for the staged git diff"
-    )
-    parser.add_argument(
-        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
-    )
-    parser.add_argument(
-        "--unittest", action="store_true", help="Unittest with an one batch git diff"
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The directory the fine-tuned model is loaded from.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="sshleifer/distilbart-xsum-6-6",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    args = parser.parse_args()
-
-    main(args)
--- a/preprocess/__init__.py deleted 100644 → 0
View file @3f92ebb
+++ b/preprocess/__init__.py deleted 100644 → 0
View file @3f92ebb
-# Copyright 2020-present Tae Hwan Jung
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .gitcommit import diff_parse, truncate
-
-__all__ = [
-    "diff_parse",
-    "truncate",
-]
--- a/preprocess/gitcommit.py deleted 100644 → 0
View file @3f92ebb
+++ b/preprocess/gitcommit.py deleted 100644 → 0
View file @3f92ebb
-# Copyright 2020-present Tae Hwan Jung
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-import enum
-import random
-import logging
-import tempfile
-import argparse
-import numpy as np
-from tqdm import *
-import whatthepatch
-from git import Repo
-from functools import partial
-from multiprocessing.pool import Pool
-from transformers import AutoTokenizer
-
-from matorage import *
-
-# Module-level logger plus root logging configuration; each record carries the
-# timestamp, level, logger name and the PID of the emitting process.
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
-logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
-    datefmt="%m/%d/%Y %H:%M:%S",
-    level=logging.INFO,
-)
-
-
-@enum.unique
-class PATCH(enum.Enum):
-    """Marker stored in ``patch_ids`` for each token of an encoded line."""
-
-    # Used by diff_parse for added lines and for the new file path.
-    PLUS = 1
-    # Used by diff_parse for removed lines and for the old file path.
-    MINUS = 2
-
-
-def truncate(tuple, max_length, value=0):
-    """Flatten ``tuple`` into one list of exactly ``max_length`` items.
-
-    Elements of ``tuple`` may be ints or iterables of ints. The flattened
-    sequence is cut to ``max_length - 1`` items, ``value`` is placed in front,
-    and the tail is padded with zeros up to ``max_length``.
-    """
-    flat = []
-    for item in tuple:
-        if isinstance(item, int):
-            flat.append(item)
-        else:
-            flat.extend(item)
-    result = [value] + flat[: max_length - 1]
-    # A non-positive repeat count yields an empty list, so no padding is added
-    # when the result is already long enough.
-    result += [0] * (max_length - len(result))
-    assert len(result) == max_length
-    return result
-
-
-def encode_line(tokenizer, line, patch):
-    """Tokenize one line and tag every token with ``patch.value``.
-
-    Characters at code point U+0100 and above are removed and the line is
-    stripped before tokenization. Returns ``(token_ids, attention_mask,
-    patch_ids)`` where the mask is all ones and all three have equal length.
-    """
-    cleaned = re.sub(r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", line).strip()
-    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cleaned))
-    count = len(token_ids)
-    return (token_ids, [1] * count, [patch.value] * count)
-
-
-def diff_parse(diff, tokenizer):
-    """Encode the added and removed lines of a patch text.
-
-    Args:
-        diff: Patch text handed to ``whatthepatch.parse_patch``.
-        tokenizer: Tokenizer forwarded to ``encode_line``.
-
-    Returns:
-        A list of ``encode_line`` results. When a file's old and new paths
-        differ, the old path is encoded as MINUS and the new path as PLUS;
-        then every added line is encoded as PLUS and every removed line as
-        MINUS. Unchanged (context) lines are skipped.
-    """
-    chunks = []
-    # A distinct loop name keeps the ``diff`` argument from being shadowed.
-    for patch in whatthepatch.parse_patch(diff):
-        header = patch.header
-        # NOTE(review): whatthepatch appears to yield header=None for
-        # header-less patches -- the guard avoids an AttributeError there;
-        # confirm against the library documentation.
-        if header is not None and header.old_path != header.new_path:
-            chunks.append(encode_line(tokenizer, header.old_path, PATCH.MINUS))
-            chunks.append(encode_line(tokenizer, header.new_path, PATCH.PLUS))
-        if not patch.changes:
-            continue
-        for change in patch.changes:
-            if change.old is None and change.new is not None:
-                chunks.append(encode_line(tokenizer, change.line, PATCH.PLUS))
-            elif change.old is not None and change.new is None:
-                chunks.append(encode_line(tokenizer, change.line, PATCH.MINUS))
-    return chunks
-
-
-def sha_parse(sha, tokenizer, max_length=1024):
-    """Encode the diff of commit ``sha`` into three fixed-length lists.
-
-    Reads the module-level ``repo`` (assigned in the ``__main__`` block).
-    Returns ``None`` when the commit yields no chunks, otherwise
-    ``(input_ids, attention_masks, patch_ids)``, each ``max_length`` long.
-    """
-    chunks = diff_parse(diff=repo.git.show(sha), tokenizer=tokenizer)
-    if not chunks:
-        return None
-
-    ids, masks, patches = zip(*chunks)
-    return (
-        truncate(ids, max_length, value=0),
-        truncate(masks, max_length, value=1),
-        truncate(patches, max_length, value=0),
-    )
-
-
-def message_parse(msg, tokenizer, max_length=56):
-    """Turn a commit summary into a list of ``max_length`` token ids.
-
-    ``#123``-style references (optionally parenthesised) and characters at
-    code point U+0100 and above are removed before tokenization; the ids are
-    then fixed to ``max_length`` by ``truncate``.
-    """
-    without_refs = re.sub(r"(\(|)#([0-9])+(\)|)", "", msg)
-    cleaned = re.sub(
-        r"[\u0100-\uFFFF\U00010000-\U0010FFFF]+", "", without_refs
-    ).strip()
-    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cleaned))
-    return truncate(token_ids, max_length, value=0)
-
-
-def jobs(sha_msgs, args, data_config, train=True):
-    """Encode one batch of ``(sha, summary)`` pairs and save it via ``DataSaver``.
-
-    Commits for which ``sha_parse`` returns nothing are skipped. The target
-    length is ``args.max_target_length`` when ``train`` is true and
-    ``args.val_max_target_length`` otherwise.
-    """
-    rows = []
-    data_saver = DataSaver(config=data_config)
-
-    for sha, msg in sha_msgs:
-        source = sha_parse(
-            sha, tokenizer=args.tokenizer, max_length=args.max_source_length
-        )
-        if not source:
-            continue
-        input_id, attention_mask, patch_id = source
-        if train:
-            target_length = args.max_target_length
-        else:
-            target_length = args.val_max_target_length
-        target = message_parse(
-            msg, tokenizer=args.tokenizer, max_length=target_length
-        )
-        rows.append((input_id, attention_mask, patch_id, target))
-
-    # Build one array per attribute from the collected rows.
-    payload = {
-        "input_ids": np.asarray([row[0] for row in rows]),
-        "attention_masks": np.asarray([row[1] for row in rows]),
-        "patch_ids": np.asarray([row[2] for row in rows]),
-        "targets": np.asarray([row[3] for row in rows]),
-    }
-    data_saver(payload)
-    data_saver.disconnect()
-
-
-def start(chunked_sha_msgs, train=True):
-    """Pre-process every batch in ``chunked_sha_msgs`` with a worker pool.
-
-    Reads the module-level ``args`` (assigned in the ``__main__`` block) and
-    the ``access_key`` / ``secret_key`` environment variables to build the
-    ``DataConfig``, then maps ``jobs`` over the batches with
-    ``args.num_workers`` processes.
-
-    Args:
-        chunked_sha_msgs: List of batches, each a list of ``(sha, summary)``.
-        train: Selects the "training" or "evaluation" mode and target length.
-    """
-    mode = "training" if train else "evaluation"
-    logger.info("Start %s pre-processing", mode)
-
-    max_target_length = args.max_target_length if train else args.val_max_target_length
-
-    data_config = DataConfig(
-        endpoint=args.endpoint,
-        access_key=os.environ["access_key"],
-        secret_key=os.environ["secret_key"],
-        region=args.region,
-        dataset_name="commit-autosuggestions",
-        additional={
-            "mode": mode,
-            "max_source_length": args.max_source_length,
-            "max_target_length": max_target_length,
-            "url": args.url,
-        },
-        attributes=[
-            ("input_ids", "int32", (args.max_source_length,)),
-            ("attention_masks", "int32", (args.max_source_length,)),
-            ("patch_ids", "int32", (args.max_source_length,)),
-            ("targets", "int32", (max_target_length,)),
-        ],
-    )
-
-    func = partial(jobs, args=args, data_config=data_config, train=train)
-    with Pool(processes=args.num_workers) as pool:
-        # A single bar with a known total; the second tqdm that used to wrap
-        # the iterator drew a duplicate bar without a total.
-        with tqdm(total=len(chunked_sha_msgs)) as pbar:
-            for _ in pool.imap_unordered(func, chunked_sha_msgs):
-                pbar.update()
-
-
-def main(args):
-    """Shuffle the repository's commits, batch them, and pre-process the splits.
-
-    Raises ``OSError`` when ``access_key`` or ``secret_key`` is missing from
-    the environment. The first ``1 - args.p_val`` share of the batches is
-    processed as training data when ``args.do_train`` is set; the remainder
-    as evaluation data when ``args.do_predict`` is set.
-    """
-    if not ("access_key" in os.environ and "secret_key" in os.environ):
-        raise OSError("access_key or secret_key are not found.")
-
-    sha_msgs = [(c.hexsha, c.summary) for c in repo.iter_commits()]
-    random.shuffle(sha_msgs)
-
-    size = args.matorage_batch
-    batches = [
-        sha_msgs[offset : offset + size] for offset in range(0, len(sha_msgs), size)
-    ]
-
-    # Index of the first evaluation batch.
-    barrier = int(len(batches) * (1 - args.p_val))
-    if args.do_train:
-        start(batches[:barrier], train=True)
-    if args.do_predict:
-        start(batches[barrier:], train=False)
-
-
-if __name__ == "__main__":
-    # Command-line entry point. Besides parsing options, this block assigns
-    # the module-level names ``args`` and ``repo`` that start(), sha_parse()
-    # and main() read as globals.
-    parser = argparse.ArgumentParser(description="Code to collect commits on github")
-    parser.add_argument("--url", type=str, required=True, help="github url")
-    parser.add_argument(
-        "--endpoint",
-        type=str,
-        required=True,
-        help="matorage endpoint, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
-    )
-    parser.add_argument(
-        "--region",
-        type=str,
-        default=None,
-        help="matorage s3 region, check document of matorage: https://matorage.readthedocs.io/en/stable/storage.html",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="sshleifer/distilbart-xsum-6-6",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--matorage_batch",
-        default=1024,
-        type=int,
-        help="The smallest batch size stored atomically in matorage.",
-    )
-    parser.add_argument(
-        "--num_workers", default=4, type=int, help="number of process",
-    )
-    parser.add_argument(
-        "--max_source_length",
-        default=1024,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    # NOTE(review): the help text of the two target-length options below says
-    # "input sequence"; in this file both values are used as the length of the
-    # encoded commit message (see jobs()/start()).
-    parser.add_argument(
-        "--max_target_length",
-        default=56,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument(
-        "--val_max_target_length",
-        default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument(
-        "--p_val", type=float, default=0.25, help="percent of validation dataset"
-    )
-    parser.add_argument("--do_train", action="store_true", default=False)
-    parser.add_argument("--do_predict", action="store_true", default=False)
-    args = parser.parse_args()
-
-    # The local checkout directory is the last path component of the URL.
-    args.local_path = args.url.split("/")[-1]
-    logger.info(f"master branch of {args.url} will be downloaded to {args.local_path}")
-    # Reuse an existing checkout; otherwise clone the "master" branch.
-    repo = (
-        Repo(args.local_path)
-        if os.path.exists(args.local_path)
-        else Repo.clone_from(args.url, to_path=args.local_path, branch="master")
-    )
-    # The tokenizer travels on ``args`` so jobs() can reach it.
-    args.tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
-
-    main(args)
--- a/test.source deleted 100644 → 0
View file @3f92ebb
+++ b/test.source deleted 100644 → 0
View file @3f92ebb
--- a/train.py deleted 100644 → 0
View file @3f92ebb
+++ b/train.py deleted 100644 → 0
View file @3f92ebb
-# Copyright 2020-present Tae Hwan Jung
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import argparse
-import pytorch_lightning as pl
-from train.finetune import main, SummarizationModule
-
-if __name__ == "__main__":
-    # Collect the Trainer options first, then the model-specific ones, and
-    # run fine-tuning with the parsed result.
-    cli = pl.Trainer.add_argparse_args(argparse.ArgumentParser())
-    cli = SummarizationModule.add_model_specific_args(cli, os.getcwd())
-
-    main(cli.parse_args())
\ No newline at end of file
--- a/train/__init__.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/__init__.py deleted 100644 → 0
View file @3f92ebb
-# Copyright 2020-present Tae Hwan Jung
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from train.modeling_bart import BartForConditionalGeneration
-
-__all__ = ["BartForConditionalGeneration"]
--- a/train/callbacks.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/callbacks.py deleted 100644 → 0
View file @3f92ebb
-import logging
-import os
-from pathlib import Path
-
-import numpy as np
-import pytorch_lightning as pl
-import torch
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
-from pytorch_lightning.utilities import rank_zero_only
-
-
-def count_trainable_parameters(model):
-    """Return the total element count of ``model`` parameters with ``requires_grad``."""
-    return sum(
-        [np.prod(param.size()) for param in model.parameters() if param.requires_grad]
-    )
-
-
-logger = logging.getLogger(__name__)
-
-
-class Seq2SeqLoggingCallback(pl.Callback):
-    """Lightning callback that logs learning rates, parameter counts and test results."""
-
-    def on_batch_end(self, trainer, pl_module):
-        """Log the learning rate of every param group of the first optimizer."""
-        lrs = {
-            f"lr_group_{i}": param["lr"]
-            for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)
-        }
-        pl_module.logger.log_metrics(lrs)
-
-    @rank_zero_only
-    def _write_logs(
-        self,
-        trainer: pl.Trainer,
-        pl_module: pl.LightningModule,
-        type_path: str,
-        save_generations=True,
-    ) -> None:
-        """Log ``trainer.callback_metrics`` and append them to a results file.
-
-        Files are placed under ``pl_module.hparams.output_dir``. The "log",
-        "progress_bar" and "preds" entries are not treated as metrics; when
-        ``save_generations`` is true and "preds" is present, the predictions
-        are written one per line to the generations file.
-        """
-        logger.info(
-            f"***** {type_path} results at step {trainer.global_step:05d} *****"
-        )
-        metrics = trainer.callback_metrics
-        trainer.logger.log_metrics(
-            {
-                k: v
-                for k, v in metrics.items()
-                if k not in ["log", "progress_bar", "preds"]
-            }
-        )
-        # Log results
-        od = Path(pl_module.hparams.output_dir)
-        if type_path == "test":
-            results_file = od / "test_results.txt"
-            generations_file = od / "test_generations.txt"
-        else:
-            # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json
-            # If people want this it will be easy enough to add back.
-            results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt"
-            generations_file = (
-                od / f"{type_path}_generations/{trainer.global_step:05d}.txt"
-            )
-            results_file.parent.mkdir(exist_ok=True)
-            generations_file.parent.mkdir(exist_ok=True)
-        with open(results_file, "a+") as writer:
-            for key in sorted(metrics):
-                if key in ["log", "progress_bar", "preds"]:
-                    continue
-                val = metrics[key]
-                if isinstance(val, torch.Tensor):
-                    val = val.item()
-                msg = f"{key}: {val:.6f}\n"
-                writer.write(msg)
-
-        if not save_generations:
-            return
-
-        if "preds" in metrics:
-            content = "\n".join(metrics["preds"])
-            # Close the handle deterministically instead of leaving the file
-            # object returned by open() to the garbage collector.
-            with generations_file.open("w+") as gen_writer:
-                gen_writer.write(content)
-
-    @rank_zero_only
-    def on_train_start(self, trainer, pl_module):
-        """Log total and trainable parameter counts when training starts."""
-        try:
-            npars = pl_module.model.model.num_parameters()
-        except AttributeError:
-            npars = pl_module.model.num_parameters()
-
-        n_trainable_pars = count_trainable_parameters(pl_module)
-        # mp stands for million parameters
-        trainer.logger.log_metrics(
-            {"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6}
-        )
-
-    @rank_zero_only
-    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-        """Write the test metrics and generations once testing has finished."""
-        return self._write_logs(trainer, pl_module, "test")
-
-
-def get_checkpoint_callback(output_dir, metric):
-    """Build a ``ModelCheckpoint`` that keeps the single best checkpoint.
-
-    ``metric`` must be "rouge2" or "bleu"; anything else raises
-    ``NotImplementedError``. The callback monitors ``val_<metric>`` in "max"
-    mode and writes under ``output_dir``.
-    """
-    if metric not in ("rouge2", "bleu"):
-        raise NotImplementedError(
-            f"seq2seq callbacks only support rouge2 and bleu, got {metric}, You can make your own by adding to this function."
-        )
-
-    # NOTE(review): the file name template uses val_avg_<metric> while the
-    # monitored key is val_<metric> -- confirm both are logged.
-    if metric == "rouge2":
-        exp = "{val_avg_rouge2:.4f}-{step_count}"
-    else:
-        exp = "{val_avg_bleu:.4f}-{step_count}"
-
-    return ModelCheckpoint(
-        filepath=os.path.join(output_dir, exp),
-        monitor=f"val_{metric}",
-        mode="max",
-        save_top_k=1,
-        period=0,  # maybe save a checkpoint every time val is run, not just end of epoch.
-    )
-
-
-def get_early_stopping_callback(metric, patience):
-    """Build a verbose ``EarlyStopping`` on ``val_<metric>`` in "max" mode."""
-    monitored = f"val_{metric}"
-    return EarlyStopping(
-        monitor=monitored, patience=patience, mode="max", verbose=True,
-    )
--- a/train/finetune.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/finetune.py deleted 100644 → 0
View file @3f92ebb
--- a/train/generation_utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/generation_utils.py deleted 100644 → 0
View file @3f92ebb
--- a/train/lightning_base.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/lightning_base.py deleted 100644 → 0
View file @3f92ebb
--- a/train/modeling_bart.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/modeling_bart.py deleted 100644 → 0
View file @3f92ebb
--- a/train/modeling_utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/modeling_utils.py deleted 100644 → 0
View file @3f92ebb
--- a/train/utils.py deleted 100644 → 0
View file @3f92ebb
+++ b/train/utils.py deleted 100644 → 0
View file @3f92ebb