Showing 1 changed file with 100 additions and 0 deletions
commit_suggester.py
0 → 100644
1 | +# Copyright 2020-present Tae Hwan Jung | ||
2 | +# | ||
3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | +# you may not use this file except in compliance with the License. | ||
5 | +# You may obtain a copy of the License at | ||
6 | +# | ||
7 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | +# | ||
9 | +# Unless required by applicable law or agreed to in writing, software | ||
10 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | +# See the License for the specific language governing permissions and | ||
13 | +# limitations under the License. | ||
14 | + | ||
15 | +import torch | ||
16 | +import argparse | ||
17 | +import subprocess | ||
18 | +from transformers import AutoTokenizer | ||
19 | + | ||
20 | +from preprocess import diff_parse, truncate | ||
21 | +from train import BartForConditionalGeneration | ||
22 | + | ||
23 | + | ||
def suggester(chunks, max_source_length, model, tokenizer, device):
    """Generate commit-message suggestions for a parsed diff.

    Args:
        chunks: iterable of (input_ids, attention_mask, patch_ids) triples,
            as produced by ``preprocess.diff_parse``.
        max_source_length: maximum encoder input length; ``truncate`` pads or
            cuts each sequence to this length.
        model: a ``BartForConditionalGeneration`` whose ``generate`` accepts
            a ``patch_ids`` keyword.
        tokenizer: tokenizer used to decode the generated token ids.
        device: torch device the input tensors are moved to.

    Returns:
        The decoded summaries, one string per generated sequence.
    """
    ids, masks, patches = zip(*chunks)

    def as_batch(seq, pad_value):
        # Wrap the single example in a batch dimension and move it to device.
        return torch.LongTensor(
            [truncate(seq, max_source_length, value=pad_value)]
        ).to(device)

    input_ids = as_batch(ids, 0)
    # NOTE(review): the attention mask is padded with 1, which would let the
    # model attend to padding positions — confirm against truncate()'s
    # semantics in preprocess.py.
    attention_masks = as_batch(masks, 1)
    patch_ids = as_batch(patches, 0)

    generated = model.generate(
        input_ids=input_ids, patch_ids=patch_ids, attention_mask=attention_masks
    )
    return tokenizer.batch_decode(
        generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
42 | + | ||
43 | + | ||
def main(args):
    """Load the fine-tuned model and print a suggested commit message.

    In ``--unittest`` mode the diff text is read from ``test.source``;
    otherwise it is taken from ``git diff --cached``. In both cases the raw
    diff is parsed by ``diff_parse`` into the (input_ids, attention_mask,
    patch_ids) chunks that ``suggester`` expects.
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    )
    model = BartForConditionalGeneration.from_pretrained(args.output_dir).to(device)

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    if args.unittest:
        with open("test.source", "r") as f:
            diff_text = f.read()
    else:
        # subprocess.run (list argument, shell=False) waits for git to finish
        # and reaps the child; the previous Popen was never waited on.
        proc = subprocess.run(
            ["git", "diff", "--cached"], stdout=subprocess.PIPE, check=True
        )
        # Decode the diff as-is. The old code stripped every line, which
        # destroyed the leading space that marks context lines in a diff.
        diff_text = proc.stdout.decode("utf-8")

    # BUG FIX: the staged-diff path previously passed the raw diff *string*
    # to suggester(), which unpacks chunk triples via zip(*chunks); a string
    # would be iterated character-by-character and crash. Parse it first,
    # exactly as the --unittest path always did.
    chunks = diff_parse(diff_text, tokenizer)

    commit_message = suggester(
        chunks,
        max_source_length=args.max_source_length,
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    print(commit_message)
70 | + | ||
if __name__ == "__main__":
    # FIX: the description was copy-pasted from the commit-collection script;
    # this entry point suggests a commit message for the staged diff.
    parser = argparse.ArgumentParser(
        description="Suggest a commit message for the currently staged git diff"
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--unittest", action="store_true", help="Unittest with an one batch git diff"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="sshleifer/distilbart-xsum-6-6",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    args = parser.parse_args()

    main(args)
-
Please register or login to post a comment