commit_suggester.py
3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Copyright 2020-present Tae Hwan Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import argparse
import subprocess
from transformers import AutoTokenizer
from preprocess import diff_parse, truncate
from train import BartForConditionalGeneration
def get_length(chunks):
    """Return the sum of ``len(item)`` over every item in ``chunks``.

    Args:
        chunks: Iterable of sized objects (anything supporting ``len()``).

    Returns:
        The total of the individual lengths; 0 when ``chunks`` is empty.
    """
    return sum(map(len, chunks))
def suggester(chunks, model, tokenizer, device):
    """Generate and decode a summary for the parsed diff ``chunks``.

    Args:
        chunks: Non-empty sequence of 3-item entries, unpacked column-wise
            into input ids, attention masks and patch ids.
        model: Object whose ``generate`` accepts ``input_ids``, ``patch_ids``
            and ``attention_mask`` keyword arguments.
        tokenizer: Object providing ``batch_decode``.
        device: Torch device the input tensors are moved to.

    Returns:
        Whatever ``tokenizer.batch_decode`` returns for the generated ids
        (special tokens skipped, tokenization spaces left untouched).
    """
    length_budget = get_length(chunks)
    # Transpose the per-chunk triples into three parallel columns.
    id_column, mask_column, patch_column = zip(*chunks)

    def _to_batch(column, value):
        # NOTE(review): truncate is defined in preprocess; presumably it
        # flattens and pads/cuts the column to length_budget -- confirm there.
        # Its result is wrapped in a one-element list before tensor conversion.
        row = truncate(column, length_budget, value=value)
        return torch.LongTensor([row]).to(device)

    input_ids = _to_batch(id_column, 0)
    attention_masks = _to_batch(mask_column, 1)
    patch_ids = _to_batch(patch_column, 0)

    generated = model.generate(
        input_ids=input_ids,
        patch_ids=patch_ids,
        attention_mask=attention_masks,
    )
    return tokenizer.batch_decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
def main(args):
    """Load the model and tokenizer, parse a diff, and print the suggestion.

    The diff comes from the local file ``test.source`` when ``args.unittest``
    is set, otherwise from the output of ``git diff --cached``.

    Args:
        args: Parsed command-line namespace. Reads ``no_cuda``,
            ``output_dir``, ``tokenizer_name`` and ``unittest``.

    Returns:
        None. Prints either the suggested message(s) or a notice that
        nothing was parsed from the diff.
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    )
    model = BartForConditionalGeneration.from_pretrained(args.output_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

    if args.unittest:
        with open("test.source", "r") as f:
            chunks = diff_parse(f.read(), tokenizer)
    else:
        # Using Popen as a context manager closes the stdout pipe and waits
        # for the git process on exit, so no descriptor or zombie is leaked.
        with subprocess.Popen(
            ["git", "diff", "--cached"], stdout=subprocess.PIPE
        ) as proc:
            raw_lines = proc.stdout.readlines()
        # Each raw line is decoded as UTF-8 and stripped of surrounding
        # whitespace before the diff text is reassembled.
        diff_lines = [line.decode("utf-8").strip() for line in raw_lines]
        chunks = diff_parse("\n".join(diff_lines), tokenizer)

    # suggester unpacks chunks with zip(*chunks), which fails on an empty
    # sequence, so bail out early when nothing was parsed.
    if not chunks:
        print('There is no file in staged state.')
        return

    commit_message = suggester(
        chunks,
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    print(commit_message)
if __name__ == "__main__":
    # Command-line entry point: build the argument parser and hand the parsed
    # namespace to main().
    parser = argparse.ArgumentParser(
        description="Suggest a commit message for the currently staged git diff"
    )
    parser.add_argument(
        "--no_cuda", action="store_true", help="Whether not to use CUDA when available"
    )
    parser.add_argument(
        "--unittest", action="store_true", help="Unittest with an one batch git diff"
    )
    # main() passes this directory to from_pretrained, i.e. the model is read
    # from it rather than written to it.
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Directory containing the trained model checkpoint to load.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="sshleifer/distilbart-xsum-6-6",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    args = parser.parse_args()
    main(args)