graykode / commit-autosuggestions
Authored by graykode on 2020-11-02 20:13:43 +0900
Commit 042ef27aaccc096d3fbc0110847f4e0479f86551
1 parent 4c9d9868
(add) unittest for api
Showing 2 changed files with 173 additions and 0 deletions
src/api.py
src/test.source
src/api.py 0 → 100644
# Copyright 2020-present Tae Hwan Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import logging
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import (RobertaConfig, RobertaTokenizer)

import argparse
import whatthepatch
from train.run import (Example, convert_examples_to_features)
from train.model import Seq2Seq
from train.customized_roberta import RobertaModel

MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def create_examples(diff, tokenizer):
    """Turn a unified diff into Example objects with tokenized added/deleted lines."""
    examples = []
    for idx, example in enumerate(whatthepatch.parse_patch(diff)):
        added, deleted = [], []
        for change in example.changes:
            # whatthepatch sets old to None for added lines and new to None
            # for deleted lines; context lines have both and are skipped.
            if change.old is None and change.new is not None:
                added.extend(tokenizer.tokenize(change.line))
            elif change.old is not None and change.new is None:
                deleted.extend(tokenizer.tokenize(change.line))
        examples.append(
            Example(idx=idx, added=added, deleted=deleted, target=None)
        )
    return examples


def main(args):
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name,
                                                do_lower_case=args.do_lower_case)

    # build model
    encoder = model_class(config=config)
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size,
                                               nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path), strict=False)
    model.to(args.device)

    with open("test.source", "r") as f:
        eval_examples = create_examples(f.read(), tokenizer)

    test_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
    all_source_ids = torch.tensor([f.source_ids for f in test_features], dtype=torch.long)
    all_source_mask = torch.tensor([f.source_mask for f in test_features], dtype=torch.long)
    all_patch_ids = torch.tensor([f.patch_ids for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_source_ids, all_source_mask, all_patch_ids)

    # Run inference over the whole test set in a single sequential batch
    eval_sampler = SequentialSampler(test_data)
    eval_dataloader = DataLoader(test_data, sampler=eval_sampler, batch_size=len(test_data))

    model.eval()
    for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
        batch = tuple(t.to(args.device) for t in batch)
        source_ids, source_mask, patch_ids = batch
        with torch.no_grad():
            preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
            for pred in preds:
                # Take the top beam, cut at the first zero (padding) id, and decode.
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                print(text)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--load_model_path", default=None, type=str, required=True,
                        help="Path to trained model: Should contain the .bin files")
    parser.add_argument("--model_type", default='roberta', type=str,
                        help="Model type: e.g. roberta")
    parser.add_argument("--config_name", default="microsoft/codebert-base", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
                        help="The name of tokenizer")
    parser.add_argument("--max_source_length", default=256, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=128, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    args = parser.parse_args()
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    main(args)
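The script is driven entirely by its argparse flags (e.g. python api.py --load_model_path <path to the trained .bin checkpoint>) and reads the diff to summarize from test.source in the working directory. Independent of the model, the diff-parsing step can be checked in isolation. Below is a minimal sketch of the same whatthepatch logic that create_examples uses; DummyTokenizer and the sample diff are hypothetical stand-ins so neither the CodeBERT weights nor the train package is needed:

import whatthepatch

class DummyTokenizer:
    # Hypothetical stand-in for RobertaTokenizer: whitespace tokenization only.
    def tokenize(self, line):
        return line.split()

DIFF = """\
diff --git a/hello.py b/hello.py
--- a/hello.py
+++ b/hello.py
@@ -1 +1 @@
-print('hi')
+print('hello world')
"""

tokenizer = DummyTokenizer()
for idx, patch in enumerate(whatthepatch.parse_patch(DIFF)):
    # old is None -> the line exists only on the new side (an addition);
    # new is None -> the line exists only on the old side (a deletion).
    added = [t for c in patch.changes if c.old is None
             for t in tokenizer.tokenize(c.line)]
    deleted = [t for c in patch.changes if c.new is None
               for t in tokenizer.tokenize(c.line)]
    print(idx, added, deleted)
    # expected: 0 ["print('hello", "world')"] ["print('hi')"]

Because whatthepatch reports line numbers for both sides of each hunk, the old is None / new is None test is a reliable way to classify additions and deletions without inspecting the +/- prefixes directly.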
src/test.source 0 → 100644
diff --git a/src/train/model.py b/src/train/model.py
index 20e56b3..cab82e5 100644
--- a/src/train/model.py
+++ b/src/train/model.py
@@ -3,9 +3,7 @@
import torch
import torch.nn as nn
-import torch
-from torch.autograd import Variable
-import copy
+
class Seq2Seq(nn.Module):
"""
Build Seqence-to-Sequence.
diff --git a/src/train/run.py b/src/train/run.py
index 5961ad1..be98fec 100644
--- a/src/train/run.py
+++ b/src/train/run.py
@@ -22,7 +22,6 @@ using a masked language modeling (MLM) loss.
from __future__ import absolute_import
import os
import sys
-import bleu
import pickle
import torch
import json
@@ -35,11 +34,14 @@ from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
-from customized_roberta import RobertaModel
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
RobertaConfig, RobertaTokenizer)
+
+import train.bleu as bleu
+from train.customized_roberta import RobertaModel
+
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
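This fixture is itself a two-file patch (src/train/model.py and src/train/run.py), so create_examples should yield two Example objects from it: one for the import cleanup in model.py and one for the bleu/RobertaModel import moves in run.py. A minimal sketch for verifying the parse, assuming the fixture is saved at src/test.source relative to the working directory:

import whatthepatch

with open("src/test.source") as f:
    patches = list(whatthepatch.parse_patch(f.read()))

print(len(patches))  # expected: 2, one per changed file
for patch in patches:
    # Count raw added/deleted lines per file before any tokenization.
    added = sum(1 for c in patch.changes if c.old is None)
    deleted = sum(1 for c in patch.changes if c.new is None)
    print(patch.header.new_path, added, deleted)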