(add) code2nl finetuning code

graykode
Commit f552b8349497e39893ae1f13b478cdffe468872e f552b834 1 parent 3d9624b7
Showing 3 changed files with 945 additions and 0 deletions
code2nl/bleu.py
code2nl/model.py
code2nl/run.py
--- a/code2nl/bleu.py 0 → 100644
View file @f552b83
+++ b/code2nl/bleu.py 0 → 100644
View file @f552b83
+ #!/usr/bin/python
+ 
+ '''
+ This script was adapted from the original version by hieuhoang1972 which is part of MOSES. 
+ '''
+ 
+ # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
+ 
+ '''Provides:
+ 
+ cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
+ cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
+ score_cooked(alltest, n=4): Score a list of cooked test sentences.
+ 
+ score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
+ 
+ The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
+ '''
+ 
+ import sys, math, re, xml.sax.saxutils
+ import subprocess
+ import os
+ 
# Module-level switches read by normalize() and cook_test().
# nonorm: when truthy, normalize() skips the NIST-style pre-processing
# of hyp and ref files and only splits on whitespace -- wade
nonorm = 0
# preserve_case: when False, normalize() lower-cases its input.
preserve_case = False
# eff_ref_len: how cook_test() picks the effective reference length
# ("shortest", "average" or "closest").
eff_ref_len = "shortest"

# Language-independent clean-up rules, stored as (compiled regex, replacement).
normalize1 = [
    (re.compile('<skipped>'), ''),   # strip "skipped" tags
    (re.compile(r'-\n'), ''),        # strip end-of-line hyphenation and join lines
    (re.compile(r'\n'), ' '),        # join lines
    # disabled rule: (r'(\d)\s+(?=\d)', r'\1')  -- join digits
]

# Tokenization rules, stored as (compiled regex, replacement).
normalize2 = [
    # tokenize punctuation. apostrophe is missing
    (re.compile(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])'), r' \1 '),
    # tokenize period and comma unless preceded by a digit
    (re.compile(r'([^0-9])([\.,])'), r'\1 \2 '),
    # tokenize period and comma unless followed by a digit
    (re.compile(r'([\.,])([^0-9])'), r' \1 \2'),
    # tokenize dash when preceded by a digit
    (re.compile(r'([0-9])(-)'), r'\1 \2 '),
]
+ 
def normalize(s):
    '''Normalize and tokenize text; returns a list of tokens.

    `s` is either a string or an iterable of strings (which is joined with
    single spaces first). When the module flag `nonorm` is truthy, `s` is
    only split on whitespace. This is lifted from NIST mteval-v11a.pl.
    '''
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if nonorm:
        return s.split()

    text = s if type(s) is str else " ".join(s)

    # language-independent part:
    for regex, substitute in normalize1:
        text = regex.sub(substitute, text)
    text = xml.sax.saxutils.unescape(text, {'&quot;': '"'})

    # language-dependent part (assuming Western languages):
    # pad with spaces so the tokenization rules also see the string edges
    text = " " + text + " "
    if not preserve_case:
        text = text.lower()  # this might not be identical to the original
    for regex, substitute in normalize2:
        text = regex.sub(substitute, text)
    return text.split()
+ 
def count_ngrams(words, n=4):
    '''Count every n-gram of order 1..n in `words`.

    Returns a dict mapping each n-gram (as a tuple of tokens) to the
    number of times it occurs.
    '''
    tally = {}
    total = len(words)
    for order in range(1, n + 1):
        for start in range(total - order + 1):
            gram = tuple(words[start:start + order])
            if gram in tally:
                tally[gram] += 1
            else:
                tally[gram] = 1
    return tally
+ 
def cook_refs(refs, n=4):
    '''Pre-process the reference sentences of a single segment.

    Returns a pair `(lengths, maxcounts)`: the token length of every
    normalized reference, and a dict giving, for each n-gram, the largest
    count it reaches in any one reference.
    '''
    tokenized = [normalize(ref) for ref in refs]
    maxcounts = {}
    for tokens in tokenized:
        for ngram, freq in count_ngrams(tokens, n).items():
            # keep the highest per-reference count seen so far
            if freq > maxcounts.get(ngram, 0):
                maxcounts[ngram] = freq
    lengths = [len(tokens) for tokens in tokenized]
    return (lengths, maxcounts)
+ 
def cook_test(test, item, n=4):
    '''Pre-process one test sentence against cooked references.

    `item` is the `(reflens, refmaxcounts)` pair produced by cook_refs().
    Returns a dict with keys "testlen", "reflen", "guess" (number of
    n-grams of each order in the test sentence) and "correct" (clipped
    n-gram matches of each order).
    '''
    reflens, refmaxcounts = item
    hyp_tokens = normalize(test)
    hyp_len = len(hyp_tokens)
    result = {"testlen": hyp_len}

    # Effective reference length, selected by the module-level eff_ref_len.
    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens)) / len(reflens)
    elif eff_ref_len == "closest":
        best_gap = None
        for candidate in reflens:
            gap = abs(candidate - hyp_len)
            # strict comparison: the first reference at the smallest gap wins
            if best_gap is None or gap < best_gap:
                best_gap = gap
                result['reflen'] = candidate

    result["guess"] = [max(hyp_len - order + 1, 0) for order in range(1, n + 1)]

    clipped = [0] * n
    result['correct'] = clipped
    for ngram, freq in count_ngrams(hyp_tokens, n).items():
        # a match is credited at most as often as it appears in a reference
        clipped[len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), freq)

    return result
+ 
def score_cooked(allcomps, n=4, ground=0, smooth=1):
    '''Score a list of cooked test sentences (dicts from cook_test()).

    Returns a list of n+1 floats: the smoothed, brevity-penalized BLEU
    score first, followed by the unsmoothed precision for each n-gram
    order 1..n. `ground` is accepted but not used.
    '''
    tiny = sys.float_info.min  # keeps math.log away from log(0)

    test_len = 0
    ref_len = 0
    guesses = [0] * n
    hits = [0] * n
    for comps in allcomps:
        test_len += comps['testlen']
        ref_len += comps['reflen']
        for order in range(n):
            guesses[order] += comps['guess'][order]
            hits[order] += comps['correct'][order]

    log_sum = 0.0
    per_order = []
    for order in range(n):
        hit = hits[order]
        guess = guesses[order]
        # add-one smoothing is applied to every order except unigrams
        bonus = 1 if (smooth == 1 and order > 0) else 0
        log_sum += math.log(hit + bonus + tiny) - math.log(guess + bonus + tiny)
        if guess == 0:
            per_order.append(-10000000)
        else:
            per_order.append(math.log(hit + tiny) - math.log(guess))

    log_sum /= float(n)
    scores = [log_sum] + per_order

    # the brevity penalty only affects the combined score in slot 0
    penalty = min(0, 1 - float(ref_len + 1) / (test_len + 1))
    scores[0] += penalty
    return [math.exp(value) for value in scores]
+ 
def bleu(refs,  candidate, ground=0, smooth=1):
    '''Score one `candidate` sentence against its list of `refs`.

    Returns the list produced by score_cooked() for this single segment.
    '''
    cooked_refs = cook_refs(refs)
    cooked_candidate = cook_test(candidate, cooked_refs)
    return score_cooked([cooked_candidate], ground=ground, smooth=smooth)
+ 
def splitPuncts(line):
    '''Separate words from punctuation; returns the pieces joined by single spaces.'''
    # a piece is either a run of word characters or one non-space, non-word character
    pieces = re.compile(r"[\w]+|[^\s\w]").findall(line)
    return ' '.join(pieces)
+ 
def computeMaps(predictions, goldfile):
    '''Build the reference and prediction maps used by bleuFromMaps().

    Parameters:

    * `predictions`- iterable of "id<TAB>prediction" rows; a row with no
      tab is treated as an empty prediction for that id.
    * `goldfile`- path of a file with one "id<TAB>reference" row per line.

    Returns `(goldMap, predictionMap)`. Both map an id to a list of
    lower-cased, punctuation-split strings. Only ids that occur in
    `predictions` are kept in `goldMap`.
    '''
    predictionMap = {}
    goldMap = {}

    for row in predictions:
        cols = row.strip().split('\t')
        rid = cols[0]
        pred = cols[1] if len(cols) > 1 else ''
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    # The context manager closes the gold file even if a row fails to parse.
    with open(goldfile, 'r') as gf:
        for row in gf:
            (rid, ref) = row.split('\t')
            if rid in predictionMap:  # Only insert if the id exists for the method
                goldMap.setdefault(rid, []).append(splitPuncts(ref.strip().lower()))

    sys.stderr.write('Total: ' + str(len(goldMap)) + '\n')
    return (goldMap, predictionMap)
+ 
+ 
+ #m1 is the reference map
+ #m2 is the prediction map
def bleuFromMaps(m1, m2):
    '''Average the BLEU scores over all ids present in both maps.

    Parameters:

    * `m1`- the reference map (id -> list of reference strings).
    * `m2`- the prediction map (id -> list whose first item is the prediction).

    Returns a list of five floats on a 0-100 scale, in the order produced
    by bleu(). When the maps share no id, all five values are 0.0 instead
    of raising ZeroDivisionError.
    '''
    score = [0.0] * 5
    num = 0

    for key, refs in m1.items():
        if key in m2:
            bl = bleu(refs, m2[key][0])
            score = [score[i] + bl[i] for i in range(len(bl))]
            num += 1

    if num == 0:
        # nothing to average: no id is shared between references and predictions
        return [0.0] * 5
    return [s * 100.0 / num for s in score]
+ 
if __name__ == '__main__':
    # Usage: python bleu.py <reference_file> < predictions
    # Prediction rows are read from stdin; the combined BLEU score is printed.
    gold_path = sys.argv[1]
    stdin_rows = [row for row in sys.stdin]
    gold_map, prediction_map = computeMaps(stdin_rows, gold_path)
    combined_score = bleuFromMaps(gold_map, prediction_map)[0]
    print(combined_score)
+ 
--- a/code2nl/model.py 0 → 100644
View file @f552b83
+++ b/code2nl/model.py 0 → 100644
View file @f552b83
+ # Copyright (c) Microsoft Corporation. 
+ # Licensed under the MIT license.
+ 
+ import torch
+ import torch.nn as nn
+ import torch
+ from torch.autograd import Variable
+ import copy
class Seq2Seq(nn.Module):
    """
        Build Sequence-to-Sequence.

        Parameters:

        * `encoder`- encoder of seq2seq model. e.g. roberta
        * `decoder`- decoder of seq2seq model. e.g. transformer
        * `config`- configuration of encoder model.
        * `beam_size`- beam size for beam search.
        * `max_length`- max length of target for beam search.
        * `sos_id`- start of symbol ids in target for beam search.
        * `eos_id`- end of symbol ids in target for beam search.
    """
    def __init__(self, encoder, decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        # Lower-triangular matrix of ones; sliced to build the causal decoder mask.
        self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.lsm = nn.LogSoftmax(dim=-1)
        self.tie_weights()

        self.beam_size = beam_size
        self.max_length = max_length
        self.sos_id = sos_id
        self.eos_id = eos_id

    def _tie_or_clone_weights(self, first_module, second_module):
        """ Tie or clone module weights depending on whether we are using TorchScript or not
        """
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.lm_head,
                                   self.encoder.embeddings.word_embeddings)

    def _decode(self, target_ids, memory, memory_mask):
        """ Embed `target_ids` with the encoder's embeddings, run the decoder over
            them with a causal mask while attending to `memory`, and return
            tanh(dense(decoder output)). The output keeps the decoder's axis order,
            i.e. the first two axes are swapped relative to `target_ids`.
        """
        length = target_ids.shape[1]
        # 0 where attention is allowed (lower triangle), -1e4 elsewhere
        attn_mask = -1e4 * (1 - self.bias[:length, :length])
        tgt_embeddings = self.encoder.embeddings(target_ids).permute([1, 0, 2]).contiguous()
        out = self.decoder(tgt_embeddings, memory, tgt_mask=attn_mask,
                           memory_key_padding_mask=(1 - memory_mask).bool())
        return torch.tanh(self.dense(out))

    def forward(self, source_ids=None, source_mask=None, target_ids=None, target_mask=None, args=None):
        """ With `target_ids`: return `(loss, loss * n_active, n_active)` where
            `n_active` counts the target positions (after the first) whose
            `target_mask` is non-zero.

            Without `target_ids`: run beam search for every source row and return
            a tensor of predicted token ids, zero-padded to `max_length`, with one
            row per kept beam hypothesis. `args` is accepted but not used.
        """
        outputs = self.encoder(source_ids, attention_mask=source_mask)
        encoder_output = outputs[0].permute([1, 0, 2]).contiguous()

        if target_ids is not None:
            hidden_states = self._decode(target_ids, encoder_output, source_mask)
            hidden_states = hidden_states.permute([1, 0, 2]).contiguous()
            lm_logits = self.lm_head(hidden_states)
            # Shift so that tokens < n predict n
            active_loss = target_mask[..., 1:].ne(0).view(-1) == 1
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = target_ids[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
                            shift_labels.view(-1)[active_loss])

            return loss, loss * active_loss.sum(), active_loss.sum()

        # Predict
        device = source_ids.device
        preds = []
        # Padding token, allocated on the inputs' device so CPU inference works too.
        zero = torch.zeros(1, dtype=torch.long, device=device)
        for i in range(source_ids.shape[0]):
            context = encoder_output[:, i:i + 1]
            context_mask = source_mask[i:i + 1, :]
            beam = Beam(self.beam_size, self.sos_id, self.eos_id)
            input_ids = beam.getCurrentState().to(device)
            # every beam hypothesis attends to the same encoded source
            context = context.repeat(1, self.beam_size, 1)
            context_mask = context_mask.repeat(self.beam_size, 1)
            for _ in range(self.max_length):
                if beam.done():
                    break
                out = self._decode(input_ids, context, context_mask)
                # only the newest position is needed to predict the next token
                hidden_states = out.permute([1, 0, 2]).contiguous()[:, -1, :]
                out = self.lsm(self.lm_head(hidden_states)).data
                beam.advance(out)
                # reorder the prefixes to follow the surviving hypotheses, then extend them
                input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
                input_ids = torch.cat((input_ids, beam.getCurrentState().to(device)), -1)
            hyp = beam.getHyp(beam.getFinal())
            pred = beam.buildTargetTokens(hyp)[:self.beam_size]
            pred = [torch.cat([x.view(-1) for x in p] + [zero] * (self.max_length - len(p))).view(1, -1)
                    for p in pred]
            preds.append(torch.cat(pred, 0).unsqueeze(0))

        preds = torch.cat(preds, 0)
        return preds
+         
+         
+ 
class Beam(object):
    """
    Beam-search state for a single source sequence.

    Parameters:

    * `size`- number of hypotheses kept at every step.
    * `sos`- id of the start-of-sequence token.
    * `eos`- id of the end-of-sequence token.
    * `device`- device for the initial state. Defaults to CUDA when it is
      available and to the CPU otherwise. Later steps live on the device of
      the scores passed to `advance`.
    """
    def __init__(self, size, sos, eos, device=None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.size = size
        self.device = torch.device(device)
        # The score for each translation on the beam.
        self.scores = torch.zeros(size, dtype=torch.float, device=self.device)
        # The backpointers at each time-step.
        self.prevKs = []
        # The outputs at each time-step.
        self.nextYs = [torch.zeros(size, dtype=torch.long, device=self.device)]
        self.nextYs[0][0] = sos
        # Has EOS topped the beam yet.
        self._eos = eos
        self.eosTop = False
        # (score, time-step, beam index) triples of finished hypotheses.
        self.finished = []

    def getCurrentState(self):
        "Get the outputs for the current timestep as a fresh (size, 1) tensor."
        # clone: callers modify the returned tensor in place
        return self.nextYs[-1].clone().view(-1, 1)

    def getCurrentOrigin(self):
        "Get the backpointers for the current timestep."
        return self.prevKs[-1]

    def advance(self, wordLk):
        """
        Given the scores over words for every hypothesis on the beam, pick the
        `size` best continuations and record them.

        Parameters:

        * `wordLk`- scores of advancing from the last step (K x words)
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
            # Don't let EOS have children.
            beamLk[self.nextYs[-1] == self._eos] = -1e20
        else:
            # first step: every row is the same start state, so use only row 0
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId indexes the flattened beam x word array, so recover which
        # beam and word each score came from. Floor division keeps the
        # back-pointers integral; `/` is true division on current PyTorch and
        # would produce floats that cannot be used as indices.
        prevK = bestScoresId // numWords
        self.prevKs.append(prevK)
        self.nextYs.append(bestScoresId - prevK * numWords)

        step = len(self.nextYs) - 1
        for i, tok in enumerate(self.nextYs[-1].tolist()):
            if tok == self._eos:
                self.finished.append((self.scores[i], step, i))

        # End condition is when top-of-beam is EOS.
        if self.nextYs[-1][0] == self._eos:
            self.eosTop = True

    def done(self):
        "True once EOS tops the beam and at least `size` hypotheses have finished."
        return self.eosTop and len(self.finished) >= self.size

    def getFinal(self):
        """
        Return up to `size` (score, time-step, beam index) triples: finished
        hypotheses sorted by score, topped up with the best unfinished ones.
        """
        if len(self.finished) == 0:
            self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
        self.finished.sort(key=lambda a: -a[0])
        if len(self.finished) != self.size:
            unfinished = []
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] != self._eos:
                    s = self.scores[i]
                    unfinished.append((s, len(self.nextYs) - 1, i))
            unfinished.sort(key=lambda a: -a[0])
            self.finished += unfinished[:self.size - len(self.finished)]
        return self.finished[:self.size]

    def getHyp(self, beam_res):
        """
        Walk back through the back-pointers to construct the full hypothesis
        for every (score, time-step, beam index) triple in `beam_res`.
        """
        hyps = []
        for _, timestep, k in beam_res:
            hyp = []
            for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
                hyp.append(self.nextYs[j + 1][k])
                k = self.prevKs[j][k]
            hyps.append(hyp[::-1])
        return hyps

    def buildTargetTokens(self, preds):
        "Truncate every hypothesis in `preds` at its first EOS token (exclusive)."
        sentence = []
        for pred in preds:
            tokens = []
            for tok in pred:
                if tok == self._eos:
                    break
                tokens.append(tok)
            sentence.append(tokens)
        return sentence
+         
--- a/code2nl/run.py 0 → 100644
View file @f552b83
+++ b/code2nl/run.py 0 → 100644
View file @f552b83
+ # coding=utf-8
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Fine-tune a pre-trained encoder (e.g. RoBERTa) together with a Transformer decoder for
+ code-to-natural-language generation: each example pairs `code_tokens` (source) with
+ `docstring_tokens` (target), and decoding at prediction time uses beam search.
+ """
+ 
+ from __future__ import absolute_import
+ import os
+ import sys
+ import bleu
+ import pickle
+ import torch
+ import json
+ import random
+ import logging
+ import argparse
+ import numpy as np
+ from io import open
+ from itertools import cycle
+ import torch.nn as nn
+ from model import Seq2Seq
+ from tqdm import tqdm, trange
+ from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
+ from torch.utils.data.distributed import DistributedSampler
+ from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
+                           RobertaConfig, RobertaModel, RobertaTokenizer)
# Maps the --model_type value to its (config, model, tokenizer) classes.
MODEL_CLASSES = {
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
}

# Root logging configuration for the script; messages are emitted from INFO upwards.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
+ 
class Example(object):
    """One training/test example: an index plus its source and target text."""

    def __init__(self, idx, source, target):
        # idx: identifier of the example; source/target: the input and expected text
        self.idx, self.source, self.target = idx, source, target
+ 
def read_examples(filename):
    """Read examples from a JSON-lines file.

    Every non-blank line must be a JSON object with the keys `code_tokens`
    and `docstring_tokens` (lists of strings). Blank lines are skipped
    instead of crashing the JSON parser.

    Returns a list of `Example` objects whose `idx` is the zero-based line
    number in the file, `source` the whitespace-normalized code and
    `target` the whitespace-normalized docstring.
    """
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            if not line:
                # e.g. a trailing empty line; not a JSON record
                continue
            js = json.loads(line)
            # newlines inside code tokens become spaces ...
            code = ' '.join(js['code_tokens']).replace('\n', ' ')
            code = ' '.join(code.strip().split())
            # ... while newlines inside docstring tokens are dropped
            nl = ' '.join(js['docstring_tokens']).replace('\n', '')
            nl = ' '.join(nl.strip().split())
            examples.append(Example(idx=idx, source=code, target=nl))
    return examples
+ 
+ 
class InputFeatures(object):
    """The model-ready features of a single training/test example."""

    def __init__(self, example_id, source_ids, target_ids, source_mask, target_mask):
        # identifier of the example the features were built from
        self.example_id = example_id
        # token ids and masks of the source sequence
        self.source_ids, self.source_mask = source_ids, source_mask
        # token ids and masks of the target sequence
        self.target_ids, self.target_mask = target_ids, target_mask
+         
+ 
+ 
def convert_examples_to_features(examples, tokenizer, args,stage=None):
    """Tokenize, truncate and pad every example into an `InputFeatures`.

    Source and target are wrapped in the tokenizer's cls/sep tokens and
    padded with the pad token id up to `args.max_source_length` and
    `args.max_target_length`; masks are 1 on real tokens and 0 on padding.
    When `stage` is "test" the target is the placeholder text "None".
    When `stage` is "train" the first five examples are logged.
    """
    features = []
    for example_index, example in enumerate(examples):
        # source: leave room for the two wrapping tokens
        source_body = tokenizer.tokenize(example.source)[:args.max_source_length - 2]
        source_tokens = [tokenizer.cls_token] + source_body + [tokenizer.sep_token]
        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
        source_mask = [1] * len(source_tokens)
        source_pad = args.max_source_length - len(source_ids)
        source_ids += [tokenizer.pad_token_id] * source_pad
        source_mask += [0] * source_pad

        # target
        if stage == "test":
            target_body = tokenizer.tokenize("None")
        else:
            target_body = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
        target_tokens = [tokenizer.cls_token] + target_body + [tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_mask = [1] * len(target_ids)
        target_pad = args.max_target_length - len(target_ids)
        target_ids += [tokenizer.pad_token_id] * target_pad
        target_mask += [0] * target_pad

        if example_index < 5 and stage == 'train':
            logger.info("*** Example ***")
            logger.info("idx: {}".format(example.idx))

            # '\u0120' is replaced by '_' only to make the logged tokens readable
            logger.info("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens]))
            logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
            logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))

            logger.info("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens]))
            logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
            logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))

        feature = InputFeatures(
            example_index,
            source_ids,
            target_ids,
            source_mask,
            target_mask,
        )
        features.append(feature)
    return features
+ 
+ 
+ 
def set_seed(args):
    """Seed Python's `random`, NumPy and PyTorch with `args.seed`.

    When `args.n_gpu` is positive, all CUDA devices are seeded as well.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
+         
+ def main():
+     parser = argparse.ArgumentParser()
+ 
+     ## Required parameters  
+     parser.add_argument("--model_type", default=None, type=str, required=True,
+                         help="Model type: e.g. roberta")
+     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                         help="Path to pre-trained model: e.g. roberta-base" )   
+     parser.add_argument("--output_dir", default=None, type=str, required=True,
+                         help="The output directory where the model predictions and checkpoints will be written.")
+     parser.add_argument("--load_model_path", default=None, type=str, 
+                         help="Path to trained model: Should contain the .bin files" )    
+     ## Other parameters
+     parser.add_argument("--train_filename", default=None, type=str, 
+                         help="The train filename. Should contain the .jsonl files for this task.")
+     parser.add_argument("--dev_filename", default=None, type=str, 
+                         help="The dev filename. Should contain the .jsonl files for this task.")
+     parser.add_argument("--test_filename", default=None, type=str, 
+                         help="The test filename. Should contain the .jsonl files for this task.")  
+     
+     parser.add_argument("--config_name", default="", type=str,
+                         help="Pretrained config name or path if not the same as model_name")
+     parser.add_argument("--tokenizer_name", default="", type=str,
+                         help="Pretrained tokenizer name or path if not the same as model_name") 
+     parser.add_argument("--max_source_length", default=64, type=int,
+                         help="The maximum total source sequence length after tokenization. Sequences longer "
+                              "than this will be truncated, sequences shorter will be padded.")
+     parser.add_argument("--max_target_length", default=32, type=int,
+                         help="The maximum total target sequence length after tokenization. Sequences longer "
+                              "than this will be truncated, sequences shorter will be padded.")
+     
+     parser.add_argument("--do_train", action='store_true',
+                         help="Whether to run training.")
+     parser.add_argument("--do_eval", action='store_true',
+                         help="Whether to run eval on the dev set.")
+     parser.add_argument("--do_test", action='store_true',
+                         help="Whether to run eval on the dev set.")
+     parser.add_argument("--do_lower_case", action='store_true',
+                         help="Set this flag if you are using an uncased model.")
+     parser.add_argument("--no_cuda", action='store_true',
+                         help="Avoid using CUDA when available") 
+     
+     parser.add_argument("--train_batch_size", default=8, type=int,
+                         help="Batch size per GPU/CPU for training.")
+     parser.add_argument("--eval_batch_size", default=8, type=int,
+                         help="Batch size per GPU/CPU for evaluation.")
+     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                         help="Number of updates steps to accumulate before performing a backward/update pass.")
+     parser.add_argument("--learning_rate", default=5e-5, type=float,
+                         help="The initial learning rate for Adam.")
+     parser.add_argument("--beam_size", default=10, type=int,
+                         help="beam size for beam search")    
+     parser.add_argument("--weight_decay", default=0.0, type=float,
+                         help="Weight deay if we apply some.")
+     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                         help="Epsilon for Adam optimizer.")
+     parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                         help="Max gradient norm.")
+     parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                         help="Total number of training epochs to perform.")
+     parser.add_argument("--max_steps", default=-1, type=int,
+                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+     parser.add_argument("--eval_steps", default=-1, type=int,
+                         help="")
+     parser.add_argument("--train_steps", default=-1, type=int,
+                         help="")
+     parser.add_argument("--warmup_steps", default=0, type=int,
+                         help="Linear warmup over warmup_steps.")
+     parser.add_argument("--local_rank", type=int, default=-1,
+                         help="For distributed training: local_rank")   
+     parser.add_argument('--seed', type=int, default=42,
+                         help="random seed for initialization")
+     # print arguments
+     args = parser.parse_args()
+     logger.info(args)
+ 
+     # Setup CUDA, GPU & distributed training
+     if args.local_rank == -1 or args.no_cuda:
+         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+         args.n_gpu = torch.cuda.device_count()
+     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+         torch.cuda.set_device(args.local_rank)
+         device = torch.device("cuda", args.local_rank)
+         torch.distributed.init_process_group(backend='nccl')
+         args.n_gpu = 1
+     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
+                     args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
+     args.device = device
+     # Set seed
+     set_seed(args)
+     # make dir if output_dir not exist
+     if os.path.exists(args.output_dir) is False:
+         os.makedirs(args.output_dir)
+         
+     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,do_lower_case=args.do_lower_case)
+     
+     #budild model
+     encoder = model_class.from_pretrained(args.model_name_or_path,config=config)    
+     decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
+     decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
+     model=Seq2Seq(encoder=encoder,decoder=decoder,config=config,
+                   beam_size=args.beam_size,max_length=args.max_target_length,
+                   sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)
+     if args.load_model_path is not None:
+         logger.info("reload model from {}".format(args.load_model_path))
+         model.load_state_dict(torch.load(args.load_model_path))
+         
+     model.to(device)
+     if args.local_rank != -1:
+         # Distributed training
+         try:
+             from apex.parallel import DistributedDataParallel as DDP
+         except ImportError:
+             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+ 
+         model = DDP(model)
+     elif args.n_gpu > 1:
+         # multi-gpu training
+         model = torch.nn.DataParallel(model)
+ 
+ 
+ 
+ 
+     if args.do_train:
+         # Prepare training data loader
+         train_examples = read_examples(args.train_filename)
+         train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
+         all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
+         all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long)
+         all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
+         all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long)    
+         train_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)
+         
+         if args.local_rank == -1:
+             train_sampler = RandomSampler(train_data)
+         else:
+             train_sampler = DistributedSampler(train_data)
+         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps)
+ 
+         num_train_optimization_steps =  args.train_steps
+ 
+         # Prepare optimizer and schedule (linear warmup and decay)
+         no_decay = ['bias', 'LayerNorm.weight']
+         optimizer_grouped_parameters = [
+             {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+              'weight_decay': args.weight_decay},
+             {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+         ]
+         optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+         scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
+                                                     num_training_steps=num_train_optimization_steps)
+     
+         
+         #Start training
+         logger.info("***** Running training *****")
+         logger.info("  Num examples = %d", len(train_examples))
+         logger.info("  Batch size = %d", args.train_batch_size)
+         logger.info("  Num epoch = %d", num_train_optimization_steps*args.train_batch_size//len(train_examples))
+         
+ 
+         model.train()
+         dev_dataset={}
+         nb_tr_examples, nb_tr_steps,tr_loss,global_step,best_bleu,best_loss = 0, 0,0,0,0,1e6 
+         bar = tqdm(range(num_train_optimization_steps),total=num_train_optimization_steps)
+         train_dataloader=cycle(train_dataloader)
+         eval_flag = True
+         for step in bar:
+             batch = next(train_dataloader)
+             batch = tuple(t.to(device) for t in batch)
+             source_ids,source_mask,target_ids,target_mask = batch
+             loss,_,_ = model(source_ids=source_ids,source_mask=source_mask,target_ids=target_ids,target_mask=target_mask)
+             
+             if args.n_gpu > 1:
+                 loss = loss.mean() # mean() to average on multi-gpu.
+             if args.gradient_accumulation_steps > 1:
+                 loss = loss / args.gradient_accumulation_steps
+             tr_loss += loss.item()
+             train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4)
+             bar.set_description("loss {}".format(train_loss))
+             nb_tr_examples += source_ids.size(0)
+             nb_tr_steps += 1
+             loss.backward()
+ 
+             if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
+                 #Update parameters
+                 optimizer.step()
+                 optimizer.zero_grad()
+                 scheduler.step()
+                 global_step += 1
+                 eval_flag = True
+                 
+             if args.do_eval and ((global_step + 1) %args.eval_steps == 0) and eval_flag:
+                 #Eval model with dev dataset
+                 tr_loss = 0
+                 nb_tr_examples, nb_tr_steps = 0, 0                     
+                 eval_flag=False    
+                 if 'dev_loss' in dev_dataset:
+                     eval_examples,eval_data=dev_dataset['dev_loss']
+                 else:
+                     eval_examples = read_examples(args.dev_filename)
+                     eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
+                     all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+                     all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
+                     all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
+                     all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long)      
+                     eval_data = TensorDataset(all_source_ids,all_source_mask,all_target_ids,all_target_mask)   
+                     dev_dataset['dev_loss']=eval_examples,eval_data
+                 eval_sampler = SequentialSampler(eval_data)
+                 eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+                 
+                 logger.info("\n***** Running evaluation *****")
+                 logger.info("  Num examples = %d", len(eval_examples))
+                 logger.info("  Batch size = %d", args.eval_batch_size)
+ 
+                 #Start Evaling model
+                 model.eval()
+                 eval_loss,tokens_num = 0,0
+                 for batch in eval_dataloader:
+                     batch = tuple(t.to(device) for t in batch)
+                     source_ids,source_mask,target_ids,target_mask = batch                  
+ 
+                     with torch.no_grad():
+                         _,loss,num = model(source_ids=source_ids,source_mask=source_mask,
+                                            target_ids=target_ids,target_mask=target_mask)     
+                     eval_loss += loss.sum().item()
+                     tokens_num += num.sum().item()
+                 #Print loss of dev dataset
+                 model.train()
+                 eval_loss = eval_loss / tokens_num
+                 result = {'eval_ppl': round(np.exp(eval_loss),5),
+                           'global_step': global_step+1,
+                           'train_loss': round(train_loss,5)}
+                 for key in sorted(result.keys()):
+                     logger.info("  %s = %s", key, str(result[key]))
+                 logger.info("  "+"*"*20)   
+                 
+                 #save last checkpoint
+                 last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
+                 if not os.path.exists(last_output_dir):
+                     os.makedirs(last_output_dir)
+                 model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+                 output_model_file = os.path.join(last_output_dir, "pytorch_model.bin")
+                 torch.save(model_to_save.state_dict(), output_model_file)                    
+                 if eval_loss<best_loss:
+                     logger.info("  Best ppl:%s",round(np.exp(eval_loss),5))
+                     logger.info("  "+"*"*20)
+                     best_loss=eval_loss
+                     # Save best checkpoint for best ppl
+                     output_dir = os.path.join(args.output_dir, 'checkpoint-best-ppl')
+                     if not os.path.exists(output_dir):
+                         os.makedirs(output_dir)
+                     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+                     output_model_file = os.path.join(output_dir, "pytorch_model.bin")
+                     torch.save(model_to_save.state_dict(), output_model_file)  
+                             
+                             
+                 #Calculate bleu  
+                 if 'dev_bleu' in dev_dataset:
+                     eval_examples,eval_data=dev_dataset['dev_bleu']
+                 else:
+                     eval_examples = read_examples(args.dev_filename)
+                     eval_examples = random.sample(eval_examples,min(1000,len(eval_examples)))
+                     eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
+                     all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+                     all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)    
+                     eval_data = TensorDataset(all_source_ids,all_source_mask)   
+                     dev_dataset['dev_bleu']=eval_examples,eval_data
+ 
+ 
+                 
+                 eval_sampler = SequentialSampler(eval_data)
+                 eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+ 
+                 model.eval() 
+                 p=[]
+                 for batch in eval_dataloader:
+                     batch = tuple(t.to(device) for t in batch)
+                     source_ids,source_mask= batch                  
+                     with torch.no_grad():
+                         preds = model(source_ids=source_ids,source_mask=source_mask)  
+                         for pred in preds:
+                             t=pred[0].cpu().numpy()
+                             t=list(t)
+                             if 0 in t:
+                                 t=t[:t.index(0)]
+                             text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
+                             p.append(text)
+                 model.train()
+                 predictions=[]
+                 with open(os.path.join(args.output_dir,"dev.output"),'w') as f, open(os.path.join(args.output_dir,"dev.gold"),'w') as f1:
+                     for ref,gold in zip(p,eval_examples):
+                         predictions.append(str(gold.idx)+'\t'+ref)
+                         f.write(str(gold.idx)+'\t'+ref+'\n')
+                         f1.write(str(gold.idx)+'\t'+gold.target+'\n')     
+ 
+                 (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "dev.gold")) 
+                 dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0],2)
+                 logger.info("  %s = %s "%("bleu-4",str(dev_bleu)))
+                 logger.info("  "+"*"*20)    
+                 if dev_bleu>best_bleu:
+                     logger.info("  Best bleu:%s",dev_bleu)
+                     logger.info("  "+"*"*20)
+                     best_bleu=dev_bleu
+                     # Save best checkpoint for best bleu
+                     output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu')
+                     if not os.path.exists(output_dir):
+                         os.makedirs(output_dir)
+                     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+                     output_model_file = os.path.join(output_dir, "pytorch_model.bin")
+                     torch.save(model_to_save.state_dict(), output_model_file)
+                
+     if args.do_test:
+         files=[]
+         if args.dev_filename is not None:
+             files.append(args.dev_filename)
+         if args.test_filename is not None:
+             files.append(args.test_filename)
+         for idx,file in enumerate(files):   
+             logger.info("Test file: {}".format(file))
+             eval_examples = read_examples(file)
+             eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
+             all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
+             all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)    
+             eval_data = TensorDataset(all_source_ids,all_source_mask)   
+ 
+             # Calculate bleu
+             eval_sampler = SequentialSampler(eval_data)
+             eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+ 
+             model.eval() 
+             p=[]
+             for batch in tqdm(eval_dataloader,total=len(eval_dataloader)):
+                 batch = tuple(t.to(device) for t in batch)
+                 source_ids,source_mask= batch                  
+                 with torch.no_grad():
+                     preds = model(source_ids=source_ids,source_mask=source_mask)  
+                     for pred in preds:
+                         t=pred[0].cpu().numpy()
+                         t=list(t)
+                         if 0 in t:
+                             t=t[:t.index(0)]
+                         text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
+                         p.append(text)
+             model.train()
+             predictions=[]
+             with open(os.path.join(args.output_dir,"test_{}.output".format(str(idx))),'w') as f, open(os.path.join(args.output_dir,"test_{}.gold".format(str(idx))),'w') as f1:
+                 for ref,gold in zip(p,eval_examples):
+                     predictions.append(str(gold.idx)+'\t'+ref)
+                     f.write(str(gold.idx)+'\t'+ref+'\n')
+                     f1.write(str(gold.idx)+'\t'+gold.target+'\n')     
+ 
+             (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "test_{}.gold".format(idx))) 
+             dev_bleu=round(bleu.bleuFromMaps(goldMap, predictionMap)[0],2)
+             logger.info("  %s = %s "%("bleu-4",str(dev_bleu)))
+             logger.info("  "+"*"*20)    
+ 
+ 
+ 
+                             
+ 
+                 
+                 
+ # Script entry point: call main() only when this file is executed directly,
+ # not when it is imported as a module.
+ if __name__ == "__main__":
+     main()
+ 
+