graykode

(add) code2nl finetuning code

# ---------------- bleu.py ----------------

#!/usr/bin/python

'''
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
'''

# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $

'''Provides:

cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.

score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.

The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
'''

import sys, math, re, xml.sax.saxutils

# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0

preserve_case = False
eff_ref_len = "shortest"

normalize1 = [
    ('<skipped>', ''),          # strip "skipped" tags
    (r'-\n', ''),               # strip end-of-line hyphenation and join lines
    (r'\n', ' '),               # join lines
#   (r'(\d)\s+(?=\d)', r'\1'),  # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

normalize2 = [
    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', r' \1 '),  # tokenize punctuation. apostrophe is missing
    (r'([^0-9])([\.,])', r'\1 \2 '),               # tokenize period and comma unless preceded by a digit
    (r'([\.,])([^0-9])', r' \1 \2'),               # tokenize period and comma unless followed by a digit
    (r'([0-9])(-)', r'\1 \2 ')                     # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]

def normalize(s):
    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
    if nonorm:
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
    s = xml.sax.saxutils.unescape(s, {'&quot;': '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()
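
# For example (a sketch, with the default settings above):
#   normalize('Hello, world!')  ->  ['hello', ',', 'world', '!']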

def count_ngrams(words, n=4):
    counts = {}
    for k in range(1, n+1):
        for i in range(len(words)-k+1):
            ngram = tuple(words[i:i+k])
            counts[ngram] = counts.get(ngram, 0)+1
    return counts
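
# For example (a sketch):
#   count_ngrams(['the', 'cat', 'sat'], n=2)
#   -> {('the',): 1, ('cat',): 1, ('sat',): 1, ('the', 'cat'): 1, ('cat', 'sat'): 1}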

def cook_refs(refs, n=4):
    '''Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.'''

    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)

def cook_test(test, item, n=4):
    '''Takes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.'''
    (reflens, refmaxcounts) = item
    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.

    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
        result["reflen"] = float(sum(reflens))/len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
            if min_diff is None or abs(reflen-len(test)) < min_diff:
                min_diff = abs(reflen-len(test))
                result['reflen'] = reflen

    result["guess"] = [max(len(test)-k+1, 0) for k in range(1, n+1)]

    result['correct'] = [0]*n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.items():
        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram, 0), count)

    return result

def score_cooked(allcomps, n=4, ground=0, smooth=1):
    totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0]*n, 'correct': [0]*n}
    for comps in allcomps:
        for key in ['testlen', 'reflen']:
            totalcomps[key] += comps[key]
        for key in ['guess', 'correct']:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    all_bleus = []
    for k in range(n):
        correct = totalcomps['correct'][k]
        guess = totalcomps['guess'][k]
        addsmooth = 0
        if smooth == 1 and k > 0:
            addsmooth = 1
        logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(guess + addsmooth + sys.float_info.min)
        if guess == 0:
            all_bleus.append(-10000000)
        else:
            all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))

    logbleu /= float(n)
    all_bleus.insert(0, logbleu)

    brevPenalty = min(0, 1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1))
    for i in range(len(all_bleus)):
        if i == 0:
            all_bleus[i] += brevPenalty
        all_bleus[i] = math.exp(all_bleus[i])
    return all_bleus

def bleu(refs, candidate, ground=0, smooth=1):
    refs = cook_refs(refs)
    test = cook_test(candidate, refs)
    return score_cooked([test], ground=ground, smooth=smooth)
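
# A minimal sketch of the three-phase API described above (the example
# sentences here are hypothetical, not part of the original script):
def _bleu_demo():
    refs = cook_refs(['the cat sat on the mat', 'a cat sat on the mat'])
    cooked = cook_test('the cat sat on a mat', refs)
    # score_cooked() returns [BLEU, 1-gram precision, ..., n-gram precision].
    return score_cooked([cooked])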

def splitPuncts(line):
    return ' '.join(re.findall(r"[\w]+|[^\s\w]", line))

def computeMaps(predictions, goldfile):
    predictionMap = {}
    goldMap = {}

    for row in predictions:
        cols = row.strip().split('\t')
        if len(cols) == 1:
            (rid, pred) = (cols[0], '')
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    with open(goldfile, 'r') as gf:
        for row in gf:
            (rid, pred) = row.split('\t', 1)  # split on the first tab only, in case the text itself contains tabs
            if rid in predictionMap:  # Only insert if the id exists for the method
                if rid not in goldMap:
                    goldMap[rid] = []
                goldMap[rid].append(splitPuncts(pred.strip().lower()))

    sys.stderr.write('Total: ' + str(len(goldMap)) + '\n')
    return (goldMap, predictionMap)
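
# Both the prediction rows and the gold file are "id<TAB>sentence" lines;
# only gold rows whose id also appears in the predictions are kept.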

# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
    score = [0] * 5
    num = 0.0

    for key in m1:
        if key in m2:
            bl = bleu(m1[key], m2[key][0])
            score = [score[i] + bl[i] for i in range(0, len(bl))]
            num += 1
    return [s * 100.0 / num for s in score]

if __name__ == '__main__':
    reference_file = sys.argv[1]
    predictions = []
    for row in sys.stdin:
        predictions.append(row)
    (goldMap, predictionMap) = computeMaps(predictions, reference_file)
    print(bleuFromMaps(goldMap, predictionMap)[0])
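
# Typical invocation (a sketch): the gold file is passed as argv[1] and the
# predictions arrive on stdin as "id<TAB>text" rows, e.g.
#   python bleu.py dev.gold < dev.output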


# ---------------- model.py ----------------

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    """
    Build a Sequence-to-Sequence model.

    Parameters:

    * `encoder`- encoder of seq2seq model. e.g. roberta
    * `decoder`- decoder of seq2seq model. e.g. transformer
    * `config`- configuration of encoder model.
    * `beam_size`- beam size for beam search.
    * `max_length`- max length of target for beam search.
    * `sos_id`- start-of-sequence symbol id in target for beam search.
    * `eos_id`- end-of-sequence symbol id in target for beam search.
    """
    def __init__(self, encoder, decoder, config, beam_size=None, max_length=None, sos_id=None, eos_id=None):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.lsm = nn.LogSoftmax(dim=-1)
        self.tie_weights()

        self.beam_size = beam_size
        self.max_length = max_length
        self.sos_id = sos_id
        self.eos_id = eos_id

    def _tie_or_clone_weights(self, first_module, second_module):
        """ Tie or clone module weights depending on whether we are using TorchScript or not
        """
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight

    def tie_weights(self):
        """ Make sure we are sharing the input and output embeddings.
            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(self.lm_head,
                                   self.encoder.embeddings.word_embeddings)

    def forward(self, source_ids=None, source_mask=None, target_ids=None, target_mask=None, args=None):
        outputs = self.encoder(source_ids, attention_mask=source_mask)
        encoder_output = outputs[0].permute([1, 0, 2]).contiguous()
        if target_ids is not None:
            attn_mask = -1e4 * (1 - self.bias[:target_ids.shape[1], :target_ids.shape[1]])
            tgt_embeddings = self.encoder.embeddings(target_ids).permute([1, 0, 2]).contiguous()
            out = self.decoder(tgt_embeddings, encoder_output, tgt_mask=attn_mask,
                               memory_key_padding_mask=(1-source_mask).bool())
            hidden_states = torch.tanh(self.dense(out)).permute([1, 0, 2]).contiguous()
            lm_logits = self.lm_head(hidden_states)
            # Shift so that tokens < n predict n
            active_loss = target_mask[..., 1:].ne(0).view(-1) == 1
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = target_ids[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))[active_loss],
                            shift_labels.view(-1)[active_loss])

            outputs = loss, loss*active_loss.sum(), active_loss.sum()
            return outputs
        else:
            # Predict (beam search; this path assumes a CUDA device, see Beam below)
            preds = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(source_ids.shape[0]):
                context = encoder_output[:, i:i+1]
                context_mask = source_mask[i:i+1, :]
                beam = Beam(self.beam_size, self.sos_id, self.eos_id)
                input_ids = beam.getCurrentState()
                context = context.repeat(1, self.beam_size, 1)
                context_mask = context_mask.repeat(self.beam_size, 1)
                for _ in range(self.max_length):
                    if beam.done():
                        break
                    attn_mask = -1e4 * (1 - self.bias[:input_ids.shape[1], :input_ids.shape[1]])
                    tgt_embeddings = self.encoder.embeddings(input_ids).permute([1, 0, 2]).contiguous()
                    out = self.decoder(tgt_embeddings, context, tgt_mask=attn_mask,
                                       memory_key_padding_mask=(1-context_mask).bool())
                    out = torch.tanh(self.dense(out))
                    hidden_states = out.permute([1, 0, 2]).contiguous()[:, -1, :]
                    out = self.lsm(self.lm_head(hidden_states)).data
                    beam.advance(out)
                    input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
                    input_ids = torch.cat((input_ids, beam.getCurrentState()), -1)
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[:self.beam_size]
                pred = [torch.cat([x.view(-1) for x in p]+[zero]*(self.max_length-len(p))).view(1, -1) for p in pred]
                preds.append(torch.cat(pred, 0).unsqueeze(0))

            preds = torch.cat(preds, 0)
            return preds
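
    # In short (a sketch of the two call modes): with target_ids the forward
    # pass returns (mean loss, summed token loss, active token count); without
    # target_ids it returns a [batch, beam_size, max_length] tensor of token ids.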

class Beam(object):
    def __init__(self, size, sos, eos):
        self.size = size
        self.tt = torch.cuda
        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()
        # The backpointers at each time-step.
        self.prevKs = []
        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size)
                       .fill_(0)]
        self.nextYs[0][0] = sos
        # Has EOS topped the beam yet.
        self._eos = eos
        self.eosTop = False
        # Time and k pair for finished.
        self.finished = []

    def getCurrentState(self):
        "Get the outputs for the current timestep."
        batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
        return batch

    def getCurrentOrigin(self):
        "Get the backpointers for the current timestep."
        return self.prevKs[-1]

    def advance(self, wordLk):
        """
        Given prob over words for every last beam `wordLk` and attention
        `attnOut`: Compute and update the beam search.

        Parameters:

        * `wordLk`- probs of advancing from the last step (K x words)
        * `attnOut`- attention at the last step

        Returns: True if beam search is complete.
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)

            # Don't let EOS have children.
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] == self._eos:
                    beamLk[i] = -1e20
        else:
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from. Use integer (floor) division:
        # plain `/` on LongTensors returns floats on recent PyTorch versions,
        # which breaks the index_select on the backpointers.
        prevK = bestScoresId // numWords
        self.prevKs.append(prevK)
        self.nextYs.append((bestScoresId - prevK * numWords))

        for i in range(self.nextYs[-1].size(0)):
            if self.nextYs[-1][i] == self._eos:
                s = self.scores[i]
                self.finished.append((s, len(self.nextYs) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if self.nextYs[-1][0] == self._eos:
            self.eosTop = True

    def done(self):
        return self.eosTop and len(self.finished) >= self.size

    def getFinal(self):
        if len(self.finished) == 0:
            self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
        self.finished.sort(key=lambda a: -a[0])
        if len(self.finished) != self.size:
            unfinished = []
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] != self._eos:
                    s = self.scores[i]
                    unfinished.append((s, len(self.nextYs) - 1, i))
            unfinished.sort(key=lambda a: -a[0])
            self.finished += unfinished[:self.size-len(self.finished)]
        return self.finished[:self.size]

    def getHyp(self, beam_res):
        """
        Walk back to construct the full hypothesis.
        """
        hyps = []
        for _, timestep, k in beam_res:
            hyp = []
            for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
                hyp.append(self.nextYs[j+1][k])
                k = self.prevKs[j][k]
            hyps.append(hyp[::-1])
        return hyps

    def buildTargetTokens(self, preds):
        sentence = []
        for pred in preds:
            tokens = []
            for tok in pred:
                if tok == self._eos:
                    break
                tokens.append(tok)
            sentence.append(tokens)
        return sentence
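
# A minimal CPU smoke test for the training path (a sketch: the tiny config
# below is arbitrary, and the beam-search predict path additionally needs CUDA):
if __name__ == "__main__":
    from transformers import RobertaConfig, RobertaModel

    config = RobertaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=1,
                           num_attention_heads=2, intermediate_size=64)
    encoder = RobertaModel(config)  # randomly initialized, no download needed
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size,
                                               nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
    model = Seq2Seq(encoder, decoder, config, beam_size=2, max_length=8,
                    sos_id=0, eos_id=2)
    source_ids = torch.randint(0, 100, (2, 10))
    target_ids = torch.randint(0, 100, (2, 8))
    loss, _, _ = model(source_ids, torch.ones_like(source_ids),
                       target_ids, torch.ones_like(target_ids))
    print(loss.item())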


# ---------------- run.py (fine-tuning entry script; filename assumed) ----------------

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tune a RoBERTa encoder with a Transformer decoder (the Seq2Seq model above)
on the code2nl task: generating a natural-language docstring from source code.
"""

from __future__ import absolute_import
import os
import bleu
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)

MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

class Example(object):
    """A single training/test example."""
    def __init__(self,
                 idx,
                 source,
                 target,
                 ):
        self.idx = idx
        self.source = source
        self.target = target


def read_examples(filename):
    """Read examples from filename."""
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            js = json.loads(line)
            if 'idx' not in js:
                js['idx'] = idx
            code = ' '.join(js['code_tokens']).replace('\n', ' ')
            code = ' '.join(code.strip().split())
            nl = ' '.join(js['docstring_tokens']).replace('\n', '')
            nl = ' '.join(nl.strip().split())
            examples.append(
                Example(
                    idx=idx,
                    source=code,
                    target=nl,
                )
            )
    return examples
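
# Each input line is a JSON object; the fields actually consumed are
# 'code_tokens' and 'docstring_tokens' (plus an optional 'idx'). A sketch:
#   {"idx": 0,
#    "code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":", "return", "a", "+", "b"],
#    "docstring_tokens": ["Add", "two", "numbers", "."]}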


class InputFeatures(object):
    """Features for a single training/test example."""
    def __init__(self,
                 example_id,
                 source_ids,
                 target_ids,
                 source_mask,
                 target_mask,
                 ):
        self.example_id = example_id
        self.source_ids = source_ids
        self.target_ids = target_ids
        self.source_mask = source_mask
        self.target_mask = target_mask


def convert_examples_to_features(examples, tokenizer, args, stage=None):
    features = []
    for example_index, example in enumerate(examples):
        # source
        source_tokens = tokenizer.tokenize(example.source)[:args.max_source_length-2]
        source_tokens = [tokenizer.cls_token]+source_tokens+[tokenizer.sep_token]
        source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
        source_mask = [1] * (len(source_tokens))
        padding_length = args.max_source_length - len(source_ids)
        source_ids += [tokenizer.pad_token_id]*padding_length
        source_mask += [0]*padding_length

        # target
        if stage == "test":
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
        target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_mask = [1] * len(target_ids)
        padding_length = args.max_target_length - len(target_ids)
        target_ids += [tokenizer.pad_token_id]*padding_length
        target_mask += [0]*padding_length

        if example_index < 5:
            if stage == 'train':
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example.idx))

                logger.info("source_tokens: {}".format([x.replace('\u0120', '_') for x in source_tokens]))
                logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
                logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))

                logger.info("target_tokens: {}".format([x.replace('\u0120', '_') for x in target_tokens]))
                logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
                logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))

        features.append(
            InputFeatures(
                example_index,
                source_ids,
                target_ids,
                source_mask,
                target_mask,
            )
        )
    return features
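
# Resulting layout per example (a sketch): both sides are
#   [CLS] tokens ... [SEP] [PAD] ...
# with mask 1 over real tokens and 0 over padding; at test time the target is
# the placeholder string "None" since the reference is not used for decoding.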


def set_seed(args):
    """Set random seed."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type: e.g. roberta")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model: Should contain the .bin files")

    ## Other parameters
    parser.add_argument("--train_filename", default=None, type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default=None, type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default=None, type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")

    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--max_source_length", default=64, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=32, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")

    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")

    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--eval_steps", default=-1, type=int,
                        help="Evaluate on the dev set every X update steps.")
    parser.add_argument("--train_steps", default=-1, type=int,
                        help="Total number of optimizer update steps to train for.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    # print arguments
    args = parser.parse_args()
    logger.info(args)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    args.device = device
    # Set seed
    set_seed(args)
    # Make output_dir if it does not exist
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case)

    # Build model
    encoder = model_class.from_pretrained(args.model_name_or_path, config=config)
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path))

    model.to(device)
    if args.local_rank != -1:
        # Distributed training
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif args.n_gpu > 1:
        # multi-gpu training
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare training data loader
        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
        all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
        all_source_mask = torch.tensor([f.source_mask for f in train_features], dtype=torch.long)
        all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
        all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size//args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps)

        # Start training
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num epoch = %d", num_train_optimization_steps*args.train_batch_size//len(train_examples))

        model.train()
        dev_dataset = {}
        nb_tr_examples, nb_tr_steps, tr_loss, global_step, best_bleu, best_loss = 0, 0, 0, 0, 0, 1e6
        bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)
        eval_flag = True
        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            source_ids, source_mask, target_ids, target_mask = batch
            loss, _, _ = model(source_ids=source_ids, source_mask=source_mask,
                               target_ids=target_ids, target_mask=target_mask)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1), 4)
            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += source_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                # Update parameters
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                eval_flag = True

            if args.do_eval and ((global_step + 1) % args.eval_steps == 0) and eval_flag:
                # Eval model with dev dataset
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                eval_flag = False
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
                    all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
                    all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask, all_target_ids, all_target_mask)
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                logger.info("\n***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                # Start evaluating the model
                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask, target_ids, target_mask = batch

                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids, source_mask=source_mask,
                                             target_ids=target_ids, target_mask=target_mask)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()
                # Print loss of dev dataset
                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss), 5),
                          'global_step': global_step+1,
                          'train_loss': round(train_loss, 5)}
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                logger.info("  "+"*"*20)

                # Save last checkpoint
                last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
                if not os.path.exists(last_output_dir):
                    os.makedirs(last_output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(last_output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                if eval_loss < best_loss:
                    logger.info("  Best ppl:%s", round(np.exp(eval_loss), 5))
                    logger.info("  "+"*"*20)
                    best_loss = eval_loss
                    # Save best checkpoint for best ppl
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-ppl')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

                # Calculate BLEU on a dev sample
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_examples = random.sample(eval_examples, min(1000, len(eval_examples)))
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_source_mask)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                model.eval()
                p = []
                for batch in eval_dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    source_ids, source_mask = batch
                    with torch.no_grad():
                        preds = model(source_ids=source_ids, source_mask=source_mask)
                        for pred in preds:
                            t = pred[0].cpu().numpy()
                            t = list(t)
                            if 0 in t:
                                t = t[:t.index(0)]
                            text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                            p.append(text)
                model.train()
                predictions = []
                with open(os.path.join(args.output_dir, "dev.output"), 'w') as f, \
                     open(os.path.join(args.output_dir, "dev.gold"), 'w') as f1:
                    for ref, gold in zip(p, eval_examples):
                        predictions.append(str(gold.idx)+'\t'+ref)
                        f.write(str(gold.idx)+'\t'+ref+'\n')
                        f1.write(str(gold.idx)+'\t'+gold.target+'\n')

                (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "dev.gold"))
                dev_bleu = round(bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)
                logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
                logger.info("  "+"*"*20)
                if dev_bleu > best_bleu:
                    logger.info("  Best bleu:%s", dev_bleu)
                    logger.info("  "+"*"*20)
                    best_bleu = dev_bleu
                    # Save best checkpoint for best bleu
                    output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu')
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_test:
        files = []
        if args.dev_filename is not None:
            files.append(args.dev_filename)
        if args.test_filename is not None:
            files.append(args.test_filename)
        for idx, file in enumerate(files):
            logger.info("Test file: {}".format(file))
            eval_examples = read_examples(file)
            eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
            all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
            all_source_mask = torch.tensor([f.source_mask for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_source_ids, all_source_mask)

            # Calculate BLEU
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            p = []
            for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
                batch = tuple(t.to(device) for t in batch)
                source_ids, source_mask = batch
                with torch.no_grad():
                    preds = model(source_ids=source_ids, source_mask=source_mask)
                    for pred in preds:
                        t = pred[0].cpu().numpy()
                        t = list(t)
                        if 0 in t:
                            t = t[:t.index(0)]
                        text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                        p.append(text)
            model.train()
            predictions = []
            with open(os.path.join(args.output_dir, "test_{}.output".format(str(idx))), 'w') as f, \
                 open(os.path.join(args.output_dir, "test_{}.gold".format(str(idx))), 'w') as f1:
                for ref, gold in zip(p, eval_examples):
                    predictions.append(str(gold.idx)+'\t'+ref)
                    f.write(str(gold.idx)+'\t'+ref+'\n')
                    f1.write(str(gold.idx)+'\t'+gold.target+'\n')

            (goldMap, predictionMap) = bleu.computeMaps(predictions, os.path.join(args.output_dir, "test_{}.gold".format(idx)))
            dev_bleu = round(bleu.bleuFromMaps(goldMap, predictionMap)[0], 2)
            logger.info("  %s = %s " % ("bleu-4", str(dev_bleu)))
            logger.info("  "+"*"*20)


if __name__ == "__main__":
    main()
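
# Example invocation (a sketch; the script name, paths and model name are placeholders):
#   python run.py --model_type roberta --model_name_or_path microsoft/codebert-base \
#     --do_train --do_eval --train_filename train.jsonl --dev_filename dev.jsonl \
#     --output_dir ./output --max_source_length 256 --max_target_length 128 \
#     --train_batch_size 32 --eval_batch_size 32 --learning_rate 5e-5 \
#     --train_steps 50000 --eval_steps 1000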