graykode / commit-autosuggestions
Authored by graykode on 2020-11-02 20:13:43 +0900
Commit 042ef27aaccc096d3fbc0110847f4e0479f86551
1 parent 4c9d9868
(add) unittest for api
Showing 2 changed files with 173 additions and 0 deletions
src/api.py
src/test.source
src/api.py 0 → 100644
# Copyright 2020-present Tae Hwan Jung
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import torch
import logging
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import (RobertaConfig, RobertaTokenizer)

import argparse
import whatthepatch
from train.run import (Example, convert_examples_to_features)
from train.model import Seq2Seq
from train.customized_roberta import RobertaModel

MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def create_examples(diff, tokenizer):
    """Turn a unified diff into Example objects with tokenized added/deleted lines."""
    examples = []
    for idx, example in enumerate(whatthepatch.parse_patch(diff)):
        added, deleted = [], []
        for change in example.changes:
            # whatthepatch sets old to None for added lines and new to None
            # for deleted lines; context lines have both and are skipped.
            if change.old is None and change.new is not None:
                added.extend(tokenizer.tokenize(change.line))
            elif change.old is not None and change.new is None:
                deleted.extend(tokenizer.tokenize(change.line))
        examples.append(
            Example(idx=idx, added=added, deleted=deleted, target=None)
        )
    return examples


def main(args):
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name,
                                                do_lower_case=args.do_lower_case)

    # build model
    encoder = model_class(config=config)
    decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size,
                                               nhead=config.num_attention_heads)
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
    model = Seq2Seq(encoder=encoder, decoder=decoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.cls_token_id, eos_id=tokenizer.sep_token_id)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path))
        model.load_state_dict(torch.load(args.load_model_path), strict=False)
    model.to(args.device)

    with open("test.source", "r") as f:
        eval_examples = create_examples(f.read(), tokenizer)

    test_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
    all_source_ids = torch.tensor([f.source_ids for f in test_features], dtype=torch.long)
    all_source_mask = torch.tensor([f.source_mask for f in test_features], dtype=torch.long)
    all_patch_ids = torch.tensor([f.patch_ids for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_source_ids, all_source_mask, all_patch_ids)

    # Run inference over the whole test set in a single sequential batch
    eval_sampler = SequentialSampler(test_data)
    eval_dataloader = DataLoader(test_data, sampler=eval_sampler, batch_size=len(test_data))

    model.eval()
    for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
        batch = tuple(t.to(args.device) for t in batch)
        source_ids, source_mask, patch_ids = batch
        with torch.no_grad():
            preds = model(source_ids=source_ids, source_mask=source_mask, patch_ids=patch_ids)
            for pred in preds:
                # Take the top beam, cut at the first zero (padding) id, and decode.
                t = pred[0].cpu().numpy()
                t = list(t)
                if 0 in t:
                    t = t[:t.index(0)]
                text = tokenizer.decode(t, clean_up_tokenization_spaces=False)
                print(text)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--load_model_path", default=None, type=str, required=True,
                        help="Path to trained model: Should contain the .bin files")
    parser.add_argument("--model_type", default='roberta', type=str,
                        help="Model type: e.g. roberta")
    parser.add_argument("--config_name", default="microsoft/codebert-base", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", type=str, default="microsoft/codebert-base",
                        help="The name of tokenizer")
    parser.add_argument("--max_source_length", default=256, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=128, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    args = parser.parse_args()
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    main(args)
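The script is driven entirely by its argparse flags (e.g. python api.py --load_model_path <path to the trained .bin checkpoint>) and reads the diff to summarize from test.source in the working directory. Independent of the model, the diff-parsing step can be checked in isolation. Below is a minimal sketch of the same whatthepatch logic that create_examples uses; DummyTokenizer and the sample diff are hypothetical stand-ins so neither the CodeBERT weights nor the train package is needed:

import whatthepatch

class DummyTokenizer:
    # Hypothetical stand-in for RobertaTokenizer: whitespace tokenization only.
    def tokenize(self, line):
        return line.split()

DIFF = """\
diff --git a/hello.py b/hello.py
--- a/hello.py
+++ b/hello.py
@@ -1 +1 @@
-print('hi')
+print('hello world')
"""

tokenizer = DummyTokenizer()
for idx, patch in enumerate(whatthepatch.parse_patch(DIFF)):
    # old is None -> the line exists only on the new side (an addition);
    # new is None -> the line exists only on the old side (a deletion).
    added = [t for c in patch.changes if c.old is None
             for t in tokenizer.tokenize(c.line)]
    deleted = [t for c in patch.changes if c.new is None
               for t in tokenizer.tokenize(c.line)]
    print(idx, added, deleted)
    # expected: 0 ["print('hello", "world')"] ["print('hi')"]

Because whatthepatch reports line numbers for both sides of each hunk, the old is None / new is None test is a reliable way to classify additions and deletions without inspecting the +/- prefixes directly.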
src/test.source 0 → 100644
diff --git a/src/train/model.py b/src/train/model.py
index 20e56b3..cab82e5 100644
--- a/src/train/model.py
+++ b/src/train/model.py
@@ -3,9 +3,7 @@
import torch
import torch.nn as nn
-import torch
-from torch.autograd import Variable
-import copy
+
class Seq2Seq(nn.Module):
"""
Build Seqence-to-Sequence.
diff --git a/src/train/run.py b/src/train/run.py
index 5961ad1..be98fec 100644
--- a/src/train/run.py
+++ b/src/train/run.py
@@ -22,7 +22,6 @@ using a masked language modeling (MLM) loss.
from __future__ import absolute_import
import os
import sys
-import bleu
import pickle
import torch
import json
@@ -35,11 +34,14 @@ from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
-from customized_roberta import RobertaModel
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
RobertaConfig, RobertaTokenizer)
+
+import train.bleu as bleu
+from train.customized_roberta import RobertaModel
+
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
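This fixture is itself a two-file patch (src/train/model.py and src/train/run.py), so create_examples should yield two Example objects from it: one for the import cleanup in model.py and one for the bleu/RobertaModel import moves in run.py. A minimal sketch for verifying the parse, assuming the fixture is saved at src/test.source relative to the working directory:

import whatthepatch

with open("src/test.source") as f:
    patches = list(whatthepatch.parse_patch(f.read()))

print(len(patches))  # expected: 2, one per changed file
for patch in patches:
    # Count raw added/deleted lines per file before any tokenization.
    added = sum(1 for c in patch.changes if c.old is None)
    deleted = sum(1 for c in patch.changes if c.new is None)
    print(patch.header.new_path, added, deleted)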