Showing
4 changed files
with
284 additions
and
0 deletions
KoBERT/kobert/__init__.py
0 → 100644
1 | +# coding=utf-8 | ||
2 | +# Copyright 2019 SK T-Brain Authors. | ||
3 | +# | ||
4 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | +# you may not use this file except in compliance with the License. | ||
6 | +# You may obtain a copy of the License at | ||
7 | +# | ||
8 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | +# | ||
10 | +# Unless required by applicable law or agreed to in writing, software | ||
11 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | +# See the License for the specific language governing permissions and | ||
14 | +# limitations under the License. | ||
# Package version of the kobert distribution.
__version__ = '0.1.1'
... | \ No newline at end of file | ... | \ No newline at end of file |
KoBERT/kobert/mxnet_kobert.py
0 → 100644
1 | +# coding=utf-8 | ||
2 | +# Copyright 2019 SK T-Brain Authors. | ||
3 | +# | ||
4 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | +# you may not use this file except in compliance with the License. | ||
6 | +# You may obtain a copy of the License at | ||
7 | +# | ||
8 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | +# | ||
10 | +# Unless required by applicable law or agreed to in writing, software | ||
11 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | +# See the License for the specific language governing permissions and | ||
14 | +# limitations under the License. | ||
15 | + | ||
16 | +import os | ||
17 | +import sys | ||
18 | +import requests | ||
19 | +import hashlib | ||
20 | + | ||
21 | +import mxnet as mx | ||
22 | +import gluonnlp as nlp | ||
23 | +from gluonnlp.model import BERTModel, BERTEncoder | ||
24 | + | ||
25 | +from .utils import download as _download | ||
26 | +from .utils import tokenizer | ||
27 | + | ||
# Location and integrity metadata for the pretrained KoBERT MXNet weights,
# consumed by get_mxnet_kobert_model() via _download().
mxnet_kobert = {
    'url':
    'https://kobert.blob.core.windows.net/models/kobert/mxnet/mxnet_kobert_45b6957552.params',
    'fname': 'mxnet_kobert_45b6957552.params',  # filename used in the local cache dir
    'chksum': '45b6957552'  # first 10 hex chars of the file's MD5 digest
}
34 | + | ||
35 | + | ||
def get_mxnet_kobert_model(use_pooler=True,
                           use_decoder=True,
                           use_classifier=True,
                           ctx=mx.cpu(0),
                           cachedir='~/kobert/'):
    """Download (or reuse cached) KoBERT MXNet weights and vocab, then build the model.

    Args:
        use_pooler, use_decoder, use_classifier: forwarded to BERTModel construction.
        ctx: MXNet context the parameters are loaded onto.
        cachedir: directory where downloaded artifacts are cached.

    Returns:
        (BERTModel, BERTVocab) tuple from get_kobert_model().
    """
    def _fetch(info):
        # Resolve one artifact (weights or vocab) to a local file path.
        return _download(info['url'], info['fname'], info['chksum'],
                         cachedir=cachedir)

    model_path = _fetch(mxnet_kobert)
    vocab_path = _fetch(tokenizer)
    return get_kobert_model(model_path, vocab_path, use_pooler, use_decoder,
                            use_classifier, ctx)
55 | + | ||
56 | + | ||
def get_kobert_model(model_file,
                     vocab_file,
                     use_pooler=True,
                     use_decoder=True,
                     use_classifier=True,
                     ctx=mx.cpu(0)):
    """Construct a gluonnlp BERTModel with KoBERT's architecture and load weights.

    Args:
        model_file: path to the MXNet parameter file.
        vocab_file: path to the SentencePiece vocab file.
        use_pooler, use_decoder, use_classifier: which BERT heads to attach.
        ctx: MXNet context for initialization and parameter loading.

    Returns:
        (BERTModel, BERTVocab) tuple.
    """
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                   padding_token='[PAD]')

    # KoBERT uses the BERT-base geometry: 12 layers, 12 heads, 768 units,
    # 3072-wide FFN, 512 max positions.
    encoder = BERTEncoder(
        attention_cell='multi_head',
        num_layers=12,
        units=768,
        hidden_size=3072,
        max_length=512,
        num_heads=12,
        scaled=True,
        dropout=0.1,
        output_attention=False,
        output_all_encodings=False,
        use_residual=True,
    )

    model = BERTModel(
        encoder,
        len(vocab.idx_to_token),
        token_type_vocab_size=2,
        units=768,
        embed_size=768,
        embed_dropout=0.1,
        word_embed=None,
        use_pooler=use_pooler,
        use_decoder=use_decoder,
        use_classifier=use_classifier,
    )
    model.initialize(ctx=ctx)
    # ignore_extra lets the full checkpoint load even when some heads
    # (decoder/classifier) were not attached above.
    model.load_parameters(model_file, ctx, ignore_extra=True)
    return (model, vocab)
KoBERT/kobert/pytorch_kobert.py
0 → 100644
1 | +# coding=utf-8 | ||
2 | +# Copyright 2019 SK T-Brain Authors. | ||
3 | +# | ||
4 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | +# you may not use this file except in compliance with the License. | ||
6 | +# You may obtain a copy of the License at | ||
7 | +# | ||
8 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | +# | ||
10 | +# Unless required by applicable law or agreed to in writing, software | ||
11 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | +# See the License for the specific language governing permissions and | ||
14 | +# limitations under the License. | ||
15 | + | ||
16 | +import os | ||
17 | +import sys | ||
18 | +import requests | ||
19 | +import hashlib | ||
20 | + | ||
21 | +import torch | ||
22 | + | ||
23 | +from transformers import BertModel, BertConfig | ||
24 | +import gluonnlp as nlp | ||
25 | + | ||
26 | +from .utils import download as _download | ||
27 | +from .utils import tokenizer | ||
28 | + | ||
# Location and integrity metadata for the pretrained KoBERT PyTorch weights,
# consumed by get_pytorch_kobert_model() via _download().
pytorch_kobert = {
    'url':
    'https://kobert.blob.core.windows.net/models/kobert/pytorch/pytorch_kobert_2439f391a6.params',
    'fname': 'pytorch_kobert_2439f391a6.params',  # filename used in the local cache dir
    'chksum': '2439f391a6'  # first 10 hex chars of the file's MD5 digest
}
35 | + | ||
# Hyperparameters for KoBERT's BERT architecture; fed to
# BertConfig.from_dict() in get_kobert_model(). BERT-base geometry with a
# Korean SentencePiece vocabulary of 8002 tokens.
bert_config = {
    'attention_probs_dropout_prob': 0.1,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}
49 | + | ||
50 | + | ||
def get_pytorch_kobert_model(ctx='cpu', cachedir='~/kobert/'):
    """Download (or reuse cached) KoBERT PyTorch weights and vocab, then build the model.

    Args:
        ctx: torch device string, e.g. 'cpu' or 'cuda:0'.
        cachedir: directory where downloaded artifacts are cached.

    Returns:
        (BertModel, BERTVocab) tuple from get_kobert_model().
    """
    weights_path = _download(pytorch_kobert['url'],
                             pytorch_kobert['fname'],
                             pytorch_kobert['chksum'],
                             cachedir=cachedir)
    spiece_path = _download(tokenizer['url'],
                            tokenizer['fname'],
                            tokenizer['chksum'],
                            cachedir=cachedir)
    return get_kobert_model(weights_path, spiece_path, ctx)
65 | + | ||
66 | + | ||
def get_kobert_model(model_file, vocab_file, ctx="cpu"):
    """Build a KoBERT ``BertModel`` from local weight and vocab files.

    Args:
        model_file: path to the PyTorch state-dict (.params) file.
        vocab_file: path to the SentencePiece vocab file.
        ctx: torch device string, e.g. "cpu" or "cuda:0".

    Returns:
        (bertmodel, vocab_b_obj) tuple; the model is moved to *ctx* and set
        to eval mode.
    """
    device = torch.device(ctx)
    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    # map_location keeps a checkpoint saved from GPU loadable on CPU-only
    # hosts (plain torch.load would try to deserialize onto the saving
    # device and fail).
    bertmodel.load_state_dict(torch.load(model_file, map_location=device))
    bertmodel.to(device)
    bertmodel.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         padding_token='[PAD]')
    return bertmodel, vocab_b_obj
KoBERT/kobert/utils.py
0 → 100644
1 | +# coding=utf-8 | ||
2 | +# Copyright 2019 SK T-Brain Authors. | ||
3 | +# | ||
4 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
5 | +# you may not use this file except in compliance with the License. | ||
6 | +# You may obtain a copy of the License at | ||
7 | +# | ||
8 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
9 | +# | ||
10 | +# Unless required by applicable law or agreed to in writing, software | ||
11 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
13 | +# See the License for the specific language governing permissions and | ||
14 | +# limitations under the License. | ||
15 | + | ||
16 | +import os | ||
17 | +import sys | ||
18 | +import requests | ||
19 | +import hashlib | ||
20 | + | ||
# Location and integrity metadata for the KoBERT ONNX export,
# consumed by get_onnx() via download().
onnx_kobert = {
    'url':
    'https://kobert.blob.core.windows.net/models/kobert/onnx/onnx_kobert_44529811f0.onnx',
    'fname': 'onnx_kobert_44529811f0.onnx',  # filename used in the local cache dir
    'chksum': '44529811f0'  # first 10 hex chars of the file's MD5 digest
}
27 | + | ||
# Location and integrity metadata for the KoBERT SentencePiece tokenizer
# model, consumed by get_tokenizer() and the model loaders via download().
# NOTE(review): the hash embedded in 'fname' (1087f8699e) does not match the
# one in 'url'/'chksum' (ae5711deb3) — the integrity check uses 'chksum'
# against the downloaded content, so this still works, but the cached
# filename looks stale; confirm whether 'fname' should be updated upstream.
tokenizer = {
    'url':
    'https://kobert.blob.core.windows.net/models/kobert/tokenizer/kobert_news_wiki_ko_cased-ae5711deb3.spiece',
    'fname': 'kobert_news_wiki_ko_cased-1087f8699e.spiece',
    'chksum': 'ae5711deb3'  # first 10 hex chars of the file's MD5 digest
}
34 | + | ||
35 | + | ||
def _md5_prefix(file_path):
    """Return the first 10 hex digits of the file's MD5 digest."""
    # with-block closes the handle (the original leaked it via open().read()).
    with open(file_path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()[:10]


def download(url, filename, chksum, cachedir='~/kobert/'):
    """Download *url* into *cachedir*/*filename*, verifying an MD5 prefix.

    A previously downloaded file whose MD5 prefix matches *chksum* is reused
    without touching the network. A textual progress bar is printed while
    downloading when the server reports a content length.

    Args:
        url: source URL of the artifact.
        filename: name of the file inside the cache directory.
        chksum: expected first 10 hex chars of the file's MD5 digest.
        cachedir: cache directory; '~' is expanded.

    Returns:
        Absolute path of the verified local file.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        IOError: if the downloaded content fails the checksum.
    """
    f_cachedir = os.path.expanduser(cachedir)
    os.makedirs(f_cachedir, exist_ok=True)
    file_path = os.path.join(f_cachedir, filename)
    # Fast path: reuse the cached copy when its checksum prefix matches.
    if os.path.isfile(file_path) and _md5_prefix(file_path) == chksum:
        print('using cached model')
        return file_path
    # Issue the request BEFORE truncating the cache file, so a failed
    # request does not destroy an existing (stale but usable) copy, and
    # fail fast instead of caching an HTTP error page as the model.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = response.headers.get('content-length')
    with open(file_path, 'wb') as f:
        if total is None:
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)
            # Chunk so the bar advances ~1000 times, but never below 1 MiB.
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                f.write(data)
                done = int(50 * downloaded / total)
                sys.stdout.write('\r[{}{}]'.format('█' * done,
                                                   '.' * (50 - done)))
                sys.stdout.flush()
            sys.stdout.write('\n')
    # Explicit raise instead of assert: asserts vanish under `python -O`,
    # which would silently skip the integrity check.
    if _md5_prefix(file_path) != chksum:
        raise IOError('corrupted file!')
    return file_path
66 | + | ||
67 | + | ||
def get_onnx(cachedir='~/kobert/'):
    """Get KoBERT ONNX file path after downloading.

    Downloads the ONNX export into *cachedir* (or reuses a cached copy)
    and returns the local file path.
    """
    url, fname, chksum = (onnx_kobert[key]
                          for key in ('url', 'fname', 'chksum'))
    return download(url, fname, chksum, cachedir=cachedir)
76 | + | ||
77 | + | ||
def get_tokenizer(cachedir='~/kobert/'):
    """Get KoBERT Tokenizer file path after downloading.

    Downloads the SentencePiece tokenizer model into *cachedir* (or reuses
    a cached copy) and returns the local file path.
    """
    url, fname, chksum = (tokenizer[key]
                          for key in ('url', 'fname', 'chksum'))
    return download(url, fname, chksum, cachedir=cachedir)
-
Please register or login to post a comment