# util_test.py
import os
from util.data_loader import ArticleDataset, ToTensor
from gluonnlp.vocab import BERTVocab
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer

# Deterministic SentencePiece tokenization (num_best=0, alpha=0 disables
# subword-regularization sampling). get_tokenizer() returns the path to the
# released KoGPT2 SentencePiece model.
tokenizer = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
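# Illustrative only: the tokenizer returns a list of subword pieces, e.g.
#   tokenizer('안녕하세요')  ->  might yield ['▁안녕', '하세요']
# (the actual pieces depend on the released KoGPT2 SentencePiece model).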
vocab_file = os.path.join(os.path.expanduser('/code/model'), 'kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
# Build the KoGPT2 vocabulary from the SentencePiece model. The BERT-style
# mask/sep/cls tokens are unused for GPT-2, so they are set to None.
vocab = BERTVocab.from_sentencepiece(vocab_file,
                                     mask_token=None,
                                     sep_token=None,
                                     cls_token=None,
                                     unknown_token='<unk>',
                                     padding_token='<pad>',
                                     bos_token='<s>',
                                     eos_token='</s>')
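# Illustrative only: a gluonnlp Vocab maps pieces to integer ids, e.g.
#   vocab[vocab.bos_token]      -> id of '<s>'
#   vocab[tokenizer('제목 예시')] -> list of token ids for the pieces
# (exact ids depend on the vocabulary file loaded above).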
dataset = ArticleDataset('/dataset')

# Inspect the first ~100 articles: print the raw title, its tokenization,
# a punctuation-stripped variant, and the fully transformed sample.
for i, (data, topic) in enumerate(dataset):
    print(i, topic)
    title = data[0].as_py()  # first field holds the title; .as_py() converts it to a Python str
    print('origin:', title)

    print('tokenized origin:', tokenizer(title))
    # Replace every non-alphanumeric character with a space before tokenizing.
    filtered = ''.join(c if c.isalnum() else ' ' for c in title)
    print('filtered:', filtered)
    print('tokenized filtered:', tokenizer(filtered))
    print('Transformed:', ToTensor(tokenizer, vocab)((data, topic)))
    if i >= 100:
        break