util_test.py
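"""Smoke test for util.data_loader: iterate over articles, print raw and
alphanumeric-filtered titles with their tokenizations, and check ToTensor."""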
import os
from util.data_loader import ArticleDataset, ToTensor
from gluonnlp.vocab import BERTVocab
from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer
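# Load KoGPT2's SentencePiece tokenizer; num_best=0 and alpha=0 disable
# subword sampling, so segmentation is deterministic.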
tokenizer = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
vocab_file = os.path.join(os.path.expanduser('/code/model'), 'kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
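# Build the vocabulary from the .spiece model. KoGPT2 only defines
# <unk>/<pad>/<s>/</s>, so the BERT-specific mask/sep/cls tokens are disabled.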
vocab = BERTVocab.from_sentencepiece(vocab_file,
                                     mask_token=None,
                                     sep_token=None,
                                     cls_token=None,
                                     unknown_token='<unk>',
                                     padding_token='<pad>',
                                     bos_token='<s>',
                                     eos_token='</s>')
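# Walk the dataset and compare tokenization of the raw title against an
# alphanumeric-only version of it.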
dataset = ArticleDataset('/dataset')
to_tensor = ToTensor(tokenizer, vocab)  # build the transform once, outside the loop
for i, (data, topic) in enumerate(dataset):
    print(i, topic)
    title = data[0].as_py()  # first field is the title; .as_py() converts the Arrow scalar to a Python str
    print('origin:', title)
    print('tokenized origin:', tokenizer(title))
    # Replace every non-alphanumeric character with a space, then re-tokenize.
    filtered = ''.join(c if c.isalnum() else ' ' for c in title)
    print('filtered:', filtered)
    print('tokenized filtered:', tokenizer(filtered))
    print('Transformed:', to_tensor((data, topic)))
    if i >= 100:  # stop after the first ~100 articles
        break