김민수

Added Utils

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

# Newspaper corpus categories stored as a category-partitioned parquet dataset.
categories = ['NWRW19', 'NPRW19', 'NLRW19', 'NIRW19']
for category in categories:
    table = pq.read_table(f'categorized_parquet/category={category}')
    # Assign a split label to every row: within each run of rows sharing the same
    # topic, out of every 100 rows 98 go to train, 1 to valid, and 1 to test.
    labels = []
    index = 0
    last_topic = ''
    for topic in tqdm(table['topic']):
        if topic != last_topic:
            index = 0
            last_topic = topic
        mod = index % 100
        if mod == 49:
            labels.append('valid')
        elif mod == 99:
            labels.append('test')
        else:
            labels.append('train')
        index += 1
    # Write the labelled table back out, partitioned by topic and split label.
    pq.write_to_dataset(table.append_column('label', pa.array(labels)),
                        root_path='dataset',
                        partition_cols=['topic', 'label'],
                        coerce_timestamps='us')
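The modulo rule above sends roughly 98% of each topic's rows to train and 1% each to valid and test. A quick sanity check, sketched under the assumption that the `dataset` directory written above exists and that pyarrow exposes the `label` partition column on read:

```python
import pyarrow.parquet as pq

# Read back only the partition-derived 'label' column and check the split proportions.
labels = pq.read_table('dataset', columns=['label']).to_pandas()['label'].value_counts()
print(labels / labels.sum())  # expected: train ~0.98, valid ~0.01, test ~0.01
```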
import os

from gluonnlp.data import SentencepieceTokenizer
from gluonnlp.vocab import BERTVocab
from kogpt2.utils import get_tokenizer

from util.data_loader import ArticleDataset, ToTensor

# Load the KoGPT2 SentencePiece tokenizer and its vocabulary.
tokenizer = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
vocab_file = os.path.join(os.path.expanduser('/code/model'), 'kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
vocab = BERTVocab.from_sentencepiece(vocab_file,
                                     mask_token=None,
                                     sep_token=None,
                                     cls_token=None,
                                     unknown_token='<unk>',
                                     padding_token='<pad>',
                                     bos_token='<s>',
                                     eos_token='</s>')

# Inspect how the first articles of the dataset are tokenized and transformed.
dataset = ArticleDataset('/dataset')
for i, (data, topic) in enumerate(dataset):
    print(i, topic)
    title = data[0].as_py()
    print('origin:', title)
    print('tokenized origin:', tokenizer(title))
    # Replace every non-alphanumeric character with a space before tokenizing again.
    filtered = ''.join(c if c.isalnum() else ' ' for c in title)
    print('filtered:', filtered)
    print('tokenized filtered:', tokenizer(filtered))
    print('Transformed:', ToTensor(tokenizer, vocab)((data, topic)))
    if i >= 100:
        break
import os

import pyarrow as pa
import pyarrow.parquet as pq
import torch
from torch.utils.data import Dataset

from kogpt2.utils import get_tokenizer


class ArticleDataset(Dataset):
    """
    Dataset for training on news articles.
    """
    def __init__(self, dataset_path: str,
                 topics: list = ['경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학'],
                 label: str = 'train'):
        """
        Initializer
        :param dataset_path: path of the parquet dataset
        :param topics: topics to load; must be a sublist of [경제, 문화, 미용_건강, 사회, 생활, 스포츠, 연예, 정치, IT_과학]
        :param label: which split to load; must be one of [train, test, valid] (default: train)
        """
        expanded_dataset_path = os.path.expanduser(dataset_path)
        tables = []
        for topic in topics:
            # Each topic/label pair is a separate partition of the parquet dataset.
            table = pq.read_table(f'{expanded_dataset_path}/topic={topic}/label={label}', columns=['paragraph'])
            tables.append(table.append_column('topic', pa.array([topic] * len(table))))
        self.data = pa.concat_tables(tables)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data['paragraph'][index], self.data['topic'][index]


class ToTensor(object):
    """
    Convert a paragraph from the article dataset into a Tensor using the tokenizer.
    """
    def __init__(self, tokenizer, vocab):
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __call__(self, sample):
        tokens = []
        for i, sentence in enumerate(sample[0]):
            if i == 0:
                # The first sentence is prefixed with the topic: <s> topic sentence </s>
                tokens += ([self.vocab[self.vocab.bos_token]]
                           + self.vocab[self.tokenizer(sample[1].as_py()) + self.tokenizer(sentence.as_py())]
                           + [self.vocab[self.vocab.eos_token]])
            else:
                tokens += ([self.vocab[self.vocab.bos_token]]
                           + self.vocab[self.tokenizer(sentence.as_py())]
                           + [self.vocab[self.vocab.eos_token]])
        # Token ids need to be an integer tensor so they can index an embedding layer.
        return torch.tensor(tokens, dtype=torch.long)
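A minimal usage sketch for batching this dataset, assuming `tokenizer` and `vocab` are built as in the script above; the padding collate function shown here is hypothetical and not part of this commit:

```python
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

transform = ToTensor(tokenizer, vocab)

def collate(batch):
    # Transform each (paragraph, topic) sample, then pad the variable-length
    # token sequences with the <pad> id so they can be stacked into one batch.
    seqs = [transform(sample) for sample in batch]
    return pad_sequence(seqs, batch_first=True, padding_value=vocab[vocab.padding_token])

loader = DataLoader(ArticleDataset('/dataset', label='train'), batch_size=4, collate_fn=collate)
for batch in loader:
    print(batch.shape)  # (batch_size, longest_sequence_in_batch)
    break
```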
Goal: analyze ten years (2009~2018) of newspaper articles and train the KoGPT2 model to generate news articles.
## report
Directory for the reports.
The file names will be tidied up later.
## code
Python code for corpus analysis and training, to be built in the order of preprocessing, modeling, analysis, and visualization.
Its subdirectories hold the data preprocessing scripts (preparation), convenience utilities (utils), and the model and its cache (model).
The preprocessed data is not provided, in accordance with the Modu Corpus (모두의 말뭉치) terms of use.
## reference
[Github KoGPT2](https://github.com/SKT-AI/KoGPT2 "SKT-AI/KoGPT2")\
[Github KoGPT2 lyrics generation](https://github.com/gyunggyung/KoGPT2-FineTuning "gyunggyung/KoGPT2-FineTuning")\
[Github KoGPT2 chatbot](https://github.com/haven-jeon/KoGPT2-chatbot "haven-jeon/KoGPT2-chatbot")\
[Github KoGPT2 story generation](https://github.com/shbictai/narrativeKoGPT2 "shbictai/narrativeKoGPT2")
_More to be added_