Added Utils

김민수
Commit 7c2fd3c187057a7945fb049e6bd88c6131d41dd7 7c2fd3c1 1 parent c1ef9d98
Showing 4 changed files with 112 additions and 3 deletions
code/preparation/sys_sampling.py
code/util_test.py
code/utils/data_loader.py
readme.md
--- a/code/preparation/sys_sampling.py 0 → 100644
View file @7c2fd3c
+++ b/code/preparation/sys_sampling.py 0 → 100644
View file @7c2fd3c
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from tqdm import tqdm
+
+# Systematic sampling: inside every run of consecutive rows sharing a topic,
+# each window of 100 rows gives one 'valid' row (offset 49), one 'test' row
+# (offset 99) and 'train' for the remaining offsets.
+categories = ['NWRW19','NPRW19','NLRW19','NIRW19']
+split_for_offset = {49: 'valid', 99: 'test'}
+for category in categories:
+    table = pq.read_table(f'categorized_parquet/category={category}')
+    labels = []
+    previous_topic = ''
+    position = 0
+    for topic in tqdm(table['topic']):
+        # Restart the counter whenever the topic differs from the previous row's.
+        if topic != previous_topic:
+            previous_topic = topic
+            position = 0
+        labels.append(split_for_offset.get(position % 100, 'train'))
+        position += 1
+    labelled = table.append_column('label', pa.array(labels))
+    # Written partitioned by topic and label under the 'dataset' directory.
+    pq.write_to_dataset(
+        labelled,
+        root_path='dataset',
+        partition_cols=['topic', 'label'],
+        coerce_timestamps='us',
+    )
\ No newline at end of file
--- a/code/util_test.py 0 → 100644
View file @7c2fd3c
+++ b/code/util_test.py 0 → 100644
View file @7c2fd3c
+import os
+# The loader module lives in the `utils` package (code/utils/data_loader.py).
+from utils.data_loader import ArticleDataset, ToTensor
+from gluonnlp.vocab import BERTVocab
+from gluonnlp.data import SentencepieceTokenizer
+from kogpt2.utils import get_tokenizer
+
+# Manual smoke test: prints the first sentence of the first 101 samples,
+# its tokenization before and after stripping non-alphanumeric characters,
+# and the tensor produced by ToTensor.
+tokenizer = SentencepieceTokenizer(get_tokenizer(),  num_best=0, alpha=0)
+vocab_file = os.path.join(os.path.expanduser('/code/model'), 'kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
+vocab=BERTVocab.from_sentencepiece(vocab_file,
+                                                         mask_token=None,
+                                                         sep_token=None,
+                                                         cls_token=None,
+                                                         unknown_token='<unk>',
+                                                         padding_token='<pad>',
+                                                         bos_token='<s>',
+                                                         eos_token='</s>')
+dataset=ArticleDataset('/dataset')
+# The transform does not depend on the sample, so build it once.
+to_tensor=ToTensor(tokenizer, vocab)
+for i, (data, topic) in enumerate(dataset):
+    print(i, topic)
+    # First sentence of the paragraph.
+    title=data[0].as_py()
+    print('origin:',title)
+    
+    print('tokenized origin:',tokenizer(title))
+    # Replace every non-alphanumeric character with a space.
+    filtered=''.join(c if c.isalnum() else ' ' for c in title)
+    print('filtered:',filtered)
+    print('tokenized filtered:',tokenizer(filtered))
+    print('Transformed:',  to_tensor((data,topic)))
+    if i>=100:
+        break
--- a/code/utils/data_loader.py 0 → 100644
View file @7c2fd3c
+++ b/code/utils/data_loader.py 0 → 100644
View file @7c2fd3c
+import os
+import pyarrow as pa
+import pyarrow.parquet as pq
+import torch
+from torch.utils.data import Dataset
+from kogpt2.utils import get_tokenizer
+
+class ArticleDataset(Dataset):
+    """
+    Dataset for training on news articles.
+
+    Reads the ``paragraph`` column of the selected ``topic=<topic>/label=<label>``
+    parquet directories and yields ``(paragraph, topic)`` pairs of pyarrow values.
+    """
+    # Topics loaded when the caller does not restrict the dataset.
+    ALL_TOPICS = ('경제', '문화', '미용_건강', '사회', '생활', '스포츠', '연예', '정치', 'IT_과학')
+
+    def __init__(self, dataset_path:str, topics:list=None, label:str='train'):
+        """
+        Initializer
+        :param dataset_path: path of parquet dataset (``~`` is expanded)
+        :param topics: if not None, only use specified topics; must be sublist of [경제, 문화, 미용_건강, 사회, 생활, 스포츠, 연예, 정치, IT_과학]. None (default) uses all of them
+        :param label: specify type of dataset; must be one of [train, test, valid] (default is train)
+        """
+        # None stands for "every topic"; this also avoids a mutable default argument.
+        if topics is None:
+            topics = self.ALL_TOPICS
+        expanded_dataset_path = os.path.expanduser(dataset_path)
+        tables=[]
+        for topic in topics:
+            partition_path = os.path.join(expanded_dataset_path, f'topic={topic}', f'label={label}')
+            table=pq.read_table(partition_path, columns=['paragraph'])
+            # Only the paragraph column is read, so attach the topic as a constant column.
+            tables.append(table.append_column('topic',pa.array([topic]*len(table))))
+        self.data=pa.concat_tables(tables)
+
+    def __len__(self):
+        """Return the number of rows across all loaded partitions."""
+        return len(self.data)
+
+    def __getitem__(self,index):
+        """Return the ``(paragraph, topic)`` pair stored at ``index``."""
+        return self.data['paragraph'][index], self.data['topic'][index]
+
+class ToTensor(object):
+    """
+    Convert an Article dataset sample to a tensor of token ids using tokenizer
+    """
+    def __init__(self, tokenizer, vocab):
+        """
+        :param tokenizer: callable turning a string into a list of tokens
+        :param vocab: maps a token (or a list of tokens) to id(s) via indexing; must expose bos_token and eos_token
+        """
+        self.tokenizer=tokenizer
+        self.vocab=vocab
+    
+    def __call__(self, sample):
+        """
+        :param sample: (paragraph, topic) pair; paragraph is an iterable of sentences, and both the sentences and the topic provide as_py()
+        :return: 1-D torch.long tensor in which every sentence is wrapped in bos/eos ids and the first sentence is prefixed with the topic tokens
+        """
+        bos_id=self.vocab[self.vocab.bos_token]
+        eos_id=self.vocab[self.vocab.eos_token]
+        tokens=[]
+        for i, sentence in enumerate(sample[0]):
+            # Only the first sentence carries the topic as a prefix.
+            prefix=self.tokenizer(sample[1].as_py()) if i==0 else []
+            pieces=prefix+self.tokenizer(sentence.as_py())
+            tokens+=[bos_id]+self.vocab[pieces]+[eos_id]
+        # Token ids are indices, so return an integer tensor rather than float32.
+        return torch.tensor(tokens, dtype=torch.long)
\ No newline at end of file
--- a/readme.md
View file @7c2fd3c
+++ b/readme.md
View file @7c2fd3c
@@ -3,10 +3,17 @@
 목표: 10년 간(2009~2018)의 신문기사를 분석하고, KoGPT2 모델을 학습시켜 신문기사를 생성합니다.
 ## report
 보고서가 들어가는 디렉토리입니다.
+추후 파일명을 정돈할 계획입니다.
 ## code
-python으로 작성된 말뭉치 분석도구입니다.
+학습을 위한 python 코드들입니다.
-전처리, 모델링, 분석, 시각화 순으로 만들 예정입니다.  
+하위 디렉토리로 데이터 전처리(preparation), 편의성 도구(utils), 모델 및 캐시(model)가 있습니다.  
+모두의 말뭉치 이용 약관상 전처리된 데이터는 제공하지 않습니다.
 ## reference
-추가 예정
+[Github KoGPT2](https://github.com/SKT-AI/KoGPT2 "SKT-AI/KoGPT2")\
+[Github KoGPT2 가사 생성](https://github.com/gyunggyung/KoGPT2-FineTuning "gyunggyung/KoGPT2-FineTuning")\
+[Github KoGPT2 챗봇](https://github.com/haven-jeon/KoGPT2-chatbot "haven-jeon/KoGPT2-chatbot")\
+[Github KoGPT2 이야기 생성](https://github.com/shbictai/narrativeKoGPT2 "shbictai/narrativeKoGPT2")
+
+_추가 예정_