[Add] data preprocessing code

GyuhoLee
Commit 0c3fc8179d9937f4dae76620cf44a2e7c4b715e7 0c3fc817 1 parent a95a62fb
Showing 1 changed file with 46 additions and 0 deletions
src/pickle_to_csv.py
--- a/src/pickle_to_csv.py 0 → 100644
View file @0c3fc81
+++ b/src/pickle_to_csv.py 0 → 100644
View file @0c3fc81
+ import re, csv, pickle
+ from song import *
+ from PyKomoran import *
+ from textrank import KeywordSummarizer
+ 
def komoran_tokenize(sent):
    """Filter a whitespace-separated tagged sentence down to selected tokens.

    The sentence is split on whitespace and a token is kept when it
    contains any of the substrings ``/NNP``, ``/NNG`` or ``/SL``
    (presumably Komoran part-of-speech suffixes for proper nouns, common
    nouns and foreign-language words -- confirm against the tag set).

    Returns the kept tokens as a list, in their original order.
    """
    wanted_tags = ('/NNP', '/NNG', '/SL')
    kept = []
    for token in sent.split():
        if any(tag in token for tag in wanted_tags):
            kept.append(token)
    return kept
+ 
# Collect the records stored in 1112.pickle, 1314.pickle, 1516.pickle,
# 1718.pickle and 1920.pickle into one flat list.
# NOTE(review): pickle.load can execute arbitrary code embedded in the
# file, so these pickles must come from a trusted source.
data = []
for stem in range(1112, 2122, 202):
    pickle_path = str(stem) + '.pickle'
    with open(pickle_path, 'rb') as f:
        data.extend(pickle.load(f))
+ 
# Export every loaded song to data.csv: trim its title, derive up to five
# lyric keywords, write its CSV row and save its image.

# Build the tagger before touching data.csv so that a failure while
# constructing it cannot leave an existing file truncated.
komoran = Komoran('STABLE')

# The context manager guarantees data.csv is flushed and closed even when
# tokenizing, keyword extraction or saveImg() raises part-way through.
with open('data.csv', 'w', newline='', encoding='UTF-8') as f:
    wr = csv.writer(f)

    for song in data:
        # Clean the title: drop everything from the first '(' onwards.
        idx = song.title.find('(')
        if idx != -1:
            song.title = song.title[:idx]

        # Clean the lyrics and extract keywords from them.
        # NOTE(review): the title '거꾸로 걷는다' is explicitly excluded from
        # keyword extraction; the reason is not visible here.
        if song.lyrics != '' and song.title != '거꾸로 걷는다':
            # Tag each lyric line, skipping lines that tag to nothing.
            sents = []
            for text in song.lyrics.split('\n'):
                tokened_text = komoran.get_plain_text(text)
                if tokened_text != '':
                    sents.append(tokened_text)

            keyword_extractor = KeywordSummarizer(
                tokenize=komoran_tokenize,
                window=-1,
                verbose=False,
            )
            if sents:
                keywords = keyword_extractor.summarize(sents, topk=5)
                # Each entry's first item is a 'word/TAG' token; strip the
                # tag by cutting at the LAST '/', so a word that itself
                # contains '/' is kept whole.
                song.keywords = [
                    entry[0].rpartition('/')[0] for entry in keywords
                ]

        wr.writerow(song.getRow())
        song.saveImg()
\ No newline at end of file