TextRank를 이용하여 메인 토픽(최대 Top 30) 추출

박은주
Commit fc372ad628e1738d9d78e77b0367775a099b6f06 fc372ad6 1 parent b17f3f76
Showing 2 changed files with 40 additions and 9 deletions
.gitignore
GetTopic.py
--- a/.gitignore
View file @fc372ad
+++ b/.gitignore
View file @fc372ad
@@ -14,3 +14,7 @@ chromedriver.exe
 *.iml
 *.csv
 *.xml
+ /textrank/
+ /textrank.egg-info/
+ /build/
+ /dist/
\ No newline at end of file
--- a/GetTopic.py
View file @fc372ad
+++ b/GetTopic.py
View file @fc372ad
 import os
 import csv
+ import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+ from konlpy.tag import Komoran
+ 
+ from textrank import KeywordSummarizer
 
 okt = Okt()
- def DocToNouns(docs):
-     return [{
-         'id': i,
-         'nouns': ' '.join(okt.nouns(doc)),
-     } for i, doc in enumerate(docs)]
+ def Okt_tokenizer(sent):  # tokenizer callback candidate for KeywordSummarizer(tokenize=...); sent is passed as-is to okt.nouns
+     words = okt.nouns(sent)  # noun extraction with the module-level Okt tagger
+     # words = okt.pos(sent, join=True, stem=True)
+     # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+     return words
+ 
+ komoran = Komoran()  # module-level Komoran tagger shared by komoran_tokenizer
+ def komoran_tokenizer(sent):  # tokenizer callback handed to KeywordSummarizer(tokenize=...); sent is passed as-is to komoran.pos
+     # words = []
+     # for sentence in sent:
+     #     words += komoran.pos(sentence, join=True)
+     #     print("check : ", komoran.pos(sentence, join=True))
+     # words = [komoran.pos(sentence, join=True) for sentence in sent]
+     words = komoran.pos(sent, join=True)  # join=True returns "surface/TAG" strings (per KoNLPy docs); the filter below depends on that form
+     words = [w for w in words if('/NNG' in w or '/NNP' in w)]  # substring test keeps tokens tagged NNG or NNP (Komoran's common/proper noun tags)
+     return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 posts = []
- with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+ with open(os.path.join(BASE_DIR, 'data.csv'), 'r', encoding='utf-8') as db:  # data.csv is read from the directory containing this script
     reader = csv.reader(db)
     for data in reader:
-         posts.append(data)
+         data[0] = re.sub(pattern=r'[^\w\s]', repl='', string=data[0]).replace('\n', ' ')  # drop punctuation/symbols; newline -> space so words on adjacent lines do not fuse
+         data[1] = re.sub(pattern=r'[^\w\s]', repl='', string=data[1]).replace('\n', ' ')  # NOTE(review): a row with fewer than two columns raises IndexError here
+         posts.append(data[0] + ' ' + data[1])  # one string per row; the space keeps the last word of column 0 apart from the first word of column 1
+ 
+ # tfidf_vectorizer = TfidfVectorizer()
+ # title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+ # content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+ keyword_extractor = KeywordSummarizer(  # keyword extractor imported from the textrank package
+     # tokenize=Okt_tokenizer,
+     tokenize=komoran_tokenizer,  # active tokenizer: Komoran "surface/TAG" tokens filtered to NNG/NNP
+     window = -1,  # NOTE(review): presumably "no fixed window, co-occurrence over the whole sentence" - confirm against the textrank package
+     verbose= False
+ )
 
- nouns = DocToNouns(posts)
- print(nouns)
\ No newline at end of file
+ keywords = keyword_extractor.summarize(posts, topk=30)  # ask for at most the top 30 keywords; posts holds one cleaned string per CSV row
+ print(keywords)
\ No newline at end of file