박은주

Extract the main topics (up to Top 30) using TextRank
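For reference, TextRank scores keywords by running PageRank over a word co-occurrence graph built from the tokenized documents; the textrank package imported in the change below wraps this behind KeywordSummarizer, and window=-1 is expected to count co-occurrence across a whole sentence rather than a fixed-size window. The following is a minimal sketch of the idea only, not the package's actual internals; the damping factor, iteration count, and the example call at the end are illustrative assumptions.

# Minimal sketch of the TextRank idea behind this change: build a word
# co-occurrence graph from tokenized sentences and rank the nodes with PageRank.
# Illustration only; not the internals of the textrank package used below, and
# the damping factor / iteration count are assumed values.
from collections import defaultdict
from itertools import combinations

def textrank_keywords(tokenized_sents, topk=30, damping=0.85, iters=30):
    # Link every pair of distinct words that appear in the same sentence.
    graph = defaultdict(set)
    for words in tokenized_sents:
        for w1, w2 in combinations(set(words), 2):
            graph[w1].add(w2)
            graph[w2].add(w1)

    # Power iteration of the TextRank/PageRank score update.
    rank = {w: 1.0 for w in graph}
    for _ in range(iters):
        rank = {
            w: (1 - damping) + damping * sum(rank[v] / len(graph[v]) for v in graph[w])
            for w in graph
        }

    return sorted(rank.items(), key=lambda r: r[1], reverse=True)[:topk]

# Hypothetical usage with the tokenizer defined in the diff below:
# textrank_keywords([komoran_tokenizer(post) for post in posts], topk=30)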

@@ -14,3 +14,7 @@ chromedriver.exe
 *.iml
 *.csv
 *.xml
+/textrank/
+/textrank.egg-info/
+/build/
+/dist/
\ No newline at end of file
@@ -1,25 +1,52 @@
 import os
 import csv
+import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+from konlpy.tag import Komoran
+
+from textrank import KeywordSummarizer
 
 okt = Okt()
-def DocToNouns(docs):
-    return [{
-        'id': i,
-        'nouns': ' '.join(okt.nouns(doc)),
-    } for i, doc in enumerate(docs)]
+def Okt_tokenizer(sent):
+    words = okt.nouns(sent)
+    # words = okt.pos(sent, join=True, stem=True)
+    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+    return words
+
+komoran = Komoran()
+def komoran_tokenizer(sent):
+    # words = []
+    # for sentence in sent:
+    #     words += komoran.pos(sentence, join=True)
+    #     print("check : ", komoran.pos(sentence, join=True))
+    # words = [komoran.pos(sentence, join=True) for sentence in sent]
+    words = komoran.pos(sent, join=True)
+    words = [w for w in words if('/NNG' in w or '/NNP' in w)]
+    return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
     reader = csv.reader(db)
     for data in reader:
-        posts.append(data)
+        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+        posts.append(data[0] + data[1])
+
+# tfidf_vectorizer = TfidfVectorizer()
+# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+keyword_extractor = KeywordSummarizer(
+    # tokenize=Okt_tokenizer,
+    tokenize=komoran_tokenizer,
+    window = -1,
+    verbose= False
+)
 
-nouns = DocToNouns(posts)
-print(nouns)
\ No newline at end of file
+keywords = keyword_extractor.summarize(posts, topk=30)
+print(keywords)
\ No newline at end of file
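Assuming the textrank dependency is the lovit-style implementation matched by the KeywordSummarizer(tokenize=..., window=..., verbose=...) call above, summarize(posts, topk=30) should return up to 30 (word, rank) pairs, with each word still carrying its Komoran POS tag (e.g. '키워드/NNG') because komoran_tokenizer joins tags into the tokens. A hedged sketch for printing the result more readably than the raw list:

# Assumes `keywords` is a list of (word, rank) tuples as returned above.
for word, rank in keywords:
    print(f"{word.split('/')[0]}\t{rank:.4f}")  # drop the POS tag, keep the score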