박은주

Extract the main topics (up to Top 30) using TextRank
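For reference, TextRank scores keywords by running PageRank over a word co-occurrence graph built from the tokenized documents; the textrank package imported in the change below wraps this behind KeywordSummarizer, and window=-1 is expected to count co-occurrence across a whole sentence rather than a fixed-size window. The following is a minimal sketch of the idea only, not the package's actual internals; the damping factor, iteration count, and the example call at the end are illustrative assumptions.

# Minimal sketch of the TextRank idea behind this change: build a word
# co-occurrence graph from tokenized sentences and rank the nodes with PageRank.
# Illustration only; not the internals of the textrank package used below, and
# the damping factor / iteration count are assumed values.
from collections import defaultdict
from itertools import combinations

def textrank_keywords(tokenized_sents, topk=30, damping=0.85, iters=30):
    # Link every pair of distinct words that appear in the same sentence.
    graph = defaultdict(set)
    for words in tokenized_sents:
        for w1, w2 in combinations(set(words), 2):
            graph[w1].add(w2)
            graph[w2].add(w1)

    # Power iteration of the TextRank/PageRank score update.
    rank = {w: 1.0 for w in graph}
    for _ in range(iters):
        rank = {
            w: (1 - damping) + damping * sum(rank[v] / len(graph[v]) for v in graph[w])
            for w in graph
        }

    return sorted(rank.items(), key=lambda r: r[1], reverse=True)[:topk]

# Hypothetical usage with the tokenizer defined in the diff below:
# textrank_keywords([komoran_tokenizer(post) for post in posts], topk=30)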

@@ -14,3 +14,7 @@ chromedriver.exe
 *.iml
 *.csv
 *.xml
+/textrank/
+/textrank.egg-info/
+/build/
+/dist/
\ No newline at end of file
@@ -1,25 +1,52 @@
 import os
 import csv
+import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+from konlpy.tag import Komoran
+
+from textrank import KeywordSummarizer
 
 okt = Okt()
-def DocToNouns(docs):
-    return [{
-        'id': i,
-        'nouns': ' '.join(okt.nouns(doc)),
-    } for i, doc in enumerate(docs)]
+def Okt_tokenizer(sent):
+    words = okt.nouns(sent)
+    # words = okt.pos(sent, join=True, stem=True)
+    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+    return words
+
+komoran = Komoran()
+def komoran_tokenizer(sent):
+    # words = []
+    # for sentence in sent:
+    #     words += komoran.pos(sentence, join=True)
+    #     print("check : ", komoran.pos(sentence, join=True))
+    # words = [komoran.pos(sentence, join=True) for sentence in sent]
+    words = komoran.pos(sent, join=True)
+    words = [w for w in words if('/NNG' in w or '/NNP' in w)]
+    return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
     reader = csv.reader(db)
     for data in reader:
-        posts.append(data)
+        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+        posts.append(data[0] + data[1])
+
+# tfidf_vectorizer = TfidfVectorizer()
+# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+keyword_extractor = KeywordSummarizer(
+    # tokenize=Okt_tokenizer,
+    tokenize=komoran_tokenizer,
+    window = -1,
+    verbose= False
+)
 
-nouns = DocToNouns(posts)
-print(nouns)
\ No newline at end of file
+keywords = keyword_extractor.summarize(posts, topk=30)
+print(keywords)
\ No newline at end of file
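Assuming the textrank dependency is the lovit-style implementation matched by the KeywordSummarizer(tokenize=..., window=..., verbose=...) call above, summarize(posts, topk=30) should return up to 30 (word, rank) pairs, with each word still carrying its Komoran POS tag (e.g. '키워드/NNG') because komoran_tokenizer joins tags into the tokens. A hedged sketch for printing the result more readably than the raw list:

# Assumes `keywords` is a list of (word, rank) tuples as returned above.
for word, rank in keywords:
    print(f"{word.split('/')[0]}\t{rank:.4f}")  # drop the POS tag, keep the score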