박은주

TextRank를 이용하여 메인 토픽(최대 Top 30) 추출

......@@ -14,3 +14,7 @@ chromedriver.exe
*.iml
*.csv
*.xml
/textrank/
/textrank.egg-info/
/build/
/dist/
\ No newline at end of file
......
import os
import csv
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt
from konlpy.tag import Komoran
from textrank import KeywordSummarizer
# Shared konlpy Okt tagger instance; DocToNouns and Okt_tokenizer both call it.
okt = Okt()
def DocToNouns(docs):
    """Extract the nouns of every document with the shared Okt tagger.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one dict per document, in input order. Each dict holds
        ``'id'`` (the document's zero-based position in ``docs``) and
        ``'nouns'`` (the result of ``okt.nouns`` joined with single spaces).
    """
    results = []
    for index, text in enumerate(docs):
        noun_list = okt.nouns(text)
        results.append({'id': index, 'nouns': ' '.join(noun_list)})
    return results
def Okt_tokenizer(sent):
    """Tokenize one sentence into its nouns using the shared Okt tagger.

    Args:
        sent: The sentence to tokenize.

    Returns:
        Whatever ``okt.nouns(sent)`` returns for the sentence.
    """
    # NOTE: an earlier, disabled variant tagged the sentence with
    # okt.pos(sent, join=True, stem=True) and kept the tokens marked
    # '/Noun', '/Verb' or '/Adjective' instead of nouns only.
    return okt.nouns(sent)
# Shared konlpy Komoran tagger instance used by komoran_tokenizer.
komoran = Komoran()
def komoran_tokenizer(sent):
    """Tokenize one sentence with Komoran, keeping only noun-tagged tokens.

    Args:
        sent: The sentence to tokenize.

    Returns:
        The joined tokens from ``komoran.pos(sent, join=True)`` that contain
        the '/NNG' or '/NNP' tag marker, in their original order.
        (Presumably common and proper nouns in Komoran's tag set; confirm
        against the KoNLPy tag table.)
    """
    noun_markers = ('/NNG', '/NNP')
    tagged_tokens = komoran.pos(sent, join=True)
    return [
        token
        for token in tagged_tokens
        if any(marker in token for marker in noun_markers)
    ]
# Directory that contains this script; data.csv is read from the same place.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# One cleaned string per CSV row: first column followed by second column.
posts = []

# 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark,
# so the first cell is clean whether or not the file was saved with a BOM.
with open(os.path.join(BASE_DIR, 'data.csv'), 'r', encoding='utf-8-sig') as db:
    for data in csv.reader(db):
        # Blank lines and rows without a second column cannot be combined.
        if len(data) < 2:
            continue
        # Strip everything that is neither a word character nor whitespace,
        # then remove the line breaks that survive inside a cell.
        first = re.sub(r'[^\w\s]', '', data[0]).replace('\n', '')
        second = re.sub(r'[^\w\s]', '', data[1]).replace('\n', '')
        posts.append(first + second)
# Earlier TF-IDF experiment, left disabled:
# tfidf_vectorizer = TfidfVectorizer()
# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)

# TextRank keyword extractor that tokenizes with komoran_tokenizer.
# NOTE(review): window=-1 presumably means "use the whole sentence as the
# co-occurrence context" - confirm against the textrank package docs.
keyword_extractor = KeywordSummarizer(
    # tokenize=Okt_tokenizer,  # alternative Okt-based tokenizer
    tokenize=komoran_tokenizer,
    window=-1,
    verbose=False,
)
# Run TextRank over all cleaned posts and keep at most the 30 top-ranked
# keywords, then print the result.
keywords = keyword_extractor.summarize(posts, topk=30)
print(keywords)
......