GetTopic.py
1.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import csv
import re
from konlpy.tag import Okt
from konlpy.tag import Komoran
from textrank import KeywordSummarizer
# Shared Okt tagger instance, created once when the module is imported.
okt = Okt()


def Okt_tokenizer(sent):
    """Return the nouns that the shared Okt tagger extracts from *sent*.

    Args:
        sent: Text to analyse.

    Returns:
        Whatever ``okt.nouns(sent)`` produces for the given text.
    """
    return okt.nouns(sent)
# Shared Komoran tagger instance, created once when the module is imported.
komoran = Komoran()


def komoran_tokenizer(sent):
    """Tag *sent* with Komoran and keep only tokens carrying an NNG or NNP tag.

    The tagger is called with ``join=True``, so every token is a single
    string that already contains its tag; the filter is a plain substring
    match on that joined string.

    Args:
        sent: Text to analyse.

    Returns:
        A list of the joined token strings that contain ``/NNG`` or ``/NNP``.
    """
    noun_markers = ('/NNG', '/NNP')
    tagged_tokens = komoran.pos(sent, join=True)
    return [
        token
        for token in tagged_tokens
        if any(marker in token for marker in noun_markers)
    ]
# Directory containing this module; the default data.csv is resolved against it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def GetKeywords(csv_path=None, topk=30):
    """Extract keywords from the posts stored in a CSV file.

    The first two columns of every row are cleaned (punctuation removed,
    line breaks turned into spaces), joined with a single space and passed
    as one document to ``KeywordSummarizer`` using ``komoran_tokenizer``.

    Args:
        csv_path: Path of the UTF-8 CSV file to read. Defaults to
            ``data.csv`` located next to this module.
        topk: Number of keywords requested from the summarizer (default 30).

    Returns:
        The value returned by ``KeywordSummarizer.summarize`` for the
        collected posts (presumably keyword/score pairs -- confirm against
        the textrank package).

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    if csv_path is None:
        csv_path = os.path.join(BASE_DIR, 'data.csv')

    # Raw string: in a plain literal '\w' and '\s' are invalid escape sequences.
    non_word = re.compile(r'[^\w\s]')

    def clean(text):
        # Replace line breaks with a space (instead of deleting them) so the
        # words on either side of the break are not fused into one token.
        return non_word.sub('', text).replace('\n', ' ')

    posts = []
    with open(csv_path, 'r', encoding='utf-8') as db:
        for row in csv.reader(db):
            # csv.reader yields [] for blank lines; indexing them would raise.
            if not row:
                continue
            # Use at most the first two columns; the space keeps the last word
            # of the first column apart from the first word of the second.
            posts.append(' '.join(clean(field) for field in row[:2]))

    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenizer,
        window=-1,
        verbose=False,
    )
    return keyword_extractor.summarize(posts, topk=topk)