GetTopic.py
1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import csv
import re
import matplotlib.pyplot as plt
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Komoran
from PIL import Image
from textrank import KeywordSummarizer
from wordcloud import WordCloud, get_single_color_func
# Single Okt analyzer instance, built once at import time and reused by
# every call to Okt_tokenizer.
okt = Okt()


def Okt_tokenizer(sent):
    """Return whatever ``okt.nouns`` extracts from *sent*.

    Args:
        sent: The text to analyse.

    Returns:
        The result of ``okt.nouns(sent)``, passed through unchanged.
    """
    return okt.nouns(sent)
# Single Komoran analyzer instance, built once at import time and reused by
# every call to komoran_tokenizer.
komoran = Komoran()


def komoran_tokenizer(sent):
    """Tokenize *sent* with Komoran and keep only the longer noun tokens.

    ``komoran.pos(sent, join=True)`` is asked for joined tokens; a token is
    kept when it contains the ``/NNG`` or ``/NNP`` tag and is longer than
    five characters in total (tag included).

    Args:
        sent: The text to analyse.

    Returns:
        A list of the kept tokens, tag suffix still attached, in the order
        Komoran produced them.
    """
    kept = []
    for token in komoran.pos(sent, join=True):
        # Total length counts the tag too, so this drops very short words.
        if len(token) <= 5:
            continue
        if '/NNG' in token or '/NNP' in token:
            kept.append(token)
    return kept
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a red HSL colour string with a random lightness of 40-64 %.

    The signature matches the colour callback used with ``WordCloud`` in this
    file; only ``random_state`` influences the result.

    Args:
        word: Unused.
        font_size: Unused.
        position: Unused.
        orientation: Unused.
        random_state: Optional generator exposing ``randint(a, b)`` with an
            inclusive upper bound (e.g. ``random.Random``). When given, the
            lightness is drawn from it so a seeded caller gets reproducible
            colours. When ``None``, NumPy's global generator is used.
        **kwargs: Extra keyword arguments are accepted and ignored.

    Returns:
        A string of the form ``"hsl(0, 100%, <lightness>%)"``.
    """
    if random_state is None:
        # np.random.randint excludes the upper bound: values are 40..64.
        lightness = int(np.random.randint(40, 65))
    else:
        # Inclusive upper bound here, so 64 keeps the same 40..64 range.
        lightness = random_state.randint(40, 64)
    return "hsl(0, 100%, {:d}%)".format(lightness)
# Directory containing this file; data.csv is looked up next to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def GetKeywords():
    """Extract the top 100 keywords from ``data.csv`` with KeywordSummarizer.

    Each CSV row contributes one text built from its first two columns. The
    texts are handed to ``KeywordSummarizer`` (tokenised by
    ``komoran_tokenizer``) and the ``topk=100`` result is returned.

    Returns:
        The value of ``KeywordSummarizer.summarize(posts, topk=100)``, or an
        empty list when the CSV holds no usable text.

    Raises:
        OSError: If ``data.csv`` cannot be opened.
    """
    posts = []
    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file object.
    with open(os.path.join(BASE_DIR, 'data.csv'), 'r', encoding='utf-8', newline='') as db:
        for row in csv.reader(db):
            # Blank lines yield [] and short rows have a single field; slicing
            # avoids the IndexError that row[0] + row[1] would raise. Joining
            # with a space keeps the last word of the first column from fusing
            # with the first word of the second before tokenisation.
            text = ' '.join(field for field in row[:2] if field).strip()
            if text:
                posts.append(text)

    if not posts:
        # Nothing to rank; skip building the summarizer on empty input.
        return []

    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenizer,
        window=-1,
        verbose=False,
    )
    return keyword_extractor.summarize(posts, topk=100)
def GetWordCloud():
    """Render the keyword word cloud and save it as a PNG.

    Takes the keywords from ``GetKeywords()``, strips the last four
    characters of each token (the tag suffix left by the tokenizer), and
    draws a masked word cloud weighted by the keyword scores. The image is
    written to ``./static/images/wordcloud.png`` and then displayed.

    All ``./static/...`` paths are relative to the current working directory.

    Returns:
        None.
    """
    keywords = GetKeywords()

    texts = {}
    for entry in keywords:
        word = entry[0][0:-4]
        score = entry[1]
        # Two tagged tokens can strip to the same word; keep the higher score
        # instead of letting the later entry overwrite the earlier one.
        texts[word] = max(score, texts.get(word, score))

    # Close the mask file once its pixels have been copied into the array.
    with Image.open('./static/images/khu_lion.png') as mask_image:
        khu_mask = np.array(mask_image)

    wordcloud = WordCloud(
        font_path='./static/fonts/NanumGothicBold.ttf',
        width=800,
        height=600,
        background_color="white",
        mask=khu_mask,
        color_func=color_func,
    )
    wordcloud = wordcloud.generate_from_frequencies(texts)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    try:
        plt.imshow(array, interpolation="bilinear")
        plt.axis("off")
        # Save before show(): show() may block until the window is closed,
        # and the file should exist whether or not a display is available.
        fig.savefig('./static/images/wordcloud.png')
        plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
# Build and save the word cloud as soon as this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this call
# also runs whenever the module is imported -- confirm that is intended.
GetWordCloud()