# GetTopic.py 1.82 KB
#
# Raw Blame History Permalink

import os
import csv
import re

import matplotlib.pyplot as plt
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Komoran
from PIL import Image

from textrank import KeywordSummarizer
from wordcloud import WordCloud, get_single_color_func


# Shared Okt analyzer: created once when the module is imported and reused
# by every Okt_tokenizer call.
okt = Okt()


def Okt_tokenizer(sent):
    """Return the nouns that the shared Okt analyzer finds in ``sent``.

    Args:
        sent: Text to analyze; handed to ``okt.nouns`` unchanged.

    Returns:
        The result of ``okt.nouns(sent)``.
    """
    return okt.nouns(sent)

# Shared Komoran analyzer: created once when the module is imported and
# reused by every komoran_tokenizer call.
komoran = Komoran()


def komoran_tokenizer(sent):
    """Tokenize ``sent`` with Komoran and keep only the noun tokens.

    ``komoran.pos(sent, join=True)`` yields tagged strings (KoNLPy documents
    the joined form as ``"surface/TAG"``). A token is kept when it contains
    ``/NNG`` or ``/NNP`` and its total length exceeds five characters; for a
    token shaped like ``surface/NNG`` the tag suffix is four characters, so
    this drops single-character surfaces.

    Args:
        sent: Text to analyze.

    Returns:
        A list of the kept tagged tokens, in the order Komoran produced them.
        The tag suffix is NOT stripped here.
    """
    noun_tags = ('/NNG', '/NNP')
    kept = []
    for token in komoran.pos(sent, join=True):
        if len(token) <= 5:
            continue
        if any(tag in token for tag in noun_tags):
            kept.append(token)
    return kept

def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Pick a red hue with a random lightness for one word of the cloud.

    Intended as a ``WordCloud(color_func=...)`` callback. The word, size,
    position and orientation are accepted for signature compatibility but do
    not influence the colour.

    Args:
        word: The word being drawn (unused).
        font_size: Font size chosen for the word (unused).
        position: Placement of the word (unused).
        orientation: Orientation of the word (unused).
        random_state: Optional generator exposing ``randrange`` (wordcloud
            passes a ``random.Random``). When given, it drives the lightness
            so a seeded cloud is reproducible; when ``None`` NumPy's global
            generator is used instead.
        **kwargs: Extra keyword arguments from the caller (e.g. ``font_path``);
            ignored.

    Returns:
        A string ``"hsl(0, 100%, L%)"`` with ``L`` an integer in ``[40, 64]``.
    """
    if random_state is None:
        # np.random.randint excludes the upper bound, hence 65 -> max 64.
        lightness = int(np.random.randint(40, 65))
    else:
        # randrange has the same half-open range, so both paths agree.
        lightness = random_state.randrange(40, 65)
    return "hsl(0, 100%, {:d}%)".format(lightness)

# Absolute path of the directory that contains this module; GetKeywords uses
# it to locate data.csv independently of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def GetKeywords():
    """Extract the top 100 keywords from ``data.csv`` with TextRank.

    Reads ``data.csv`` from the directory of this module, builds one document
    per row from its first two columns (presumably title and body — confirm
    against the data producer), and ranks the Komoran noun tokens with
    ``KeywordSummarizer``.

    Returns:
        The result of ``KeywordSummarizer.summarize(posts, topk=100)``. The
        caller in this file indexes each entry as ``(tagged_word, weight)``.

    Raises:
        OSError: If ``data.csv`` cannot be opened.
    """
    posts = []
    csv_path = os.path.join(BASE_DIR, 'data.csv')
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file object.
    with open(csv_path, 'r', encoding='utf-8', newline='') as db:
        for row in csv.reader(db):
            # Blank lines come back as [] and malformed rows may lack the
            # second column; skip them instead of failing with IndexError.
            if len(row) < 2:
                continue
            # Separate the two columns so the last word of the first and the
            # first word of the second are not fused into a single token.
            posts.append(row[0] + ' ' + row[1])

    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenizer,
        window=-1,
        verbose=False,
    )

    return keyword_extractor.summarize(posts, topk=100)

def GetWordCloud():
    """Render the extracted keywords as a masked word cloud and save it.

    Takes the keywords from ``GetKeywords()``, draws them inside the shape of
    ``./static/images/khu_lion.png`` using ``color_func`` for colouring, writes
    the result to ``./static/images/wordcloud.png`` and then shows the figure.

    Both image paths and the font path are relative, so they are resolved
    against the current working directory (unlike ``data.csv``, which
    ``GetKeywords`` locates relative to this module).

    Returns:
        None.
    """
    # Each keyword is a tagged token ending in a four-character tag
    # ("/NNG" or "/NNP"); strip it to get the display text. If two tokens
    # share a surface form, the later entry's weight wins.
    texts = {word[0][0:-4]: word[1] for word in GetKeywords()}

    # Load the mask inside a context manager so the file handle is released
    # as soon as the pixel data has been copied into the array.
    with Image.open('./static/images/khu_lion.png') as mask_image:
        khu_mask = np.array(mask_image)

    # NOTE: per the wordcloud documentation, width/height are ignored when a
    # mask is supplied; the mask's shape determines the canvas size.
    wordcloud = WordCloud(
        font_path='./static/fonts/NanumGothicBold.ttf',
        width=800,
        height=600,
        background_color="white",
        mask=khu_mask,
        color_func=color_func,
    )
    wordcloud = wordcloud.generate_from_frequencies(texts)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation="bilinear")
    plt.axis("off")
    # Save BEFORE show(): with a blocking GUI backend the figure can be
    # destroyed when its window is closed, which would leave a blank PNG.
    fig.savefig('./static/images/wordcloud.png')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)