Showing 2 changed files with 40 additions and 9 deletions
 import os
 import csv
+import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+from konlpy.tag import Komoran
+
+from textrank import KeywordSummarizer
 
 okt = Okt()
-def DocToNouns(docs):
-    return [{
-        'id': i,
-        'nouns': ' '.join(okt.nouns(doc)),
-    } for i, doc in enumerate(docs)]
+def Okt_tokenizer(sent):
+    words = okt.nouns(sent)
+    # words = okt.pos(sent, join=True, stem=True)
+    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+    return words
+
+komoran = Komoran()
+def komoran_tokenizer(sent):
+    # words = []
+    # for sentence in sent:
+    #     words += komoran.pos(sentence, join=True)
+    #     print("check : ", komoran.pos(sentence, join=True))
+    # words = [komoran.pos(sentence, join=True) for sentence in sent]
+    words = komoran.pos(sent, join=True)
+    words = [w for w in words if('/NNG' in w or '/NNP' in w)]
+    return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
     reader = csv.reader(db)
     for data in reader:
-        posts.append(data)
+        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+        posts.append(data[0] + data[1])
+
+# tfidf_vectorizer = TfidfVectorizer()
+# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+keyword_extractor = KeywordSummarizer(
+    # tokenize=Okt_tokenizer,
+    tokenize=komoran_tokenizer,
+    window=-1,
+    verbose=False
+)
 
-nouns = DocToNouns(posts)
-print(nouns)
\ No newline at end of file
+keywords = keyword_extractor.summarize(posts, topk=30)
+print(keywords)
\ No newline at end of file
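For reference, a minimal sketch of how the new komoran_tokenizer and KeywordSummarizer added in this commit are expected to behave. The sample sentences, the printed output shapes, and the topk value are illustrative assumptions, not part of the commit; this assumes the textrank package imported above is lovit's TextRank implementation.

# Illustrative sketch, not part of the commit.
from konlpy.tag import Komoran
from textrank import KeywordSummarizer

komoran = Komoran()

def komoran_tokenizer(sent):
    # komoran.pos(..., join=True) yields tokens like '키워드/NNG';
    # keep only common nouns (NNG) and proper nouns (NNP)
    return [w for w in komoran.pos(sent, join=True) if '/NNG' in w or '/NNP' in w]

sample_posts = [                      # hypothetical input documents
    '키워드 추출 예시를 위한 첫 번째 문장입니다',
    '두 번째 문장에서도 비슷한 키워드가 나옵니다',
]
print(komoran_tokenizer(sample_posts[0]))   # e.g. ['키워드/NNG', '추출/NNG', '예시/NNG', ...]

extractor = KeywordSummarizer(tokenize=komoran_tokenizer, window=-1, verbose=False)
print(extractor.summarize(sample_posts, topk=5))   # list of (word, rank score) pairs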