Showing 4 changed files with 52 additions and 19 deletions
@@ -1,25 +1,52 @@
 import os
 import csv
+import re
 
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from konlpy.tag import Okt
+from konlpy.tag import Komoran
+
+from textrank import KeywordSummarizer
 
 okt = Okt()
-def DocToNouns(docs):
-    return [{
-        'id': i,
-        'nouns': ' '.join(okt.nouns(doc)),
-    } for i, doc in enumerate(docs)]
+def Okt_tokenizer(sent):
+    words = okt.nouns(sent)
+    # words = okt.pos(sent, join=True, stem=True)
+    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
+    return words
+
+komoran = Komoran()
+def komoran_tokenizer(sent):
+    # words = []
+    # for sentence in sent:
+    #     words += komoran.pos(sentence, join=True)
+    #     print("check : ", komoran.pos(sentence, join=True))
+    # words = [komoran.pos(sentence, join=True) for sentence in sent]
+    words = komoran.pos(sent, join=True)
+    words = [w for w in words if('/NNG' in w or '/NNP' in w)]
+    return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
 posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db:
+with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
     reader = csv.reader(db)
     for data in reader:
-        posts.append(data)
+        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+        posts.append(data[0] + data[1])
+
+# tfidf_vectorizer = TfidfVectorizer()
+# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
+# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
+keyword_extractor = KeywordSummarizer(
+    # tokenize=Okt_tokenizer,
+    tokenize=komoran_tokenizer,
+    window = -1,
+    verbose = False
+)
 
-nouns = DocToNouns(posts)
-print(nouns)
\ No newline at end of file
+keywords = keyword_extractor.summarize(posts, topk=30)
+print(keywords)
\ No newline at end of file
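
Reviewer note on the keyword-extraction change above: the lovit/textrank package linked in the README runs TextRank over whatever tokens the supplied `tokenize` callable returns; as far as I understand the package, `window=-1` counts co-occurrence across a whole sentence rather than a fixed token window, and `summarize(posts, topk=30)` returns `(token, score)` pairs sorted by rank. A minimal sketch of that usage, under those assumptions and with made-up sample sentences:

```python
# Minimal sketch, assuming the lovit/textrank KeywordSummarizer API and a
# working konlpy/Komoran (Java) installation. The sentences are illustrative
# placeholders, not data from this project.
from konlpy.tag import Komoran
from textrank import KeywordSummarizer

komoran = Komoran()

def komoran_tokenizer(sent):
    # join=True yields tokens such as '도서관/NNG'; keep common/proper nouns only
    words = komoran.pos(sent, join=True)
    return [w for w in words if ('/NNG' in w or '/NNP' in w)]

sents = [
    '중앙 도서관 좌석 예약 방법 질문입니다',
    '중앙 도서관 좌석이 너무 부족합니다',
    '기숙사 식당 메뉴가 바뀌었나요',
    '기숙사 식당 운영 시간 질문드립니다',
]

summarizer = KeywordSummarizer(
    tokenize=komoran_tokenizer,
    window=-1,      # co-occurrence counted within each whole sentence
    verbose=False,
)

# Expected shape: [('도서관/NNG', 1.5...), ('식당/NNG', 1.4...), ...]
for word, rank in summarizer.summarize(sents, topk=5):
    print(word, rank)
```

If the commented-out Okt_tokenizer path is re-enabled instead, the same call should work, since the summarizer only needs a callable that maps a sentence to a list of token strings.
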
@@ -10,7 +10,12 @@
 * konlpy
 * jpype
 
+## Package
+> TextRank Package https://github.com/lovit/textrank/ <br>
+> Sentiment Analyzer https://github.com/mrlee23/KoreanSentimentAnalyzer
+
 
 ## WORK
-[] Summarize the issues of the last 24 hours from everytime.kr (경희대) ; main feature
-[] Search for matching posts when a question is entered ; chatbot
\ No newline at end of file
+- [ ] Summarize the issue topics of the last 24 hours from everytime.kr (경희대) ; main feature<br>
+- [ ] Search for matching posts when a question is entered<br>
+- [ ] Summarize the overall board mood over the last 24 hours
\ No newline at end of file
@@ -19,12 +19,13 @@ def Click(xpath):
     sleeptime()
 
 login_info = {
-    'userID' : 'qdw0313',
-    'userpw' : 'fejUfrQxHWwtcGcP0'
+    'userID' : 'id',
+    'userpw' : '********'
+
 }
 
 options = webdriver.ChromeOptions()
-options.add_argument('headless')
+# options.add_argument('headless')
 options.add_argument('no-sandbox')
 options.add_argument('window-size=1920x1080')
 options.add_argument('disable-gpu')
@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage')
 options.add_argument('lang=ko_KR')
 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
 
-driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options)
+driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
 
 driver.get('about:blank')
 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
@@ -72,10 +73,7 @@ while swt:
 
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
    DateList = soup.select('#container > div.wrap.articles > article > a > time')
-    # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
 
-    # idx = 1
-    # for post in zip(TitleList, ContentList, DateList):
     for post in zip(TitleList, DateList):
         title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
         try:
@@ -94,7 +92,6 @@ while swt:
             # print("{0}. {1} : {2}".format(idx, title, content))
             print(post[1].text)
             if post[1].text < yesterday:
-                swt = False
                 break
 
     post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
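
A small follow-up on the chromedriver line swapped above: the driver path is still hard-coded to one machine. If adding the third-party webdriver_manager package is acceptable (an assumption; it is not a dependency of this repo), the driver can be resolved at runtime while keeping the Selenium 3 style constructor the crawler already uses. A sketch:

```python
# Hypothetical alternative to the hard-coded chromedriver path above.
# Assumes the third-party webdriver_manager package is installed; it downloads
# a matching chromedriver and returns its local path.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
# options.add_argument('headless')   # re-enable for unattended runs
options.add_argument('no-sandbox')
options.add_argument('window-size=1920x1080')
options.add_argument('disable-gpu')
options.add_argument('lang=ko_KR')

# Selenium 3 style: the executable path is passed positionally, as in the crawler
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get('about:blank')
```
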