박은주

수정: 전문 크롤링

...@@ -20,3 +20,7 @@ chromedriver.exe ...@@ -20,3 +20,7 @@ chromedriver.exe
20 20
21 /KoreanSentimentAnalyzer/.gitignore 21 /KoreanSentimentAnalyzer/.gitignore
22 /KoreanSentimentAnalyzer/.git/ 22 /KoreanSentimentAnalyzer/.git/
23 +/textrank/
24 +/textrank.egg-info/
25 +/build/
26 +/dist/
......
1 import os 1 import os
2 import csv 2 import csv
3 +import re
3 4
4 from sklearn.metrics.pairwise import cosine_similarity 5 from sklearn.metrics.pairwise import cosine_similarity
5 from sklearn.feature_extraction.text import TfidfVectorizer 6 from sklearn.feature_extraction.text import TfidfVectorizer
6 from sklearn.cluster import KMeans 7 from sklearn.cluster import KMeans
7 from konlpy.tag import Okt 8 from konlpy.tag import Okt
9 +from konlpy.tag import Komoran
10 +
11 +from textrank import KeywordSummarizer
8 12
9 okt = Okt() 13 okt = Okt()
10 -def DocToNouns(docs): 14 +def Okt_tokenizer(sent):
11 - return [{ 15 + words = okt.nouns(sent)
12 - 'id': i, 16 + # words = okt.pos(sent, join=True, stem=True)
13 - 'nouns': ' '.join(okt.nouns(doc)), 17 + # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
14 - } for i, doc in enumerate(docs)] 18 + return words
19 +
20 +komoran = Komoran()
21 +def komoran_tokenizer(sent):
22 + # words = []
23 + # for sentence in sent:
24 + # words += komoran.pos(sentence, join=True)
25 + # print("check : ", komoran.pos(sentence, join=True))
26 + # words = [komoran.pos(sentence, join=True) for sentence in sent]
27 + words = komoran.pos(sent, join=True)
28 + words = [w for w in words if('/NNG' in w or '/NNP' in w)]
29 + return words
15 30
16 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 31 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
17 32
18 posts = [] 33 posts = []
19 -with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8-sig') as db: 34 +with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
20 reader = csv.reader(db) 35 reader = csv.reader(db)
21 for data in reader: 36 for data in reader:
22 - posts.append(data) 37 + data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
38 + data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
39 + posts.append(data[0] + data[1])
40 +
41 +# tfidf_vectorizer = TfidfVectorizer()
42 +# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
43 +# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
44 +keyword_extractor = KeywordSummarizer(
45 + # tokenize=Okt_tokenizer,
46 + tokenize=komoran_tokenizer,
47 + window = -1,
48 + verbose= False
49 +)
23 50
24 -nouns = DocToNouns(posts)
25 -print(nouns)
...\ No newline at end of file ...\ No newline at end of file
51 +keywords = keyword_extractor.summarize(posts, topk=30)
52 +print(keywords)
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -10,7 +10,12 @@ ...@@ -10,7 +10,12 @@
10 * konlpy 10 * konlpy
11 * jpype 11 * jpype
12 12
13 +## Package
14 +> TextRank Package https://github.com/lovit/textrank/ <br>
15 +> Sentiment Analyzer https://github.com/mrlee23/KoreanSentimentAnalyzer
16 +
13 17
14 ## WORK 18 ## WORK
15 -[] everytime.kr(경희대)로부터 24시간 이내의 이슈를 정리 ; 주기능
16 -[] 질문 입력시 해당하는 게시글 검색 ; 챗봇
...\ No newline at end of file ...\ No newline at end of file
19 +- [ ] everytime.kr(경희대)로부터 24시간 이내의 이슈 토픽 정리 ; 주기능<br>
20 +- [ ] 질문 입력시 해당하는 게시글 검색<br>
21 +- [ ] 지난 24시간 이내의 게시판 분위기 정리
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -19,12 +19,13 @@ def Click(xpath): ...@@ -19,12 +19,13 @@ def Click(xpath):
19 sleeptime() 19 sleeptime()
20 20
21 login_info = { 21 login_info = {
22 - 'userID' : 'qdw0313', 22 + 'userID' : 'id',
23 - 'userpw' : 'fejUfrQxHWwtcGcP0' 23 + 'userpw' : '********'
24 +
24 } 25 }
25 26
26 options = webdriver.ChromeOptions() 27 options = webdriver.ChromeOptions()
27 -options.add_argument('headless') 28 +# options.add_argument('headless')
28 options.add_argument('no-sandbox') 29 options.add_argument('no-sandbox')
29 options.add_argument('window-size=1920x1080') 30 options.add_argument('window-size=1920x1080')
30 options.add_argument('disable-gpu') 31 options.add_argument('disable-gpu')
...@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage') ...@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage')
32 options.add_argument('lang=ko_KR') 33 options.add_argument('lang=ko_KR')
33 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47') 34 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
34 35
35 -driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options) 36 +driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
36 37
37 driver.get('about:blank') 38 driver.get('about:blank')
38 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});") 39 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
...@@ -72,10 +73,7 @@ while swt: ...@@ -72,10 +73,7 @@ while swt:
72 73
73 TitleList = soup.select('#container > div.wrap.articles > article > a > h2') 74 TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
74 DateList = soup.select('#container > div.wrap.articles > article > a > time') 75 DateList = soup.select('#container > div.wrap.articles > article > a > time')
75 - # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
76 76
77 - # idx = 1
78 - # for post in zip(TitleList, ContentList, DateList):
79 for post in zip(TitleList, DateList): 77 for post in zip(TitleList, DateList):
80 title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) 78 title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
81 try: 79 try:
...@@ -94,7 +92,6 @@ while swt: ...@@ -94,7 +92,6 @@ while swt:
94 # print("{0}. {1} : {2}".format(idx, title, content)) 92 # print("{0}. {1} : {2}".format(idx, title, content))
95 print(post[1].text) 93 print(post[1].text)
96 if post[1].text < yesterday: 94 if post[1].text < yesterday:
97 - swt = False
98 break 95 break
99 96
100 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig') 97 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
......