박은주

Crawler Modified

...@@ -15,7 +15,6 @@ chromedriver.exe ...@@ -15,7 +15,6 @@ chromedriver.exe
15 *.csv 15 *.csv
16 *.xml 16 *.xml
17 *.json 17 *.json
18 -*.png
19 *.org 18 *.org
20 19
21 /KoreanSentimentAnalyzer/.gitignore 20 /KoreanSentimentAnalyzer/.gitignore
......
...@@ -2,9 +2,6 @@ import os ...@@ -2,9 +2,6 @@ import os
2 import csv 2 import csv
3 import re 3 import re
4 4
5 -from sklearn.metrics.pairwise import cosine_similarity
6 -from sklearn.feature_extraction.text import TfidfVectorizer
7 -from sklearn.cluster import KMeans
8 from konlpy.tag import Okt 5 from konlpy.tag import Okt
9 from konlpy.tag import Komoran 6 from konlpy.tag import Komoran
10 7
...@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer ...@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer
13 okt = Okt() 10 okt = Okt()
14 def Okt_tokenizer(sent): 11 def Okt_tokenizer(sent):
15 words = okt.nouns(sent) 12 words = okt.nouns(sent)
16 - # words = okt.pos(sent, join=True, stem=True)
17 - # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
18 return words 13 return words
19 14
20 komoran = Komoran() 15 komoran = Komoran()
21 def komoran_tokenizer(sent): 16 def komoran_tokenizer(sent):
22 - # words = []
23 - # for sentence in sent:
24 - # words += komoran.pos(sentence, join=True)
25 - # print("check : ", komoran.pos(sentence, join=True))
26 - # words = [komoran.pos(sentence, join=True) for sentence in sent]
27 words = komoran.pos(sent, join=True) 17 words = komoran.pos(sent, join=True)
28 words = [w for w in words if('/NNG' in w or '/NNP' in w)] 18 words = [w for w in words if('/NNG' in w or '/NNP' in w)]
29 return words 19 return words
30 20
31 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 21 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
32 22
33 -posts = [] 23 +def GetKeywords():
34 -with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db: 24 + posts = []
25 + with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
35 reader = csv.reader(db) 26 reader = csv.reader(db)
36 for data in reader: 27 for data in reader:
37 data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '') 28 data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
38 data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '') 29 data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
39 posts.append(data[0] + data[1]) 30 posts.append(data[0] + data[1])
40 31
41 -# tfidf_vectorizer = TfidfVectorizer() 32 + keyword_extractor = KeywordSummarizer(
42 -# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
43 -# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
44 -keyword_extractor = KeywordSummarizer(
45 - # tokenize=Okt_tokenizer,
46 tokenize=komoran_tokenizer, 33 tokenize=komoran_tokenizer,
47 window = -1, 34 window = -1,
48 verbose= False 35 verbose= False
49 -) 36 + )
50 37
51 -keywords = keyword_extractor.summarize(posts, topk=30)
52 -print(keywords)
...\ No newline at end of file ...\ No newline at end of file
38 + keywords = keyword_extractor.summarize(posts, topk=30)
39 + return keywords
...\ No newline at end of file ...\ No newline at end of file
......
...@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException ...@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException
10 from bs4 import BeautifulSoup 10 from bs4 import BeautifulSoup
11 from datetime import datetime, timedelta 11 from datetime import datetime, timedelta
12 12
13 +BASE_DIR = os.path.dirname(os.path.realpath(__file__))
14 +
13 def sleeptime(): 15 def sleeptime():
14 time.sleep(random.randint(1, 3)) 16 time.sleep(random.randint(1, 3))
15 17
...@@ -18,43 +20,44 @@ def Click(xpath): ...@@ -18,43 +20,44 @@ def Click(xpath):
18 driver.execute_script("arguments[0].click();", element) 20 driver.execute_script("arguments[0].click();", element)
19 sleeptime() 21 sleeptime()
20 22
21 -login_info = { 23 +def GetData():
22 - 'userID' : 'id', 24 + login_info = {
23 - 'userpw' : '********' 25 + 'userID' : 'qdw0313',
24 - 26 + 'userpw' : 'fejUfrQxHWwtcGcP0'
25 -} 27 + }
26 - 28 +
27 -options = webdriver.ChromeOptions() 29 + options = webdriver.ChromeOptions()
28 -# options.add_argument('headless') 30 + options.add_argument('headless')
29 -options.add_argument('no-sandbox') 31 + options.add_argument('no-sandbox')
30 -options.add_argument('window-size=1920x1080') 32 + options.add_argument('window-size=1920x1080')
31 -options.add_argument('disable-gpu') 33 + options.add_argument('disable-gpu')
32 -options.add_argument('disable-dev-shm-usage') 34 + options.add_argument('disable-dev-shm-usage')
33 -options.add_argument('lang=ko_KR') 35 + options.add_argument('lang=ko_KR')
34 -options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47') 36 + options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
35 - 37 +
36 -driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options) 38 + # driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
37 - 39 + driver = webdriver.Chrome(BASE_DIR + '/chromedriver.exe', options=options)
38 -driver.get('about:blank') 40 +
39 -driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});") 41 + driver.get('about:blank')
40 -sleeptime() 42 + driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
41 -driver.get('https://everytime.kr/login') 43 + sleeptime()
42 - 44 + driver.get('https://everytime.kr/login')
43 -sleeptime() 45 +
44 -driver.find_element_by_name('userid').send_keys(login_info['userID']) 46 + sleeptime()
45 -driver.find_element_by_name('password').send_keys(login_info['userpw']) 47 + driver.find_element_by_name('userid').send_keys(login_info['userID'])
46 -driver.find_element_by_class_name('submit').click() 48 + driver.find_element_by_name('password').send_keys(login_info['userpw'])
47 -sleeptime() 49 + driver.find_element_by_class_name('submit').click()
48 - 50 + sleeptime()
49 -# 국제캠 자게 51 +
50 -sleeptime() 52 + # 국제캠 자게
51 -yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M') 53 + sleeptime()
52 -print(yesterday) 54 + yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
53 -swt = True 55 + print(yesterday)
54 -page = 0 56 + swt = True
55 - 57 + page = 0
56 -post_df = pd.DataFrame(columns=['title', 'content']) 58 +
57 -while swt: 59 + post_df = pd.DataFrame(columns=['title', 'content'])
60 + while swt:
58 if page < 1: 61 if page < 1:
59 Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a') 62 Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
60 page += 1 63 page += 1
...@@ -91,10 +94,13 @@ while swt: ...@@ -91,10 +94,13 @@ while swt:
91 columns=['title', 'content'])) 94 columns=['title', 'content']))
92 # print("{0}. {1} : {2}".format(idx, title, content)) 95 # print("{0}. {1} : {2}".format(idx, title, content))
93 print(post[1].text) 96 print(post[1].text)
94 - if post[1].text < yesterday: 97 + print(yesterday < "06/02 16:35")
98 + exit()
99 + if post[1].text <= yesterday:
95 break 100 break
96 101
97 -post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
98 -print("CVS file saved")
99 -post_df.to_json('data.json', orient='records', encoding='utf-8-sig')
100 -print("JSON file saved")
...\ No newline at end of file ...\ No newline at end of file
102 + post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
103 + print("CVS file saved")
104 + with open('data.json', 'w', encoding='utf-8') as file:
105 + post_df.to_json(file, force_ascii=False)
106 + print("JSON file saved")
...\ No newline at end of file ...\ No newline at end of file
......