박은주

Crawler Modified

@@ -15,7 +15,6 @@ chromedriver.exe
 *.csv
 *.xml
 *.json
-*.png
 *.org
 
 /KoreanSentimentAnalyzer/.gitignore
......
@@ -2,9 +2,6 @@ import os
 import csv
 import re
 
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
 from konlpy.tag import Okt
 from konlpy.tag import Komoran
 
@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer
 okt = Okt()
 def Okt_tokenizer(sent):
     words = okt.nouns(sent)
-    # words = okt.pos(sent, join=True, stem=True)
-    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
     return words
 
 komoran = Komoran()
 def komoran_tokenizer(sent):
-    # words = []
-    # for sentence in sent:
-    #     words += komoran.pos(sentence, join=True)
-    #     print("check : ", komoran.pos(sentence, join=True))
-    # words = [komoran.pos(sentence, join=True) for sentence in sent]
     words = komoran.pos(sent, join=True)
     words = [w for w in words if('/NNG' in w or '/NNP' in w)]
     return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
-    reader = csv.reader(db)
-    for data in reader:
-        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
-        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
-        posts.append(data[0] + data[1])
-
-# tfidf_vectorizer = TfidfVectorizer()
-# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
-# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
-keyword_extractor = KeywordSummarizer(
-    # tokenize=Okt_tokenizer,
-    tokenize=komoran_tokenizer,
-    window = -1,
-    verbose= False
-)
-
-keywords = keyword_extractor.summarize(posts, topk=30)
-print(keywords)
\ No newline at end of file
+def GetKeywords():
+    posts = []
+    with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
+        reader = csv.reader(db)
+        for data in reader:
+            data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+            data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+            posts.append(data[0] + data[1])
+
+    keyword_extractor = KeywordSummarizer(
+        tokenize=komoran_tokenizer,
+        window = -1,
+        verbose= False
+    )
+
+    keywords = keyword_extractor.summarize(posts, topk=30)
+    return keywords
\ No newline at end of file
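
The extraction pipeline itself is unchanged; wrapping it in `GetKeywords()` just keeps it from running at import time, so the crawler side can trigger it on demand. Below is a minimal sketch of a caller, assuming this module is importable as `keyword_extractor` (a hypothetical file name) and that textrank's `KeywordSummarizer.summarize()` returns `(word, rank)` pairs with the Komoran POS tag still attached (e.g. `'학교/NNG'`):

```python
# Hypothetical caller; the module name and the return shape are assumptions.
from keyword_extractor import GetKeywords

for word, rank in GetKeywords():
    # komoran_tokenizer keeps the '/NNG' and '/NNP' suffixes; strip them for display
    print(word.split('/')[0], round(rank, 3))
```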
......
@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 
+BASE_DIR = os.path.dirname(os.path.realpath(__file__))
+
 def sleeptime():
     time.sleep(random.randint(1, 3))
 
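
The `BASE_DIR` added above lets the next hunk load `chromedriver.exe` from the script's own directory instead of the hard-coded `C:\Users\...` path. A small sketch of the same idea with `os.path.join`, the separator-safe spelling of the `BASE_DIR + '/chromedriver.exe'` concatenation used in this commit:

```python
import os

BASE_DIR = os.path.dirname(os.path.realpath(__file__))
# resolves to the same file as BASE_DIR + '/chromedriver.exe' on Windows,
# but stays portable if the script ever runs elsewhere
driver_path = os.path.join(BASE_DIR, 'chromedriver.exe')
```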
@@ -18,83 +20,87 @@ def Click(xpath):
     driver.execute_script("arguments[0].click();", element)
     sleeptime()
 
-login_info = {
-    'userID' : 'id',
-    'userpw' : '********'
-
-}
-
-options = webdriver.ChromeOptions()
-# options.add_argument('headless')
-options.add_argument('no-sandbox')
-options.add_argument('window-size=1920x1080')
-options.add_argument('disable-gpu')
-options.add_argument('disable-dev-shm-usage')
-options.add_argument('lang=ko_KR')
-options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
-
-driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
-
-driver.get('about:blank')
-driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
-sleeptime()
-driver.get('https://everytime.kr/login')
-
-sleeptime()
-driver.find_element_by_name('userid').send_keys(login_info['userID'])
-driver.find_element_by_name('password').send_keys(login_info['userpw'])
-driver.find_element_by_class_name('submit').click()
-sleeptime()
-
-# international-campus free board (국제캠 자게)
-sleeptime()
-yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
-print(yesterday)
-swt = True
-page = 0
-
-post_df = pd.DataFrame(columns=['title', 'content'])
-while swt:
-    if page < 1:
-        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
-        page += 1
-    else:
-        if page == 1:
-            Click('//*[@id="container"]/div[2]/div[2]/a')
-            page += 1
-        elif page == 2:
-            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+def GetData():
+    login_info = {
+        'userID' : 'qdw0313',
+        'userpw' : 'fejUfrQxHWwtcGcP0'
+    }
+
+    options = webdriver.ChromeOptions()
+    options.add_argument('headless')
+    options.add_argument('no-sandbox')
+    options.add_argument('window-size=1920x1080')
+    options.add_argument('disable-gpu')
+    options.add_argument('disable-dev-shm-usage')
+    options.add_argument('lang=ko_KR')
+    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
+
+    # driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
+    driver = webdriver.Chrome(BASE_DIR + '/chromedriver.exe', options=options)
+
+    driver.get('about:blank')
+    driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
+    sleeptime()
+    driver.get('https://everytime.kr/login')
+
+    sleeptime()
+    driver.find_element_by_name('userid').send_keys(login_info['userID'])
+    driver.find_element_by_name('password').send_keys(login_info['userpw'])
+    driver.find_element_by_class_name('submit').click()
+    sleeptime()
+
+    # international-campus free board (국제캠 자게)
+    sleeptime()
+    yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+    print(yesterday)
+    swt = True
+    page = 0
+
+    post_df = pd.DataFrame(columns=['title', 'content'])
+    while swt:
+        if page < 1:
+            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
             page += 1
         else:
-            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
-
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-
-    TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    DateList = soup.select('#container > div.wrap.articles > article > a > time')
-
-    for post in zip(TitleList, DateList):
-        title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
-        try:
-            Click("//h2[contains(text(), '{}')]".format(title))
-        except NoSuchElementException:
-            continue
-        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
-        driver.back()
-        sleeptime()
-
-        if not (post_df['title'] == title).any():
-            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
-            content = re.sub(pattern='[^\w\s]', repl='', string=content)
-            post_df = post_df.append(pd.DataFrame([[title, content]],
-                                                  columns=['title', 'content']))
-            # print("{0}. {1} : {2}".format(idx, title, content))
-        print(post[1].text)
-        if post[1].text < yesterday:
-            break
-
-post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
-print("CSV file saved")
-post_df.to_json('data.json', orient='records', encoding='utf-8-sig')
-print("JSON file saved")
\ No newline at end of file
+            if page == 1:
+                Click('//*[@id="container"]/div[2]/div[2]/a')
+                page += 1
+            elif page == 2:
+                Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+                page += 1
+            else:
+                Click('//*[@id="container"]/div[2]/div[2]/a[3]')
+
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'html.parser')
+
+        TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
+        DateList = soup.select('#container > div.wrap.articles > article > a > time')
+
+        for post in zip(TitleList, DateList):
+            title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
+            try:
+                Click("//h2[contains(text(), '{}')]".format(title))
+            except NoSuchElementException:
+                continue
+            content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+            driver.back()
+            sleeptime()
+
+            if not (post_df['title'] == title).any():
+                # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+                content = re.sub(pattern='[^\w\s]', repl='', string=content)
+                post_df = post_df.append(pd.DataFrame([[title, content]],
+                                                      columns=['title', 'content']))
+                # print("{0}. {1} : {2}".format(idx, title, content))
+            print(post[1].text)
+            print(yesterday < "06/02 16:35")
+            exit()
+            if post[1].text <= yesterday:
+                break
+
+    post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
+    print("CSV file saved")
+    with open('data.json', 'w', encoding='utf-8') as file:
+        post_df.to_json(file, force_ascii=False)
+    print("JSON file saved")
\ No newline at end of file
......
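
With both scripts wrapped in functions, a small driver can run the crawl and the keyword pass in sequence. The sketch below makes two assumptions: the module names `crawler` and `keyword_extractor` are hypothetical, and the debug lines `print(yesterday < "06/02 16:35")` / `exit()` inside the scrape loop are removed first, since as committed `exit()` ends the process after the first scraped post (and without it `swt` is never set back to False, so the date check only breaks the inner for loop):

```python
# Hypothetical driver script: module names are assumptions, not part of the commit.
from crawler import GetData
from keyword_extractor import GetKeywords

if __name__ == '__main__':
    GetData()                  # log in, scrape the board, write data.csv / data.json
    keywords = GetKeywords()   # re-read data.csv and rank keywords with TextRank
    print(keywords)
```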