박은주

Fix: full-text crawling

Instead of saving the preview text from the board list, each post is now opened by clicking its title, the full body text is read from the detail page, and deduplicated title/content pairs are collected in a pandas DataFrame and exported to CSV and JSON.

@@ -13,4 +13,10 @@ chromedriver.exe
 /.idea/
 *.iml
 *.csv
-*.xml
\ No newline at end of file
+*.xml
+*.json
+*.png
+*.org
+
+/KoreanSentimentAnalyzer/.gitignore
+/KoreanSentimentAnalyzer/.git/
\ No newline at end of file
@@ -2,19 +2,25 @@ import csv
 import time
 import random
 import os
+import re
+import pandas as pd
 
 from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
-from datetime import datetime, timedelta\
+from datetime import datetime, timedelta
 
 def sleeptime():
-    rand = random.uniform(1,3)
-    time.sleep(rand)
+    time.sleep(random.randint(1, 3))
 
+def Click(xpath):
+    element = driver.find_element_by_xpath(xpath)
+    driver.execute_script("arguments[0].click();", element)
+    sleeptime()
 
 login_info = {
-    'userID' : 'id',
-    'userpw' : 'passwd'
+    'userID' : 'qdw0313',
+    'userpw' : 'fejUfrQxHWwtcGcP0'
 }
 
 options = webdriver.ChromeOptions()
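The Click() helper added above performs clicks through execute_script, which avoids "element not clickable" errors when a link is covered by another element, but it is built on Selenium 3's find_element_by_xpath, which was removed in Selenium 4. A minimal sketch of the same idea on the Selenium 4 API, using an explicit wait in place of the fixed sleep — the 10-second timeout and the explicit driver parameter are assumptions, not part of this commit:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def click_xpath(driver, xpath, timeout=10):
        # Wait until the element exists in the DOM, then click it with
        # JavaScript, as the Click() helper in this commit does.
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        driver.execute_script("arguments[0].click();", element)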
@@ -41,44 +47,57 @@ sleeptime()
 
 # 국제캠 자게 (international-campus free board)
 sleeptime()
-posts = []
 yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+print(yesterday)
 swt = True
-page = 1
+page = 0
 
+post_df = pd.DataFrame(columns=['title', 'content'])
 while swt:
-    if not posts:
-        driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click()
+    if page < 1:
+        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
+        page += 1
     else:
         if page == 1:
-            driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click()
+            Click('//*[@id="container"]/div[2]/div[2]/a')
             page += 1
         elif page == 2:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
             page += 1
         else:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
 
-    sleeptime()
     html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
 
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    ContentList = soup.select('#container > div.wrap.articles > article > a > p')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
+    # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
+
+    # idx = 1
+    # for post in zip(TitleList, ContentList, DateList):
+    for post in zip(TitleList, DateList):
+        title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
+        try:
+            Click("//h2[contains(text(), '{}')]".format(title))
+        except NoSuchElementException:
+            continue
+        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+        driver.back()
+        sleeptime()
 
-    for post in zip(TitleList, ContentList, DateList):
-        posts.append([post[0].text, post[1].text])
-        if post[2].text == yesterday:
+        if not (post_df['title'] == title).any():
+            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+            content = re.sub(pattern='[^\w\s]', repl='', string=content)
+            post_df = post_df.append(pd.DataFrame([[title, content]],
+                                                  columns=['title', 'content']))
+            # print("{0}. {1} : {2}".format(idx, title, content))
+        print(post[1].text)
+        if post[1].text < yesterday:
             swt = False
             break
 
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
-    writer = csv.writer(file)
-    for idx in range(len(posts)):
-        writer.writerow(posts[idx])
\ No newline at end of file
+post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
+print("CSV file saved")
+post_df.to_json('data.json', orient='records', force_ascii=False)
+print("JSON file saved")
\ No newline at end of file
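A caveat on the loop's stop condition above: post[1].text < yesterday compares 'MM/DD HH:MM' stamps as raw strings, and lexicographic order matches chronological order only while both stamps fall in the same year and the board renders a full date rather than a bare time. A sketch of an explicit comparison — the helper name and the assumption that the missing year is the current one are mine, not the commit's:

    from datetime import datetime

    def is_older(stamp, cutoff, fmt='%m/%d %H:%M'):
        # Parse the year-less 'MM/DD HH:MM' stamps (assuming the current
        # year) and compare datetimes instead of raw strings.
        year = datetime.today().year
        posted = datetime.strptime(stamp, fmt).replace(year=year)
        return posted < datetime.strptime(cutoff, fmt).replace(year=year)

Separately, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; on current pandas the per-post rows would need to be collected in a plain list and assembled once at the end, e.g. pd.DataFrame(rows, columns=['title', 'content']).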