박은주

Fix: full-text crawling

@@ -13,4 +13,10 @@ chromedriver.exe
 /.idea/
 *.iml
 *.csv
-*.xml
\ No newline at end of file
+*.xml
+*.json
+*.png
+*.org
+/KoreanSentimentAnalyzer/.gitignore
+/KoreanSentimentAnalyzer/.git/
\ No newline at end of file
@@ -2,19 +2,25 @@ import csv
 import time
 import random
 import os
+import re
+import pandas as pd
 from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
-from datetime import datetime, timedelta\
+from datetime import datetime, timedelta
 
 def sleeptime():
-    rand = random.uniform(1,3)
-    time.sleep(rand)
+    time.sleep(random.randint(1, 3))
+
+def Click(xpath):
+    element = driver.find_element_by_xpath(xpath)
+    driver.execute_script("arguments[0].click();", element)
+    sleeptime()
 
 login_info = {
-    'userID' : 'id',
-    'userpw' : 'passwd'
+    'userID' : 'qdw0313',
+    'userpw' : 'fejUfrQxHWwtcGcP0'
 }
 
 options = webdriver.ChromeOptions()
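
Note on the new Click() helper: it dispatches the click through driver.execute_script rather than element.click(), which avoids ElementClickInterceptedException when a banner or overlay covers the target. The helper uses the Selenium 3 find_element_by_xpath API that the rest of this commit relies on; on Selenium 4, where the find_element_by_* shortcuts were removed, the same pattern would look like this sketch (the click function name and type hints are illustrative, not part of the commit):

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    def click(driver: webdriver.Chrome, xpath: str) -> None:
        # Locate the element, then fire the click from JavaScript so an
        # overlapping element cannot intercept it.
        element = driver.find_element(By.XPATH, xpath)
        driver.execute_script("arguments[0].click();", element)
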
@@ -41,44 +47,57 @@ sleeptime()
 # International campus free board
 sleeptime()
-posts = []
 yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
 print(yesterday)
 swt = True
-page = 1
+page = 0
+post_df = pd.DataFrame(columns=['title', 'content'])
 while swt:
-    if not posts:
-        driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click()
+    if page < 1:
+        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
         page += 1
     else:
         if page == 1:
-            driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click()
+            Click('//*[@id="container"]/div[2]/div[2]/a')
             page += 1
         elif page == 2:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
             page += 1
         else:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
     sleeptime()
     html = driver.page_source
     soup = BeautifulSoup(html, 'html.parser')
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    ContentList = soup.select('#container > div.wrap.articles > article > a > p')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
+    # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
+    # idx = 1
+    # for post in zip(TitleList, ContentList, DateList):
+    for post in zip(TitleList, DateList):
+        title = re.sub(pattern=r'[^\w\s]', repl='', string=post[0].text)
+        try:
+            Click("//h2[contains(text(), '{}')]".format(title))
+        except NoSuchElementException:
+            continue
+        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+        driver.back()
+        sleeptime()
-    for post in zip(TitleList, ContentList, DateList):
-        posts.append([post[0].text, post[1].text])
-        if post[2].text == yesterday:
+        if not (post_df['title'] == title).any():
+            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+            content = re.sub(pattern=r'[^\w\s]', repl='', string=content)
+            post_df = post_df.append(pd.DataFrame([[title, content]],
+                                                  columns=['title', 'content']))
+            # print("{0}. {1} : {2}".format(idx, title, content))
+        print(post[1].text)
+        if post[1].text < yesterday:
             swt = False
             break
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
-    writer = csv.writer(file)
-    for idx in range(len(posts)):
-        writer.writerow(posts[idx])
\ No newline at end of file
+post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
+print("CSV file saved")
+post_df.to_json('data.json', orient='records', force_ascii=False)
+print("JSON file saved")
\ No newline at end of file
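
Note on the save step: this works on the pandas of the time, but DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and DataFrame.to_json takes no encoding parameter (it always writes UTF-8; force_ascii=False is what keeps Hangul readable). A minimal sketch of the same accumulate-and-save flow on current pandas, with placeholder row values:

    import pandas as pd

    rows = []  # collect [title, content] pairs inside the crawl loop
    rows.append(["example title", "example content"])  # placeholder values

    # Build the frame once at the end instead of appending a one-row
    # frame per post, which is also much faster.
    post_df = pd.DataFrame(rows, columns=["title", "content"])
    post_df.to_csv("data.csv", mode="w", encoding="utf-8-sig", index=False)
    post_df.to_json("data.json", orient="records", force_ascii=False)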