수정: 전문 크롤링

박은주
Commit 0bf5691f99c0245be44981583820c032e1767d8f 0bf5691f 1 parent b29f9e71
Showing 2 changed files with 52 additions and 27 deletions
.gitignore
content.py
--- a/.gitignore
View file @0bf5691
+++ b/.gitignore
View file @0bf5691
@@ -13,4 +13,10 @@ chromedriver.exe
 /.idea/
 *.iml
 *.csv
- *.xml
\ No newline at end of file
+ *.xml
+ *.json
+ *.png
+ *.org
+ 
+ /KoreanSentimentAnalyzer/.gitignore
+ /KoreanSentimentAnalyzer/.git/
\ No newline at end of file
--- a/content.py
View file @0bf5691
+++ b/content.py
View file @0bf5691
@@ -2,19 +2,25 @@ import csv
 import time
 import random
 import os
+ import re
+ import pandas as pd
 
 from selenium import webdriver
+ from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
- from datetime import datetime, timedelta\
+ from datetime import datetime, timedelta
 
 def sleeptime():
-     rand = random.uniform(1,3)
-     time.sleep(rand)
+     # Pause for a random whole number of seconds (1, 2 or 3); randint includes
+     # both endpoints, unlike the fractional delay the removed lines produced.
+     time.sleep(random.randint(1, 3))
 
+ def Click(xpath):
+     """Click the element matched by `xpath` through JavaScript, then pause.
+ 
+     The element is looked up on the global `driver`, the click is dispatched
+     with `execute_script` instead of the element's own `.click()`, and
+     `sleeptime()` is called afterwards.
+ 
+     Exceptions raised by the element lookup are not handled here and
+     propagate to the caller.
+     """
+     element = driver.find_element_by_xpath(xpath)
+     driver.execute_script("arguments[0].click();", element)
+     sleeptime()
 
 login_info = {
-     'userID' : 'id',
-     'userpw' : 'passwd'
+     # NOTE(review): the placeholder values were replaced with what look like
+     # real account credentials, committed in plain text - confirm, and if so
+     # rotate them and load them from the environment or an untracked file.
+     'userID' : 'qdw0313',
+     'userpw' : 'fejUfrQxHWwtcGcP0'
 }
 
 options = webdriver.ChromeOptions()
@@ -41,44 +47,57 @@ sleeptime()
 
 # 국제캠 자게
 sleeptime()
- posts = []
 yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+ print(yesterday)
 swt = True
- page = 1
+ page = 0
 
+ # Accumulates one row per distinct post title; filled by the loop below.
+ post_df = pd.DataFrame(columns=['title', 'content'])
+ # Each pass clicks one navigation link, parses the resulting listing and opens
+ # every listed post to read its body. The loop stops once a listed timestamp
+ # compares lower than `yesterday`.
 while swt:
-     if not posts:
-         driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click()
+     # First pass (page == 0): enter through the submenu link.
+     if page < 1:
+         Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
+         page += 1
     else:
+         # Later passes click a link under container/div[2]/div[2]; which <a>
+         # is used depends on the pass number (presumably the pager's "next"
+         # link shifts as more pager links appear - TODO confirm on the page).
         if page == 1:
-             driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click()
+             Click('//*[@id="container"]/div[2]/div[2]/a')
             page += 1
         elif page == 2:
-             element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]')
-             driver.execute_script("arguments[0].click();", element)
-             sleeptime()
+             Click('//*[@id="container"]/div[2]/div[2]/a[2]')
             page += 1
         else:
-             element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]')
-             driver.execute_script("arguments[0].click();", element)
-             sleeptime()
+             Click('//*[@id="container"]/div[2]/div[2]/a[3]')
 
-     sleeptime()
     html = driver.page_source
     soup = BeautifulSoup(html, 'html.parser')
 
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-     ContentList = soup.select('#container > div.wrap.articles > article > a > p')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
+     # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
+ 
+     # idx = 1
+     # for post in zip(TitleList, ContentList, DateList):
+     for post in zip(TitleList, DateList):
+         # Raw string so \w and \s reach the regex engine without triggering
+         # Python's invalid-escape-sequence warning. The pattern strips every
+         # character that is neither a word character nor whitespace.
+         title = re.sub(pattern=r'[^\w\s]', repl='', string=post[0].text)
+         # NOTE(review): the XPath is built from the stripped title, so a title
+         # that contained punctuation presumably no longer matches its <h2>
+         # and the post is skipped by the `continue` below - confirm intent.
+         try:
+             Click("//h2[contains(text(), '{}')]".format(title))
+         except NoSuchElementException:
+             continue
+         content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+         driver.back()
+         sleeptime()
 
-     for post in zip(TitleList, ContentList, DateList):
-         posts.append([post[0].text, post[1].text])
-         if post[2].text == yesterday:
+         # Store the post only if its title has not been recorded yet.
+         if not (post_df['title'] == title).any():
+             # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+             content = re.sub(pattern=r'[^\w\s]', repl='', string=content)
+             # DataFrame.append was removed in pandas 2.0; pd.concat performs
+             # the same row-wise concatenation (index labels are kept as-is).
+             post_df = pd.concat([post_df, pd.DataFrame([[title, content]],
+                                                        columns=['title', 'content'])])
+             # print("{0}. {1} : {2}".format(idx, title, content))
+         print(post[1].text)
+         # Plain string comparison against `yesterday`; this assumes the listing
+         # timestamp uses the same zero-padded text format - TODO confirm.
+         if post[1].text < yesterday:
             swt = False
             break
 
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
- with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
-     writer = csv.writer(file)
-     for idx in range(len(posts)):
-         writer.writerow(posts[idx])
\ No newline at end of file
+ # Write the collected posts as CSV; utf-8-sig adds a BOM to the file.
+ post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
+ print("CSV file saved")
+ # DataFrame.to_json() accepts no `encoding` argument (passing one raises
+ # TypeError), so the file is opened explicitly and the handle is passed in.
+ # force_ascii=False keeps non-ASCII text readable instead of \u-escaped, and
+ # plain UTF-8 is used because a BOM is not valid at the start of JSON text.
+ with open('data.json', 'w', encoding='utf-8') as json_file:
+     post_df.to_json(json_file, orient='records', force_ascii=False)
+ print("JSON file saved")
\ No newline at end of file