수정: 전문 크롤링

박은주
Commit 0bf5691f99c0245be44981583820c032e1767d8f 0bf5691f 1 parent b29f9e71
Showing 2 changed files with 52 additions and 27 deletions
.gitignore
content.py
--- a/.gitignore
View file @0bf5691
+++ b/.gitignore
View file @0bf5691
@@ -13,4 +13,10 @@ chromedriver.exe
 /.idea/
 *.iml
 *.csv
-*.xml
\ No newline at end of file
+*.xml
+*.json
+*.png
+*.org
+
+/KoreanSentimentAnalyzer/.gitignore
+/KoreanSentimentAnalyzer/.git/
\ No newline at end of file
--- a/content.py
View file @0bf5691
+++ b/content.py
View file @0bf5691
@@ -2,19 +2,25 @@ import csv
 import time
 import random
 import os
+import re
+import pandas as pd
 from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
-from datetime import datetime, timedelta\
+from datetime import datetime, timedelta
 def sleeptime():
-    rand = random.uniform(1,3)
+    time.sleep(random.randint(1, 3))
-    time.sleep(rand)
+def Click(xpath):
+    element = driver.find_element_by_xpath(xpath)
+    driver.execute_script("arguments[0].click();", element)
+    sleeptime()
 login_info = {
-    'userID' : 'id',
+    'userID' : os.environ['CRAWLER_USER_ID'],  # never commit real credentials
-    'userpw' : 'passwd'
+    'userpw' : os.environ['CRAWLER_USER_PW']  # read from the environment instead
 }
 options = webdriver.ChromeOptions()
@@ -41,44 +47,57 @@ sleeptime()
 # 국제캠 자게
 sleeptime()
-posts = []
 yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+print(yesterday)
 swt = True
-page = 1
+page = 0
+post_df = pd.DataFrame(columns=['title', 'content'])
 while swt:
-    if not posts:
+    if page < 1:
-        driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click()
+        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
+        page += 1
     else:
         if page == 1:
-            driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click()
+            Click('//*[@id="container"]/div[2]/div[2]/a')
             page += 1
         elif page == 2:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]')
+            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
             page += 1
         else:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]')
+            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
-    sleeptime()
     html = driver.page_source
     soup = BeautifulSoup(html, 'html.parser')
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    ContentList = soup.select('#container > div.wrap.articles > article > a > p')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
+    # ContentList = soup.select('#container > div.wrap.articles > article > a > p')
+
+    # idx = 1
+    # for post in zip(TitleList, ContentList, DateList):
+    for post in zip(TitleList, DateList):
+        title = re.sub(pattern=r'[^\w\s]', repl='', string=post[0].text)
+        try:
+            Click("//h2[contains(text(), '{}')]".format(title))
+        except NoSuchElementException:
+            continue
+        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+        driver.back()
+        sleeptime()
-    for post in zip(TitleList, ContentList, DateList):
+        if not (post_df['title'] == title).any():
-        posts.append([post[0].text, post[1].text])
+            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
-        if post[2].text == yesterday:
+            content = re.sub(pattern=r'[^\w\s]', repl='', string=content)
+            post_df = pd.concat([post_df, pd.DataFrame([[title, content]],  # DataFrame.append was removed in pandas 2.0
+                                                       columns=['title', 'content'])])
+            # print("{0}. {1} : {2}".format(idx, title, content))
+        print(post[1].text)
+        if post[1].text < yesterday:
             swt = False
             break
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
-    writer = csv.writer(file)
-    for idx in range(len(posts)):
-        writer.writerow(posts[idx])
\ No newline at end of file
+post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
+print("CSV file saved")
+post_df.to_json('data.json', orient='records', force_ascii=False)  # to_json has no encoding kwarg; keep non-ASCII text as UTF-8
+print("JSON file saved")
\ No newline at end of file