박은주

Add file via upload

Showing 1 changed file with 29 additions and 20 deletions
@@ -2,6 +2,7 @@ import csv
 import time
 import random
 import os
+import pandas as pd
 
 from selenium import webdriver
 from bs4 import BeautifulSoup
@@ -11,14 +12,18 @@ def sleeptime():
     rand = random.uniform(1,3)
     time.sleep(rand)
 
+def Click(xpath):
+    element = driver.find_element_by_xpath(xpath)
+    driver.execute_script("arguments[0].click();", element)
+    sleeptime()
 
 login_info = {
     'userID' : 'id',
-    'userpw' : 'passwd'
+    'userpw' : 'pw'
 }
 
 options = webdriver.ChromeOptions()
-options.add_argument('headless')
+# options.add_argument('headless')
 options.add_argument('no-sandbox')
 options.add_argument('window-size=1920x1080')
 options.add_argument('disable-gpu')
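The new Click helper locates an element by XPath and clicks it through execute_script, which sidesteps elements Selenium refuses to click directly, then sleeps a random interval. A minimal sketch of the same idea using an explicit wait instead of a fixed sleep, assuming Selenium's WebDriverWait and expected_conditions utilities; click_when_ready is a hypothetical name and not part of this commit:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_when_ready(driver, xpath, timeout=10):
    # Wait until the element is actually clickable rather than sleeping,
    # then click through JavaScript exactly as the Click helper above does.
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    driver.execute_script("arguments[0].click();", element)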
@@ -26,7 +31,7 @@ options.add_argument('disable-dev-shm-usage')
 options.add_argument('lang=ko_KR')
 options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
 
-driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options)
+driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
 
 driver.get('about:blank')
 driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
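The driver is still created from an absolute path on one user's desktop. A sketch of a machine-independent setup, reusing the os, webdriver, and options names already defined above, assuming Selenium 4 (where the executable path goes through a Service object; the script here uses the Selenium 3 positional-path call) and a hypothetical CHROMEDRIVER_PATH environment variable:

from selenium.webdriver.chrome.service import Service

# Hypothetical: read the driver location from the environment instead of
# hard-coding a per-user path; fall back to a chromedriver found on PATH.
driver_path = os.environ.get('CHROMEDRIVER_PATH', 'chromedriver')
driver = webdriver.Chrome(service=Service(driver_path), options=options)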
@@ -46,39 +51,43 @@ yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
 swt = True
 page = 1
 
+post_df = pd.DataFrame(columns=['title', 'content'])
 while swt:
     if not posts:
-        driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click()
+        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
     else:
         if page == 1:
-            driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click()
+            Click('//*[@id="container"]/div[2]/div[2]/a')
             page += 1
         elif page == 2:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
             page += 1
         else:
-            element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]')
-            driver.execute_script("arguments[0].click();", element)
-            sleeptime()
 
-    sleeptime()
+            Click('//*[@id="container"]/div[2]/div[2]/a[3]')
 
     html = driver.page_source
     soup = BeautifulSoup(html, 'html.parser')
 
     TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    ContentList = soup.select('#container > div.wrap.articles > article > a > p')
     DateList = soup.select('#container > div.wrap.articles > article > a > time')
+    ContentList = soup.select('#container > div.wrap.articles > article > a > p')
 
+    idx = 1
     for post in zip(TitleList, ContentList, DateList):
-        posts.append([post[0].text, post[1].text])
-        if post[2].text == yesterday:
+        Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+        sleeptime()
+        idx += 1
+
+        post_df = post_df.append(pd.DataFrame([post[0].text, content],
+                                              columns=['title', 'content']))
+        if post[2].text < yesterday:
             swt = False
             break
 
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
-    writer = csv.writer(file)
-    for idx in range(len(posts)):
-        writer.writerow(posts[idx])
\ No newline at end of file
+post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
+# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
+#     writer = csv.writer(file)
+#     for idx in range(len(posts)):
+#         writer.writerow(posts[idx])
\ No newline at end of file
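The new output path grows post_df one row at a time with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0, and the per-row frame is built from a flat list, so the two values land in a single column rather than under the 'title' and 'content' labels (a nested list, [[...]], gives one two-column row). A sketch of the same accumulation using a plain Python list, assuming the driver, the Click helper, and the three soup.select lists from the script above; this is an alternative shape, not the committed code:

rows = []
for idx, post in enumerate(zip(TitleList, ContentList, DateList), start=1):
    # Open the idx-th article and read its full body, as the committed loop does.
    Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
    content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
    rows.append({'title': post[0].text, 'content': content})
    if post[2].text < yesterday:
        swt = False
        break

post_df = pd.DataFrame(rows, columns=['title', 'content'])
post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)

One caveat either way: post[2].text < yesterday compares '%m/%d %H:%M' strings lexicographically, which orders dates correctly only while both fall within the same year.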