Showing 1 changed file with 29 additions and 20 deletions
... | @@ -2,6 +2,7 @@ import csv | ... | @@ -2,6 +2,7 @@ import csv |
2 | import time | 2 | import time |
3 | import random | 3 | import random |
4 | import os | 4 | import os |
5 | +import pandas as pd | ||
5 | 6 | ||
6 | from selenium import webdriver | 7 | from selenium import webdriver |
7 | from bs4 import BeautifulSoup | 8 | from bs4 import BeautifulSoup |
... | @@ -11,14 +12,18 @@ def sleeptime(): | ... | @@ -11,14 +12,18 @@ def sleeptime(): |
11 | rand = random.uniform(1,3) | 12 | rand = random.uniform(1,3) |
12 | time.sleep(rand) | 13 | time.sleep(rand) |
13 | 14 | ||
15 | +def Click(xpath): | ||
16 | + element = driver.find_element_by_xpath(xpath) | ||
17 | + driver.execute_script("arguments[0].click();", element) | ||
18 | + sleeptime() | ||
14 | 19 | ||
15 | login_info = { | 20 | login_info = { |
16 | 'userID' : 'id', | 21 | 'userID' : 'id', |
17 | - 'userpw' : 'passwd' | 22 | + 'userpw' : 'pw' |
18 | } | 23 | } |
19 | 24 | ||
20 | options = webdriver.ChromeOptions() | 25 | options = webdriver.ChromeOptions() |
21 | -options.add_argument('headless') | 26 | +# options.add_argument('headless') |
22 | options.add_argument('no-sandbox') | 27 | options.add_argument('no-sandbox') |
23 | options.add_argument('window-size=1920x1080') | 28 | options.add_argument('window-size=1920x1080') |
24 | options.add_argument('disable-gpu') | 29 | options.add_argument('disable-gpu') |
... | @@ -26,7 +31,7 @@ options.add_argument('disable-dev-shm-usage') | ... | @@ -26,7 +31,7 @@ options.add_argument('disable-dev-shm-usage') |
26 | options.add_argument('lang=ko_KR') | 31 | options.add_argument('lang=ko_KR') |
27 | options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47') | 32 | options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47') |
28 | 33 | ||
29 | -driver = webdriver.Chrome(r'C:\Users\Admin\Desktop\OSS\Todays_Issue\chromedriver.exe', options=options) | 34 | +driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options) |
30 | 35 | ||
31 | driver.get('about:blank') | 36 | driver.get('about:blank') |
32 | driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});") | 37 | driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});") |
... | @@ -46,39 +51,43 @@ yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M') | ... | @@ -46,39 +51,43 @@ yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M') |
46 | swt = True | 51 | swt = True |
47 | page = 1 | 52 | page = 1 |
48 | 53 | ||
54 | +post_df = pd.DataFrame(columns=['title', 'content']) | ||
49 | while swt: | 55 | while swt: |
50 | if not posts: | 56 | if not posts: |
51 | - driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click() | 57 | + Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a') |
52 | else: | 58 | else: |
53 | if page == 1: | 59 | if page == 1: |
54 | - driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click() | 60 | + Click('//*[@id="container"]/div[2]/div[2]/a') |
55 | page += 1 | 61 | page += 1 |
56 | elif page == 2: | 62 | elif page == 2: |
57 | - element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]') | 63 | + Click('//*[@id="container"]/div[2]/div[2]/a[2]') |
58 | - driver.execute_script("arguments[0].click();", element) | ||
59 | - sleeptime() | ||
60 | page += 1 | 64 | page += 1 |
61 | else: | 65 | else: |
62 | - element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]') | 66 | + Click('//*[@id="container"]/div[2]/div[2]/a[3]') |
63 | - driver.execute_script("arguments[0].click();", element) | ||
64 | - sleeptime() | ||
65 | 67 | ||
66 | - sleeptime() | ||
67 | html = driver.page_source | 68 | html = driver.page_source |
68 | soup = BeautifulSoup(html, 'html.parser') | 69 | soup = BeautifulSoup(html, 'html.parser') |
69 | 70 | ||
70 | TitleList = soup.select('#container > div.wrap.articles > article > a > h2') | 71 | TitleList = soup.select('#container > div.wrap.articles > article > a > h2') |
71 | - ContentList = soup.select('#container > div.wrap.articles > article > a > p') | ||
72 | DateList = soup.select('#container > div.wrap.articles > article > a > time') | 72 | DateList = soup.select('#container > div.wrap.articles > article > a > time') |
73 | + ContentList = soup.select('#container > div.wrap.articles > article > a > p') | ||
73 | 74 | ||
75 | + idx = 1 | ||
74 | for post in zip(TitleList, ContentList, DateList): | 76 | for post in zip(TitleList, ContentList, DateList): |
75 | - posts.append([post[0].text, post[1].text]) | 77 | + Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) |
76 | - if post[2].text == yesterday: | 78 | + content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text |
79 | + sleeptime() | ||
80 | + idx += 1 | ||
81 | + | ||
82 | + post_df = post_df.append(pd.DataFrame([post[0].text, content], | ||
83 | + columns=['title', 'content'])) | ||
84 | + if post[2].text < yesterday: | ||
77 | swt = False | 85 | swt = False |
78 | break | 86 | break |
79 | 87 | ||
80 | -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | ||
81 | -with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file: | ||
82 | - writer = csv.writer(file) | ||
83 | - for idx in range(len(posts)): | ||
84 | - writer.writerow(posts[idx]) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
88 | +post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig') | ||
89 | +# BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | ||
90 | +# with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file: | ||
91 | +# writer = csv.writer(file) | ||
92 | +# for idx in range(len(posts)): | ||
93 | +# writer.writerow(posts[idx]) | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment