Showing
2 changed files
with
52 additions
and
27 deletions
... | @@ -13,4 +13,10 @@ chromedriver.exe | ... | @@ -13,4 +13,10 @@ chromedriver.exe |
13 | /.idea/ | 13 | /.idea/ |
14 | *.iml | 14 | *.iml |
15 | *.csv | 15 | *.csv |
16 | -*.xml | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
16 | +*.xml | ||
17 | +*.json | ||
18 | +*.png | ||
19 | +*.org | ||
20 | + | ||
21 | +/KoreanSentimentAnalyzer/.gitignore | ||
22 | +/KoreanSentimentAnalyzer/.git/ | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
... | @@ -2,19 +2,25 @@ import csv | ... | @@ -2,19 +2,25 @@ import csv |
2 | import time | 2 | import time |
3 | import random | 3 | import random |
4 | import os | 4 | import os |
5 | +import re | ||
6 | +import pandas as pd | ||
5 | 7 | ||
6 | from selenium import webdriver | 8 | from selenium import webdriver |
9 | +from selenium.common.exceptions import NoSuchElementException | ||
7 | from bs4 import BeautifulSoup | 10 | from bs4 import BeautifulSoup |
8 | -from datetime import datetime, timedelta\ | 11 | +from datetime import datetime, timedelta |
9 | 12 | ||
10 | def sleeptime(): | 13 | def sleeptime(): |
11 | - rand = random.uniform(1,3) | 14 | + time.sleep(random.randint(1, 3)) |
12 | - time.sleep(rand) | ||
13 | 15 | ||
16 | +def Click(xpath): | ||
17 | + element = driver.find_element_by_xpath(xpath) | ||
18 | + driver.execute_script("arguments[0].click();", element) | ||
19 | + sleeptime() | ||
14 | 20 | ||
15 | login_info = { | 21 | login_info = { |
16 | - 'userID' : 'id', | 22 | + 'userID' : 'qdw0313', |
17 | - 'userpw' : 'passwd' | 23 | + 'userpw' : 'fejUfrQxHWwtcGcP0' |
18 | } | 24 | } |
19 | 25 | ||
20 | options = webdriver.ChromeOptions() | 26 | options = webdriver.ChromeOptions() |
... | @@ -41,44 +47,57 @@ sleeptime() | ... | @@ -41,44 +47,57 @@ sleeptime() |
41 | 47 | ||
42 | # 국제캠 자게 | 48 | # 국제캠 자게 |
43 | sleeptime() | 49 | sleeptime() |
44 | -posts = [] | ||
45 | yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M') | 50 | yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M') |
51 | +print(yesterday) | ||
46 | swt = True | 52 | swt = True |
47 | -page = 1 | 53 | +page = 0 |
48 | 54 | ||
55 | +post_df = pd.DataFrame(columns=['title', 'content']) | ||
49 | while swt: | 56 | while swt: |
50 | - if not posts: | 57 | + if page < 1: |
51 | - driver.find_element_by_xpath('//*[@id="submenu"]/div/div[2]/ul/li[1]/a').click() | 58 | + Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a') |
59 | + page += 1 | ||
52 | else: | 60 | else: |
53 | if page == 1: | 61 | if page == 1: |
54 | - driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a').click() | 62 | + Click('//*[@id="container"]/div[2]/div[2]/a') |
55 | page += 1 | 63 | page += 1 |
56 | elif page == 2: | 64 | elif page == 2: |
57 | - element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[2]') | 65 | + Click('//*[@id="container"]/div[2]/div[2]/a[2]') |
58 | - driver.execute_script("arguments[0].click();", element) | ||
59 | - sleeptime() | ||
60 | page += 1 | 66 | page += 1 |
61 | else: | 67 | else: |
62 | - element = driver.find_element_by_xpath('//*[@id="container"]/div[2]/div[2]/a[3]') | 68 | + Click('//*[@id="container"]/div[2]/div[2]/a[3]') |
63 | - driver.execute_script("arguments[0].click();", element) | ||
64 | - sleeptime() | ||
65 | 69 | ||
66 | - sleeptime() | ||
67 | html = driver.page_source | 70 | html = driver.page_source |
68 | soup = BeautifulSoup(html, 'html.parser') | 71 | soup = BeautifulSoup(html, 'html.parser') |
69 | 72 | ||
70 | TitleList = soup.select('#container > div.wrap.articles > article > a > h2') | 73 | TitleList = soup.select('#container > div.wrap.articles > article > a > h2') |
71 | - ContentList = soup.select('#container > div.wrap.articles > article > a > p') | ||
72 | DateList = soup.select('#container > div.wrap.articles > article > a > time') | 74 | DateList = soup.select('#container > div.wrap.articles > article > a > time') |
75 | + # ContentList = soup.select('#container > div.wrap.articles > article > a > p') | ||
76 | + | ||
77 | + # idx = 1 | ||
78 | + # for post in zip(TitleList, ContentList, DateList): | ||
79 | + for post in zip(TitleList, DateList): | ||
80 | + title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) | ||
81 | + try: | ||
82 | + Click("//h2[contains(text(), '{}')]".format(title)) | ||
83 | + except NoSuchElementException: | ||
84 | + continue | ||
85 | + content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text | ||
86 | + driver.back() | ||
87 | + sleeptime() | ||
73 | 88 | ||
74 | - for post in zip(TitleList, ContentList, DateList): | 89 | + if not (post_df['title'] == title).any(): |
75 | - posts.append([post[0].text, post[1].text]) | 90 | + # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) |
76 | - if post[2].text == yesterday: | 91 | + content = re.sub(pattern='[^\w\s]', repl='', string=content) |
92 | + post_df = post_df.append(pd.DataFrame([[title, content]], | ||
93 | + columns=['title', 'content'])) | ||
94 | + # print("{0}. {1} : {2}".format(idx, title, content)) | ||
95 | + print(post[1].text) | ||
96 | + if post[1].text < yesterday: | ||
77 | swt = False | 97 | swt = False |
78 | break | 98 | break |
79 | 99 | ||
80 | -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | ||
81 | -with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file: | ||
82 | - writer = csv.writer(file) | ||
83 | - for idx in range(len(posts)): | ||
84 | - writer.writerow(posts[idx]) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
100 | +post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig') | ||
101 | +print("CVS file saved") | ||
102 | +post_df.to_json('data.json', orient='records', encoding='utf-8-sig') | ||
103 | +print("JSON file saved") | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or log in to post a comment