Showing
1 changed file
with
27 additions
and
13 deletions
1 | import csv | 1 | import csv |
2 | +import json | ||
2 | import time | 3 | import time |
3 | import random | 4 | import random |
4 | import os | 5 | import os |
5 | import re | 6 | import re |
6 | import pandas as pd | 7 | import pandas as pd |
8 | +import numpy as np | ||
7 | 9 | ||
8 | from selenium import webdriver | 10 | from selenium import webdriver |
9 | from selenium.common.exceptions import NoSuchElementException | 11 | from selenium.common.exceptions import NoSuchElementException |
... | @@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__)) | ... | @@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__)) |
15 | def sleeptime(): | 17 | def sleeptime(): |
16 | time.sleep(random.randint(1, 3)) | 18 | time.sleep(random.randint(1, 3)) |
17 | 19 | ||
18 | -def Click(xpath): | 20 | +def Click(xpath, driver): |
19 | element = driver.find_element_by_xpath(xpath) | 21 | element = driver.find_element_by_xpath(xpath) |
20 | driver.execute_script("arguments[0].click();", element) | 22 | driver.execute_script("arguments[0].click();", element) |
21 | sleeptime() | 23 | sleeptime() |
22 | 24 | ||
23 | def GetData(): | 25 | def GetData(): |
24 | login_info = { | 26 | login_info = { |
25 | - 'userID' : 'qdw0313', | 27 | + 'userID' : '********', |
26 | - 'userpw' : 'fejUfrQxHWwtcGcP0' | 28 | + 'userpw' : '********' |
27 | } | 29 | } |
28 | 30 | ||
29 | options = webdriver.ChromeOptions() | 31 | options = webdriver.ChromeOptions() |
... | @@ -59,17 +61,17 @@ def GetData(): | ... | @@ -59,17 +61,17 @@ def GetData(): |
59 | post_df = pd.DataFrame(columns=['title', 'content']) | 61 | post_df = pd.DataFrame(columns=['title', 'content']) |
60 | while swt: | 62 | while swt: |
61 | if page < 1: | 63 | if page < 1: |
62 | - Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a') | 64 | + Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a', driver) |
63 | page += 1 | 65 | page += 1 |
64 | else: | 66 | else: |
65 | if page == 1: | 67 | if page == 1: |
66 | - Click('//*[@id="container"]/div[2]/div[2]/a') | 68 | + Click('//*[@id="container"]/div[2]/div[2]/a', driver) |
67 | page += 1 | 69 | page += 1 |
68 | elif page == 2: | 70 | elif page == 2: |
69 | - Click('//*[@id="container"]/div[2]/div[2]/a[2]') | 71 | + Click('//*[@id="container"]/div[2]/div[2]/a[2]', driver) |
70 | page += 1 | 72 | page += 1 |
71 | else: | 73 | else: |
72 | - Click('//*[@id="container"]/div[2]/div[2]/a[3]') | 74 | + Click('//*[@id="container"]/div[2]/div[2]/a[3]', driver) |
73 | 75 | ||
74 | html = driver.page_source | 76 | html = driver.page_source |
75 | soup = BeautifulSoup(html, 'html.parser') | 77 | soup = BeautifulSoup(html, 'html.parser') |
... | @@ -80,7 +82,7 @@ def GetData(): | ... | @@ -80,7 +82,7 @@ def GetData(): |
80 | for post in zip(TitleList, DateList): | 82 | for post in zip(TitleList, DateList): |
81 | title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) | 83 | title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) |
82 | try: | 84 | try: |
83 | - Click("//h2[contains(text(), '{}')]".format(title)) | 85 | + Click("//h2[contains(text(), '{}')]".format(title), driver) |
84 | except NoSuchElementException: | 86 | except NoSuchElementException: |
85 | continue | 87 | continue |
86 | content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text | 88 | content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text |
... | @@ -90,17 +92,29 @@ def GetData(): | ... | @@ -90,17 +92,29 @@ def GetData(): |
90 | if not (post_df['title'] == title).any(): | 92 | if not (post_df['title'] == title).any(): |
91 | # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) | 93 | # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) |
92 | content = re.sub(pattern='[^\w\s]', repl='', string=content) | 94 | content = re.sub(pattern='[^\w\s]', repl='', string=content) |
95 | + content = re.sub(pattern='\n', repl=' ', string=content) | ||
93 | post_df = post_df.append(pd.DataFrame([[title, content]], | 96 | post_df = post_df.append(pd.DataFrame([[title, content]], |
94 | columns=['title', 'content'])) | 97 | columns=['title', 'content'])) |
95 | # print("{0}. {1} : {2}".format(idx, title, content)) | 98 | # print("{0}. {1} : {2}".format(idx, title, content)) |
96 | - print(post[1].text) | ||
97 | - print(yesterday < "06/02 16:35") | ||
98 | - exit() | ||
99 | if post[1].text <= yesterday: | 99 | if post[1].text <= yesterday: |
100 | break | 100 | break |
101 | + break | ||
101 | 102 | ||
102 | post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False) | 103 | post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False) |
103 | print("CVS file saved") | 104 | print("CVS file saved") |
104 | - with open('data.json', 'w', encoding='utf-8') as file: | 105 | + # print(post_df) |
105 | - post_df.to_json(file, force_ascii=False) | 106 | + # exit() |
107 | + # post_df.reset_index(drop=True, inplace=True) | ||
108 | + # post_df.to_json('data.json') | ||
109 | + # # with open('data.json', 'w', encoding='utf-8-sig') as file: | ||
110 | + # # post_df.to_json(file, force_ascii=False) | ||
111 | + | ||
112 | + with open('data.json', 'w+', encoding='utf-8-sig') as json_file: | ||
113 | + for post in zip(post_df['title'].tolist(), post_df['content'].tolist()): | ||
114 | + json.dump(post[0] + post[1], json_file, ensure_ascii=False) | ||
106 | print("JSON file saved") | 115 | print("JSON file saved") |
116 | + | ||
117 | +GetData() | ||
118 | +######## TODO: JSON으로 저장 | ||
119 | +######## 형식 : { "document" : { "type" : "PLAIN_TEXT", "content" : "~~" }, "encodingType" : "UTF8" } | ||
120 | +######## GOOGLE Sentiment Analyzer 사용을 위해 | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
-
Please register or login to post a comment