Showing
1 changed file
with
23 additions
and
6 deletions
... | @@ -9,6 +9,7 @@ import numpy as np | ... | @@ -9,6 +9,7 @@ import numpy as np |
9 | 9 | ||
10 | from selenium import webdriver | 10 | from selenium import webdriver |
11 | from selenium.common.exceptions import NoSuchElementException | 11 | from selenium.common.exceptions import NoSuchElementException |
12 | +from hanspell import spell_checker | ||
12 | from bs4 import BeautifulSoup | 13 | from bs4 import BeautifulSoup |
13 | from datetime import datetime, timedelta | 14 | from datetime import datetime, timedelta |
14 | 15 | ||
... | @@ -22,10 +23,19 @@ def Click(xpath, driver): | ... | @@ -22,10 +23,19 @@ def Click(xpath, driver): |
22 | driver.execute_script("arguments[0].click();", element) | 23 | driver.execute_script("arguments[0].click();", element) |
23 | sleeptime() | 24 | sleeptime() |
24 | 25 | ||
# Maximum characters sent to hanspell in one request.
# NOTE(review): py-hanspell documents a 500-character limit per check() call;
# confirm against the installed version.
_HANSPELL_MAX_CHARS = 500


def _split_for_spellcheck(text, limit):
    """Split *text* into pieces of at most *limit* characters.

    Pieces are cut at the last space that fits so words stay intact; a run
    longer than *limit* with no space is hard-cut. Text that already fits is
    returned as a single, unmodified piece.
    """
    pieces = []
    remaining = text
    while len(remaining) > limit:
        cut = remaining.rfind(' ', 0, limit + 1)
        if cut <= 0:
            # No usable space inside the window: fall back to a hard cut.
            cut = limit
        pieces.append(remaining[:cut])
        remaining = remaining[cut:].lstrip(' ')
    if remaining:
        pieces.append(remaining)
    return pieces


def TextPreprocess(text):
    """Clean scraped text and run it through the hanspell spell checker.

    Steps:
      1. Remove every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols).
      2. Replace each newline with '. ' so line breaks become sentence breaks.
      3. Spell-check the result with ``spell_checker.check`` and return the
         corrected text.

    Args:
        text: Raw string (post title or body).

    Returns:
        The cleaned, spell-corrected string. If the cleaned text is empty or
        whitespace-only it is returned as-is without contacting the spell
        checker. If the checker reports failure for a piece, the cleaned
        (uncorrected) piece is kept instead of being dropped.
    """
    # Raw strings: '\w' and '\s' in a plain literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
    text = re.sub(pattern=r'\n', repl='. ', string=text)

    # Nothing to correct; avoid a pointless remote request.
    if not text.strip():
        return text

    checked_parts = []
    # Long bodies are checked piece by piece: an over-limit request would
    # otherwise come back unsuccessful with an empty `checked` field and the
    # whole post content would be lost.
    for piece in _split_for_spellcheck(text, _HANSPELL_MAX_CHARS):
        if not piece.strip():
            continue
        spelled_sent = spell_checker.check(piece)
        checked_parts.append(spelled_sent.checked if spelled_sent.result else piece)

    return ' '.join(checked_parts)
34 | + | ||
25 | def GetData(): | 35 | def GetData(): |
26 | login_info = { | 36 | login_info = { |
27 | - 'userID' : '********', | 37 | + 'userID' : 'qdw0313', |
28 | - 'userpw' : '********' | 38 | + 'userpw' : 'fejUfrQxHWwtcGcP0' |
29 | } | 39 | } |
30 | 40 | ||
31 | options = webdriver.ChromeOptions() | 41 | options = webdriver.ChromeOptions() |
... | @@ -80,7 +90,8 @@ def GetData(): | ... | @@ -80,7 +90,8 @@ def GetData(): |
80 | DateList = soup.select('#container > div.wrap.articles > article > a > time') | 90 | DateList = soup.select('#container > div.wrap.articles > article > a > time') |
81 | 91 | ||
82 | for post in zip(TitleList, DateList): | 92 | for post in zip(TitleList, DateList): |
83 | - title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) | 93 | + title = TextPreprocess(post[0].text) |
94 | + | ||
84 | try: | 95 | try: |
85 | Click("//h2[contains(text(), '{}')]".format(title), driver) | 96 | Click("//h2[contains(text(), '{}')]".format(title), driver) |
86 | except NoSuchElementException: | 97 | except NoSuchElementException: |
... | @@ -91,8 +102,7 @@ def GetData(): | ... | @@ -91,8 +102,7 @@ def GetData(): |
91 | 102 | ||
92 | if not (post_df['title'] == title).any(): | 103 | if not (post_df['title'] == title).any(): |
93 | # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) | 104 | # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) |
94 | - content = re.sub(pattern='[^\w\s]', repl='', string=content) | 105 | + content = TextPreprocess(content) |
95 | - content = re.sub(pattern='\n', repl=' ', string=content) | ||
96 | post_df = post_df.append(pd.DataFrame([[title, content]], | 106 | post_df = post_df.append(pd.DataFrame([[title, content]], |
97 | columns=['title', 'content'])) | 107 | columns=['title', 'content'])) |
98 | # print("{0}. {1} : {2}".format(idx, title, content)) | 108 | # print("{0}. {1} : {2}".format(idx, title, content)) |
... | @@ -111,7 +121,14 @@ def GetData(): | ... | @@ -111,7 +121,14 @@ def GetData(): |
111 | 121 | ||
112 | with open('data.json', 'w+', encoding='utf-8-sig') as json_file: | 122 | with open('data.json', 'w+', encoding='utf-8-sig') as json_file: |
113 | for post in zip(post_df['title'].tolist(), post_df['content'].tolist()): | 123 | for post in zip(post_df['title'].tolist(), post_df['content'].tolist()): |
114 | - json.dump(post[0] + post[1], json_file, ensure_ascii=False) | 124 | + json.dump({ |
125 | + "document" : | ||
126 | + { | ||
127 | + "type" : "PLAIN_TEXT", | ||
128 | + "content" : post[0] + post[1] | ||
129 | + }, | ||
130 | + "encodingType" : "UTF8" | ||
131 | + }, json_file, ensure_ascii=False) | ||
115 | print("JSON file saved") | 132 | print("JSON file saved") |
116 | 133 | ||
117 | GetData() | 134 | GetData() | ... | ... |
-
Please register or login to post a comment