박은주

Make Func

...@@ -9,6 +9,7 @@ import numpy as np ...@@ -9,6 +9,7 @@ import numpy as np
9 9
10 from selenium import webdriver 10 from selenium import webdriver
11 from selenium.common.exceptions import NoSuchElementException 11 from selenium.common.exceptions import NoSuchElementException
12 +from hanspell import spell_checker
12 from bs4 import BeautifulSoup 13 from bs4 import BeautifulSoup
13 from datetime import datetime, timedelta 14 from datetime import datetime, timedelta
14 15
...@@ -22,10 +23,19 @@ def Click(xpath, driver): ...@@ -22,10 +23,19 @@ def Click(xpath, driver):
22 driver.execute_script("arguments[0].click();", element) 23 driver.execute_script("arguments[0].click();", element)
23 sleeptime() 24 sleeptime()
24 25
26 +def TextPreprocess(text):
27 + text = re.sub(pattern='[^\w\s]', repl='', string=text)
28 + text = re.sub(pattern='\n', repl='. ', string=text)
29 +
30 + spelled_sent = spell_checker.check(text)
31 + text = spelled_sent.checked
32 +
33 + return text
34 +
25 def GetData(): 35 def GetData():
26 login_info = { 36 login_info = {
27 - 'userID' : '********', 37 + 'userID' : '<REDACTED — real username removed; never commit credentials>',
28 - 'userpw' : '********' 38 + 'userpw' : '<REDACTED — real password removed; load from an environment variable or secret store, and rotate this credential since it was exposed>'
29 } 39 }
30 40
31 options = webdriver.ChromeOptions() 41 options = webdriver.ChromeOptions()
...@@ -80,7 +90,8 @@ def GetData(): ...@@ -80,7 +90,8 @@ def GetData():
80 DateList = soup.select('#container > div.wrap.articles > article > a > time') 90 DateList = soup.select('#container > div.wrap.articles > article > a > time')
81 91
82 for post in zip(TitleList, DateList): 92 for post in zip(TitleList, DateList):
83 - title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) 93 + title = TextPreprocess(post[0].text)
94 +
84 try: 95 try:
85 Click("//h2[contains(text(), '{}')]".format(title), driver) 96 Click("//h2[contains(text(), '{}')]".format(title), driver)
86 except NoSuchElementException: 97 except NoSuchElementException:
...@@ -91,8 +102,7 @@ def GetData(): ...@@ -91,8 +102,7 @@ def GetData():
91 102
92 if not (post_df['title'] == title).any(): 103 if not (post_df['title'] == title).any():
93 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) 104 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
94 - content = re.sub(pattern='[^\w\s]', repl='', string=content) 105 + content = TextPreprocess(content)
95 - content = re.sub(pattern='\n', repl=' ', string=content)
96 post_df = post_df.append(pd.DataFrame([[title, content]], 106 post_df = post_df.append(pd.DataFrame([[title, content]],
97 columns=['title', 'content'])) 107 columns=['title', 'content']))
98 # print("{0}. {1} : {2}".format(idx, title, content)) 108 # print("{0}. {1} : {2}".format(idx, title, content))
...@@ -111,7 +121,14 @@ def GetData(): ...@@ -111,7 +121,14 @@ def GetData():
111 121
112 with open('data.json', 'w+', encoding='utf-8-sig') as json_file: 122 with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
113 for post in zip(post_df['title'].tolist(), post_df['content'].tolist()): 123 for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
114 - json.dump(post[0] + post[1], json_file, ensure_ascii=False) 124 + json.dump({
125 + "document" :
126 + {
127 + "type" : "PLAIN_TEXT",
128 + "content" : post[0] + post[1]
129 + },
130 + "encodingType" : "UTF8"
131 + }, json_file, ensure_ascii=False)
115 print("JSON file saved") 132 print("JSON file saved")
116 133
117 GetData() 134 GetData()
......