박은주

Refactor: extract TextPreprocess helper and emit Natural Language API document JSON

......@@ -9,6 +9,7 @@ import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from hanspell import spell_checker
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
......@@ -22,10 +23,19 @@ def Click(xpath, driver):
driver.execute_script("arguments[0].click();", element)
sleeptime()
def TextPreprocess(text):
    """Normalize a Korean text snippet for downstream analysis.

    Strips every character that is not a word character or whitespace,
    replaces newlines with '. ' so separate lines remain distinct
    sentences, then runs the result through hanspell's spell checker.

    Args:
        text: Raw input string (post title or body).

    Returns:
        The cleaned, spell-checked string (hanspell's ``.checked`` value).
    """
    # Raw strings for regex patterns: '\w' in a non-raw string is an
    # invalid string escape and warns on modern CPython.
    text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
    text = re.sub(pattern=r'\n', repl='. ', string=text)
    # NOTE(review): spell_checker.check performs a network call to the
    # Naver spell-check service — this can be slow and may fail offline.
    spelled_sent = spell_checker.check(text)
    return spelled_sent.checked
def GetData():
login_info = {
'userID' : '********',
'userpw' : '********'
'userID' : 'qdw0313',
'userpw' : 'fejUfrQxHWwtcGcP0'
}
options = webdriver.ChromeOptions()
......@@ -80,7 +90,8 @@ def GetData():
DateList = soup.select('#container > div.wrap.articles > article > a > time')
for post in zip(TitleList, DateList):
title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
title = TextPreprocess(post[0].text)
try:
Click("//h2[contains(text(), '{}')]".format(title), driver)
except NoSuchElementException:
......@@ -91,8 +102,7 @@ def GetData():
if not (post_df['title'] == title).any():
# Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
content = re.sub(pattern='[^\w\s]', repl='', string=content)
content = re.sub(pattern='\n', repl=' ', string=content)
content = TextPreprocess(content)
post_df = post_df.append(pd.DataFrame([[title, content]],
columns=['title', 'content']))
# print("{0}. {1} : {2}".format(idx, title, content))
......@@ -111,7 +121,14 @@ def GetData():
with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
json.dump(post[0] + post[1], json_file, ensure_ascii=False)
json.dump({
"document" :
{
"type" : "PLAIN_TEXT",
"content" : post[0] + post[1]
},
"encodingType" : "UTF8"
}, json_file, ensure_ascii=False)
print("JSON file saved")
GetData()
......