Make Func

박은주
Commit f0a13f1e569002aa5f9afe243ec5f2e3af3cc9e6 f0a13f1e 1 parent 04981fcd
Showing 1 changed file with 23 additions and 6 deletions
content.py
--- a/content.py
View file @f0a13f1
+++ b/content.py
View file @f0a13f1
@@ -9,6 +9,7 @@ import numpy as np
 
 from selenium import webdriver
 from selenium.common.exceptions import NoSuchElementException
+ from hanspell import spell_checker
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 
@@ -22,10 +23,19 @@ def Click(xpath, driver):
     driver.execute_script("arguments[0].click();", element)
     sleeptime()
 
+ def TextPreprocess(text):
+     """Strip punctuation, flatten newlines, and spell-check ``text``.
+ 
+     Args:
+         text: Raw string to clean.
+ 
+     Returns:
+         The cleaned string after spell correction, or the cleaned but
+         uncorrected string when the spell checker reports no usable result.
+     """
+     # Raw strings keep ``\w`` / ``\s`` from being parsed as (invalid)
+     # string escapes before the regex engine sees them.
+     text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
+     text = re.sub(pattern=r'\n', repl='. ', string=text)
+ 
+     if not text:
+         # Nothing left to check; skip the spell-checker call entirely.
+         return text
+ 
+     spelled_sent = spell_checker.check(text)
+     # NOTE(review): hanspell appears to return result=False with an empty
+     # ``checked`` field when it rejects the input (e.g. text over its
+     # length limit) -- confirm against the installed version. Keep the
+     # cleaned text in that case instead of replacing it with ''.
+     if spelled_sent.result and spelled_sent.checked:
+         text = spelled_sent.checked
+ 
+     return text
+ 
 def GetData():
     login_info = {
-         'userID' : '********',
-         'userpw' : '********'
+         'userID' : 'qdw0313',
+         'userpw' : 'fejUfrQxHWwtcGcP0'
     }
 
     options = webdriver.ChromeOptions()
@@ -80,7 +90,8 @@ def GetData():
         DateList = soup.select('#container > div.wrap.articles > article > a > time')
 
         for post in zip(TitleList, DateList):
-             title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
+             title = TextPreprocess(post[0].text)
+ 
             try:
                 Click("//h2[contains(text(), '{}')]".format(title), driver)
             except NoSuchElementException:
@@ -91,8 +102,7 @@ def GetData():
 
             if not (post_df['title'] == title).any():
                 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
-                 content = re.sub(pattern='[^\w\s]', repl='', string=content)
-                 content = re.sub(pattern='\n', repl=' ', string=content)
+                 content = TextPreprocess(content)
                 post_df = post_df.append(pd.DataFrame([[title, content]],
                                                       columns=['title', 'content']))
                 # print("{0}. {1} : {2}".format(idx, title, content))
@@ -111,7 +121,14 @@ def GetData():
 
     with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
         for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
-             json.dump(post[0] + post[1], json_file, ensure_ascii=False)
+             json.dump({
+                 "document" :
+                     {
+                         "type" : "PLAIN_TEXT",
+                         "content" : post[0] + post[1]
+                     },
+                 "encodingType" : "UTF8"
+             }, json_file, ensure_ascii=False)
     print("JSON file saved")
 
 GetData()