박은주

Make Func

...@@ -9,6 +9,7 @@ import numpy as np ...@@ -9,6 +9,7 @@ import numpy as np
9 9
10 from selenium import webdriver 10 from selenium import webdriver
11 from selenium.common.exceptions import NoSuchElementException 11 from selenium.common.exceptions import NoSuchElementException
12 +from hanspell import spell_checker
12 from bs4 import BeautifulSoup 13 from bs4 import BeautifulSoup
13 from datetime import datetime, timedelta 14 from datetime import datetime, timedelta
14 15
...@@ -22,10 +23,19 @@ def Click(xpath, driver): ...@@ -22,10 +23,19 @@ def Click(xpath, driver):
22 driver.execute_script("arguments[0].click();", element) 23 driver.execute_script("arguments[0].click();", element)
23 sleeptime() 24 sleeptime()
24 25
26 +def TextPreprocess(text):
27 + text = re.sub(pattern='[^\w\s]', repl='', string=text)
28 + text = re.sub(pattern='\n', repl='. ', string=text)
29 +
30 + spelled_sent = spell_checker.check(text)
31 + text = spelled_sent.checked
32 +
33 + return text
34 +
25 def GetData(): 35 def GetData():
26 login_info = { 36 login_info = {
27 - 'userID' : '********', 37 + 'userID' : '<REDACTED — real username removed; never commit credentials>',
28 - 'userpw' : '********' 38 + 'userpw' : '<REDACTED — real password removed; load from an environment variable or secret store, and rotate this credential since it was exposed>'
29 } 39 }
30 40
31 options = webdriver.ChromeOptions() 41 options = webdriver.ChromeOptions()
...@@ -80,7 +90,8 @@ def GetData(): ...@@ -80,7 +90,8 @@ def GetData():
80 DateList = soup.select('#container > div.wrap.articles > article > a > time') 90 DateList = soup.select('#container > div.wrap.articles > article > a > time')
81 91
82 for post in zip(TitleList, DateList): 92 for post in zip(TitleList, DateList):
83 - title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) 93 + title = TextPreprocess(post[0].text)
94 +
84 try: 95 try:
85 Click("//h2[contains(text(), '{}')]".format(title), driver) 96 Click("//h2[contains(text(), '{}')]".format(title), driver)
86 except NoSuchElementException: 97 except NoSuchElementException:
...@@ -91,8 +102,7 @@ def GetData(): ...@@ -91,8 +102,7 @@ def GetData():
91 102
92 if not (post_df['title'] == title).any(): 103 if not (post_df['title'] == title).any():
93 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) 104 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
94 - content = re.sub(pattern='[^\w\s]', repl='', string=content) 105 + content = TextPreprocess(content)
95 - content = re.sub(pattern='\n', repl=' ', string=content)
96 post_df = post_df.append(pd.DataFrame([[title, content]], 106 post_df = post_df.append(pd.DataFrame([[title, content]],
97 columns=['title', 'content'])) 107 columns=['title', 'content']))
98 # print("{0}. {1} : {2}".format(idx, title, content)) 108 # print("{0}. {1} : {2}".format(idx, title, content))
...@@ -111,7 +121,14 @@ def GetData(): ...@@ -111,7 +121,14 @@ def GetData():
111 121
112 with open('data.json', 'w+', encoding='utf-8-sig') as json_file: 122 with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
113 for post in zip(post_df['title'].tolist(), post_df['content'].tolist()): 123 for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
114 - json.dump(post[0] + post[1], json_file, ensure_ascii=False) 124 + json.dump({
125 + "document" :
126 + {
127 + "type" : "PLAIN_TEXT",
128 + "content" : post[0] + post[1]
129 + },
130 + "encodingType" : "UTF8"
131 + }, json_file, ensure_ascii=False)
115 print("JSON file saved") 132 print("JSON file saved")
116 133
117 GetData() 134 GetData()
......