박은주

Modified JSON storage method

Showing 1 changed file with 28 additions and 14 deletions
import csv
import json
import time
import random
import os
import re
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
......@@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__))
def sleeptime():
    """Pause for a random 1-3 second interval (simple rate-limiting between page actions)."""
    delay = random.randint(1, 3)
    time.sleep(delay)
def Click(xpath, driver):
    """Click the element located by *xpath* using a JavaScript click.

    A JS-dispatched click (execute_script) works even when the target
    element is overlapped or scrolled out of view, where a native
    element.click() could fail. Waits a random 1-3 s afterwards via
    sleeptime() to throttle requests.

    Args:
        xpath: XPath locator string for the element to click.
        driver: Selenium WebDriver instance to operate on.
    """
    # NOTE(review): find_element_by_xpath was removed in Selenium 4;
    # switch to driver.find_element(By.XPATH, xpath) if upgrading.
    element = driver.find_element_by_xpath(xpath)
    driver.execute_script("arguments[0].click();", element)
    sleeptime()
def GetData():
login_info = {
'userID' : 'qdw0313',
'userpw' : 'fejUfrQxHWwtcGcP0'
'userID' : '********',
'userpw' : '********'
}
options = webdriver.ChromeOptions()
......@@ -59,17 +61,17 @@ def GetData():
post_df = pd.DataFrame(columns=['title', 'content'])
while swt:
if page < 1:
Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a', driver)
page += 1
else:
if page == 1:
Click('//*[@id="container"]/div[2]/div[2]/a')
Click('//*[@id="container"]/div[2]/div[2]/a', driver)
page += 1
elif page == 2:
Click('//*[@id="container"]/div[2]/div[2]/a[2]')
Click('//*[@id="container"]/div[2]/div[2]/a[2]', driver)
page += 1
else:
Click('//*[@id="container"]/div[2]/div[2]/a[3]')
Click('//*[@id="container"]/div[2]/div[2]/a[3]', driver)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
......@@ -80,7 +82,7 @@ def GetData():
for post in zip(TitleList, DateList):
title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
try:
Click("//h2[contains(text(), '{}')]".format(title))
Click("//h2[contains(text(), '{}')]".format(title), driver)
except NoSuchElementException:
continue
content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
......@@ -90,17 +92,29 @@ def GetData():
if not (post_df['title'] == title).any():
# Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
content = re.sub(pattern='[^\w\s]', repl='', string=content)
content = re.sub(pattern='\n', repl=' ', string=content)
post_df = post_df.append(pd.DataFrame([[title, content]],
columns=['title', 'content']))
# print("{0}. {1} : {2}".format(idx, title, content))
print(post[1].text)
print(yesterday < "06/02 16:35")
exit()
if post[1].text <= yesterday:
break
break
post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
print("CVS file saved")
with open('data.json', 'w', encoding='utf-8') as file:
post_df.to_json(file, force_ascii=False)
print("JSON file saved")
\ No newline at end of file
# print(post_df)
# exit()
# post_df.reset_index(drop=True, inplace=True)
# post_df.to_json('data.json')
# # with open('data.json', 'w', encoding='utf-8-sig') as file:
# # post_df.to_json(file, force_ascii=False)
with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
json.dump(post[0] + post[1], json_file, ensure_ascii=False)
print("JSON file saved")
# Run the scraper only when this file is executed directly, not on import.
# (Importing the module previously launched a full browser session as a
# side effect, which made the code untestable and unreusable.)
if __name__ == "__main__":
    GetData()

######## TODO: save the output as JSON
######## format: { "document" : { "type" : "PLAIN_TEXT", "content" : "~~" }, "encodingType" : "UTF8" }
######## so the result can be fed to the Google Sentiment Analyzer
......