박은주

Modified JSON storage method

Showing 1 changed file with 27 additions and 13 deletions
1 import csv 1 import csv
2 +import json
2 import time 3 import time
3 import random 4 import random
4 import os 5 import os
5 import re 6 import re
6 import pandas as pd 7 import pandas as pd
8 +import numpy as np
7 9
8 from selenium import webdriver 10 from selenium import webdriver
9 from selenium.common.exceptions import NoSuchElementException 11 from selenium.common.exceptions import NoSuchElementException
...@@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__)) ...@@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__))
15 def sleeptime(): 17 def sleeptime():
16 time.sleep(random.randint(1, 3)) 18 time.sleep(random.randint(1, 3))
17 19
18 -def Click(xpath): 20 +def Click(xpath, driver):
19 element = driver.find_element_by_xpath(xpath) 21 element = driver.find_element_by_xpath(xpath)
20 driver.execute_script("arguments[0].click();", element) 22 driver.execute_script("arguments[0].click();", element)
21 sleeptime() 23 sleeptime()
22 24
23 def GetData(): 25 def GetData():
24 login_info = { 26 login_info = {
25 - 'userID' : 'qdw0313', 27 + 'userID' : '********',
26 - 'userpw' : 'fejUfrQxHWwtcGcP0' 28 + 'userpw' : '********'
27 } 29 }
28 30
29 options = webdriver.ChromeOptions() 31 options = webdriver.ChromeOptions()
...@@ -59,17 +61,17 @@ def GetData(): ...@@ -59,17 +61,17 @@ def GetData():
59 post_df = pd.DataFrame(columns=['title', 'content']) 61 post_df = pd.DataFrame(columns=['title', 'content'])
60 while swt: 62 while swt:
61 if page < 1: 63 if page < 1:
62 - Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a') 64 + Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a', driver)
63 page += 1 65 page += 1
64 else: 66 else:
65 if page == 1: 67 if page == 1:
66 - Click('//*[@id="container"]/div[2]/div[2]/a') 68 + Click('//*[@id="container"]/div[2]/div[2]/a', driver)
67 page += 1 69 page += 1
68 elif page == 2: 70 elif page == 2:
69 - Click('//*[@id="container"]/div[2]/div[2]/a[2]') 71 + Click('//*[@id="container"]/div[2]/div[2]/a[2]', driver)
70 page += 1 72 page += 1
71 else: 73 else:
72 - Click('//*[@id="container"]/div[2]/div[2]/a[3]') 74 + Click('//*[@id="container"]/div[2]/div[2]/a[3]', driver)
73 75
74 html = driver.page_source 76 html = driver.page_source
75 soup = BeautifulSoup(html, 'html.parser') 77 soup = BeautifulSoup(html, 'html.parser')
...@@ -80,7 +82,7 @@ def GetData(): ...@@ -80,7 +82,7 @@ def GetData():
80 for post in zip(TitleList, DateList): 82 for post in zip(TitleList, DateList):
81 title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text) 83 title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
82 try: 84 try:
83 - Click("//h2[contains(text(), '{}')]".format(title)) 85 + Click("//h2[contains(text(), '{}')]".format(title), driver)
84 except NoSuchElementException: 86 except NoSuchElementException:
85 continue 87 continue
86 content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text 88 content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
...@@ -90,17 +92,29 @@ def GetData(): ...@@ -90,17 +92,29 @@ def GetData():
90 if not (post_df['title'] == title).any(): 92 if not (post_df['title'] == title).any():
91 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx)) 93 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
92 content = re.sub(pattern='[^\w\s]', repl='', string=content) 94 content = re.sub(pattern='[^\w\s]', repl='', string=content)
95 + content = re.sub(pattern='\n', repl=' ', string=content)
93 post_df = post_df.append(pd.DataFrame([[title, content]], 96 post_df = post_df.append(pd.DataFrame([[title, content]],
94 columns=['title', 'content'])) 97 columns=['title', 'content']))
95 # print("{0}. {1} : {2}".format(idx, title, content)) 98 # print("{0}. {1} : {2}".format(idx, title, content))
96 - print(post[1].text)
97 - print(yesterday < "06/02 16:35")
98 - exit()
99 if post[1].text <= yesterday: 99 if post[1].text <= yesterday:
100 break 100 break
101 + break
101 102
102 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False) 103 post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
103 print("CVS file saved") 104 print("CVS file saved")
104 - with open('data.json', 'w', encoding='utf-8') as file: 105 + # print(post_df)
105 - post_df.to_json(file, force_ascii=False) 106 + # exit()
107 + # post_df.reset_index(drop=True, inplace=True)
108 + # post_df.to_json('data.json')
109 + # # with open('data.json', 'w', encoding='utf-8-sig') as file:
110 + # # post_df.to_json(file, force_ascii=False)
111 +
112 + with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
113 + for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
114 + json.dump(post[0] + post[1], json_file, ensure_ascii=False)
106 print("JSON file saved") 115 print("JSON file saved")
116 +
117 +GetData()
118 +######## TODO: JSON으로 저장
119 +######## 형식 : { "document" : { "type" : "PLAIN_TEXT", "content" : "~~" }, "encodingType" : "UTF8" }
120 +######## GOOGLE Sentiment Analyzer 사용을 위해
...\ No newline at end of file ...\ No newline at end of file
......