박은주 / Todays_Issue
Authored by 박은주 on 2021-06-07 16:32:46 +0900
Commit 8c962e1853e1cb8a8e10e293e0829effde065219 (8c962e18), 1 parent: 52ff23d7
Modified JSON storage method
Showing 1 changed file with 28 additions and 14 deletions
content.py
import csv
import json
import time
import random
import os
import re
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
...
@@ -15,15 +17,15 @@ BASE_DIR = os.path.dirname(os.path.realpath(__file__))
 def sleeptime():
     time.sleep(random.randint(1, 3))
 
-def Click(xpath):
+def Click(xpath, driver):
     element = driver.find_element_by_xpath(xpath)
     driver.execute_script("arguments[0].click();", element)
     sleeptime()
 
 def GetData():
     login_info = {
-        'userID': 'qdw0313',
-        'userpw': 'fejUfrQxHWwtcGcP0'
+        'userID': '********',
+        'userpw': '********'
     }
 
     options = webdriver.ChromeOptions()
...
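The change in this hunk threads the WebDriver instance into Click() as an explicit parameter instead of relying on an outer-scope variable, and masks the login credentials that had been committed in plain text. A minimal sketch of the same pattern, assuming Selenium 3.x (which still provides find_element_by_xpath), a chromedriver on PATH, and hypothetical environment variables TI_USER_ID / TI_USER_PW standing in for the masked values:

import os
import random
import time

from selenium import webdriver


def sleeptime():
    # Short random pause between actions, as in content.py.
    time.sleep(random.randint(1, 3))


def Click(xpath, driver):
    # The driver is passed in explicitly rather than captured from the
    # enclosing scope, matching the new signature in this commit.
    element = driver.find_element_by_xpath(xpath)
    driver.execute_script("arguments[0].click();", element)
    sleeptime()


def load_login_info():
    # Hypothetical variable names; the point is to keep real credentials
    # out of version control, which is what the masking above is doing.
    return {
        'userID': os.environ.get('TI_USER_ID', ''),
        'userpw': os.environ.get('TI_USER_PW', ''),
    }


if __name__ == '__main__':
    login_info = load_login_info()
    driver = webdriver.Chrome()
    driver.get('https://example.com')
    Click('//a', driver)  # example.com exposes a single link
    driver.quit()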
@@ -59,17 +61,17 @@ def GetData():
     post_df = pd.DataFrame(columns=['title', 'content'])
 
     while swt:
         if page < 1:
-            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
+            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a', driver)
             page += 1
         else:
             if page == 1:
-                Click('//*[@id="container"]/div[2]/div[2]/a')
+                Click('//*[@id="container"]/div[2]/div[2]/a', driver)
                 page += 1
             elif page == 2:
-                Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+                Click('//*[@id="container"]/div[2]/div[2]/a[2]', driver)
                 page += 1
             else:
-                Click('//*[@id="container"]/div[2]/div[2]/a[3]')
+                Click('//*[@id="container"]/div[2]/div[2]/a[3]', driver)
 
         html = driver.page_source
         soup = BeautifulSoup(html, 'html.parser')
...
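Each pagination click above is still followed only by sleeptime()'s fixed 1-3 second pause. An alternative, not part of this commit, is an explicit wait on the target element before clicking; a small sketch using Selenium's standard WebDriverWait and expected_conditions (the click_when_ready name is made up):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def click_when_ready(xpath, driver, timeout=10):
    # Block until the element located by XPath is clickable (or raise
    # TimeoutException after `timeout` seconds), then click it via
    # JavaScript exactly as Click() does.
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    driver.execute_script("arguments[0].click();", element)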
@@ -80,7 +82,7 @@ def GetData():
         for post in zip(TitleList, DateList):
             title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
             try:
-                Click("//h2[contains(text(), '{}')]".format(title))
+                Click("//h2[contains(text(), '{}')]".format(title), driver)
             except NoSuchElementException:
                 continue
             content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
...
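The title pulled from each heading is normalized with re.sub(pattern='[^\w\s]', repl='', string=...) before being substituted into the XPath, which is also what keeps quote characters from breaking the contains(text(), '...') expression. With Python 3 str patterns, \w is Unicode-aware, so Hangul is kept while punctuation is stripped; a quick illustration on a made-up title:

import re

title = '[속보] "오늘의 이슈" 예시 제목!!'  # made-up example title
clean = re.sub(pattern=r'[^\w\s]', repl='', string=title)
print(clean)  # 속보 오늘의 이슈 예시 제목

xpath = "//h2[contains(text(), '{}')]".format(clean)
print(xpath)  # //h2[contains(text(), '속보 오늘의 이슈 예시 제목')]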
@@ -90,17 +92,29 @@ def GetData():
             if not (post_df['title'] == title).any():
                 # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
                 content = re.sub(pattern='[^\w\s]', repl='', string=content)
                 content = re.sub(pattern='\n', repl=' ', string=content)
                 post_df = post_df.append(pd.DataFrame([[title, content]], columns=['title', 'content']))
                 # print("{0}. {1} : {2}".format(idx, title, content))
             print(post[1].text)
             print(yesterday < "06/02 16:35")
             exit()
             if post[1].text <= yesterday:
                 break
         break
 
-    post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
-    print("CVS file saved")
-    with open('data.json', 'w', encoding='utf-8') as file:
-        post_df.to_json(file, force_ascii=False)
-    print("JSON file saved")
\ No newline at end of file
+    # print(post_df)
+    # exit()
+    # post_df.reset_index(drop=True, inplace=True)
+    # post_df.to_json('data.json')
+    # # with open('data.json', 'w', encoding='utf-8-sig') as file:
+    # #     post_df.to_json(file, force_ascii=False)
+    with open('data.json', 'w+', encoding='utf-8-sig') as json_file:
+        for post in zip(post_df['title'].tolist(), post_df['content'].tolist()):
+            json.dump(post[0] + post[1], json_file, ensure_ascii=False)
+    print("JSON file saved")
+GetData()
+######## TODO: save as JSON
+######## Format: { "document" : { "type" : "PLAIN_TEXT", "content" : "~~" }, "encodingType" : "UTF8" }
+######## for use with the GOOGLE Sentiment Analyzer
\ No newline at end of file
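Two notes on the new storage code and the trailing TODO. Calling json.dump() once per post writes bare JSON strings back-to-back, which cannot be parsed back as a single document; and the TODO describes the request shape expected by Google's sentiment analysis, a document object with a type, the content, and an encodingType. A minimal sketch that writes the scraped rows in that shape, assuming the post_df built above (the requests.json file name and the sample row are made up):

import json

import pandas as pd

# Stand-in for the DataFrame the scraper builds.
post_df = pd.DataFrame([['제목 예시', '본문 예시']], columns=['title', 'content'])

requests = [
    {
        "document": {
            "type": "PLAIN_TEXT",
            "content": "{} {}".format(title, content),
        },
        "encodingType": "UTF8",
    }
    for title, content in zip(post_df['title'].tolist(), post_df['content'].tolist())
]

# One well-formed JSON array instead of concatenated fragments.
with open('requests.json', 'w', encoding='utf-8-sig') as json_file:
    json.dump(requests, json_file, ensure_ascii=False, indent=2)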