Toggle navigation
Toggle navigation
This project
Loading...
Sign in
박은주
/
Todays_Issue
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
박은주
2021-06-08 10:30:45 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
f0a13f1e569002aa5f9afe243ec5f2e3af3cc9e6
f0a13f1e
1 parent
04981fcd
Make Func
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
6 deletions
content.py
content.py
View file @
f0a13f1
...
...
@@ -9,6 +9,7 @@ import numpy as np
from
selenium
import
webdriver
from
selenium.common.exceptions
import
NoSuchElementException
from
hanspell
import
spell_checker
from
bs4
import
BeautifulSoup
from
datetime
import
datetime
,
timedelta
...
...
@@ -22,10 +23,19 @@ def Click(xpath, driver):
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
def TextPreprocess(text):
    """Clean scraped text and return its spell-checked form.

    Steps, in order:
      1. Remove every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols).
      2. Replace each newline with ". " so that line breaks become
         sentence breaks.
      3. Pass the result to ``spell_checker.check`` (imported from
         ``hanspell`` at the top of the file) and take its ``checked``
         attribute.

    Args:
        text: The raw string to clean.

    Returns:
        The ``checked`` attribute of the spell-checker result.
        NOTE(review): presumably the corrected string -- confirm against
        the hanspell documentation.
    """
    # Raw string: '\w' and '\s' are not valid str escapes, so a plain
    # literal triggers an invalid-escape warning on modern Python.
    text = re.sub(pattern=r'[^\w\s]', repl='', string=text)
    # Newlines survive the step above because '\n' matches \s.
    text = re.sub(pattern='\n', repl='. ', string=text)
    spelled_sent = spell_checker.check(text)
    return spelled_sent.checked
def
GetData
():
login_info
=
{
'userID'
:
'
********
'
,
'userpw'
:
'
********
'
'userID'
:
'
qdw0313
'
,
'userpw'
:
'
fejUfrQxHWwtcGcP0
'
}
options
=
webdriver
.
ChromeOptions
()
...
...
@@ -80,7 +90,8 @@ def GetData():
DateList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > time'
)
for
post
in
zip
(
TitleList
,
DateList
):
title
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
post
[
0
]
.
text
)
title
=
TextPreprocess
(
post
[
0
]
.
text
)
try
:
Click
(
"//h2[contains(text(), '{}')]"
.
format
(
title
),
driver
)
except
NoSuchElementException
:
...
...
@@ -91,8 +102,7 @@ def GetData():
if
not
(
post_df
[
'title'
]
==
title
)
.
any
():
# Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
content
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
content
)
content
=
re
.
sub
(
pattern
=
'
\n
'
,
repl
=
' '
,
string
=
content
)
content
=
TextPreprocess
(
content
)
post_df
=
post_df
.
append
(
pd
.
DataFrame
([[
title
,
content
]],
columns
=
[
'title'
,
'content'
]))
# print("{0}. {1} : {2}".format(idx, title, content))
...
...
@@ -111,7 +121,14 @@ def GetData():
with
open
(
'data.json'
,
'w+'
,
encoding
=
'utf-8-sig'
)
as
json_file
:
for
post
in
zip
(
post_df
[
'title'
]
.
tolist
(),
post_df
[
'content'
]
.
tolist
()):
json
.
dump
(
post
[
0
]
+
post
[
1
],
json_file
,
ensure_ascii
=
False
)
json
.
dump
({
"document"
:
{
"type"
:
"PLAIN_TEXT"
,
"content"
:
post
[
0
]
+
post
[
1
]
},
"encodingType"
:
"UTF8"
},
json_file
,
ensure_ascii
=
False
)
print
(
"JSON file saved"
)
GetData
()
...
...
Please
register
or
login
to post a comment