Toggle navigation
Toggle navigation
This project
Loading...
Sign in
박은주
/
Todays_Issue
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
박은주
2021-06-03 13:57:43 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
0bf5691f99c0245be44981583820c032e1767d8f
0bf5691f
1 parent
b29f9e71
수정: 전문 크롤링
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
51 additions
and
26 deletions
.gitignore
content.py
.gitignore
View file @
0bf5691
...
...
@@ -14,3 +14,9 @@ chromedriver.exe
*.iml
*.csv
*.xml
*.json
*.png
*.org
/KoreanSentimentAnalyzer/.gitignore
/KoreanSentimentAnalyzer/.git/
\ No newline at end of file
...
...
content.py
View file @
0bf5691
...
...
@@ -2,19 +2,25 @@ import csv
import
time
import
random
import
os
import
re
import
pandas
as
pd
from
selenium
import
webdriver
from
selenium.common.exceptions
import
NoSuchElementException
from
bs4
import
BeautifulSoup
from
datetime
import
datetime
,
timedelta
\
from
datetime
import
datetime
,
timedelta
def
sleeptime
():
rand
=
random
.
uniform
(
1
,
3
)
time
.
sleep
(
rand
)
time
.
sleep
(
random
.
randint
(
1
,
3
))
def
Click
(
xpath
):
element
=
driver
.
find_element_by_xpath
(
xpath
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
login_info
=
{
'userID'
:
'
id
'
,
'userpw'
:
'
passwd
'
'userID'
:
'
qdw0313
'
,
'userpw'
:
'
fejUfrQxHWwtcGcP0
'
}
options
=
webdriver
.
ChromeOptions
()
...
...
@@ -41,44 +47,57 @@ sleeptime()
# 국제캠 자게
sleeptime
()
posts
=
[]
yesterday
=
(
datetime
.
today
()
-
timedelta
(
1
))
.
strftime
(
'
%
m/
%
d
%
H:
%
M'
)
print
(
yesterday
)
swt
=
True
page
=
1
page
=
0
post_df
=
pd
.
DataFrame
(
columns
=
[
'title'
,
'content'
])
while
swt
:
if
not
posts
:
driver
.
find_element_by_xpath
(
'//*[@id="submenu"]/div/div[2]/ul/li[1]/a'
)
.
click
()
if
page
<
1
:
Click
(
'//*[@id="submenu"]/div/div[2]/ul/li[1]/a'
)
page
+=
1
else
:
if
page
==
1
:
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a'
)
.
click
(
)
Click
(
'//*[@id="container"]/div[2]/div[2]/a'
)
page
+=
1
elif
page
==
2
:
element
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a[2]'
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
Click
(
'//*[@id="container"]/div[2]/div[2]/a[2]'
)
page
+=
1
else
:
element
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a[3]'
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
Click
(
'//*[@id="container"]/div[2]/div[2]/a[3]'
)
sleeptime
()
html
=
driver
.
page_source
soup
=
BeautifulSoup
(
html
,
'html.parser'
)
TitleList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > h2'
)
ContentList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > p'
)
DateList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > time'
)
# ContentList = soup.select('#container > div.wrap.articles > article > a > p')
# idx = 1
# for post in zip(TitleList, ContentList, DateList):
for
post
in
zip
(
TitleList
,
DateList
):
title
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
post
[
0
]
.
text
)
try
:
Click
(
"//h2[contains(text(), '{}')]"
.
format
(
title
))
except
NoSuchElementException
:
continue
content
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/article/a/p'
)
.
text
driver
.
back
()
sleeptime
()
for
post
in
zip
(
TitleList
,
ContentList
,
DateList
):
posts
.
append
([
post
[
0
]
.
text
,
post
[
1
]
.
text
])
if
post
[
2
]
.
text
==
yesterday
:
if
not
(
post_df
[
'title'
]
==
title
)
.
any
():
# Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
content
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
content
)
post_df
=
post_df
.
append
(
pd
.
DataFrame
([[
title
,
content
]],
columns
=
[
'title'
,
'content'
]))
# print("{0}. {1} : {2}".format(idx, title, content))
print
(
post
[
1
]
.
text
)
if
post
[
1
]
.
text
<
yesterday
:
swt
=
False
break
BASE_DIR
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
with
open
(
os
.
path
.
join
(
BASE_DIR
+
'/'
,
'data.csv'
),
'w+'
,
encoding
=
'utf-8-sig'
,
newline
=
''
)
as
file
:
writer
=
csv
.
writer
(
file
)
for
idx
in
range
(
len
(
posts
)):
writer
.
writerow
(
posts
[
idx
])
\ No newline at end of file
post_df
.
to_csv
(
'data.csv'
,
mode
=
'w'
,
encoding
=
'utf-8-sig'
)
print
(
"CVS file saved"
)
post_df
.
to_json
(
'data.json'
,
orient
=
'records'
,
encoding
=
'utf-8-sig'
)
print
(
"JSON file saved"
)
\ No newline at end of file
...
...
Please
register
or
login
to post a comment