Toggle navigation
Toggle navigation
This project
Loading...
Sign in
박은주
/
Todays_Issue
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
박은주
2021-06-02 14:55:04 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
b17f3f7640858c4a6781a25b81d7e07f3dc63f37
b17f3f76
1 parent
b29f9e71
Add file via upload
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
20 deletions
content.py
content.py
View file @
b17f3f7
...
...
@@ -2,6 +2,7 @@ import csv
import
time
import
random
import
os
import
pandas
as
pd
from
selenium
import
webdriver
from
bs4
import
BeautifulSoup
...
...
@@ -11,14 +12,18 @@ def sleeptime():
rand
=
random
.
uniform
(
1
,
3
)
time
.
sleep
(
rand
)
def
Click
(
xpath
):
element
=
driver
.
find_element_by_xpath
(
xpath
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
login_info
=
{
'userID'
:
'id'
,
'userpw'
:
'p
asswd
'
'userpw'
:
'p
w
'
}
options
=
webdriver
.
ChromeOptions
()
options
.
add_argument
(
'headless'
)
#
options.add_argument('headless')
options
.
add_argument
(
'no-sandbox'
)
options
.
add_argument
(
'window-size=1920x1080'
)
options
.
add_argument
(
'disable-gpu'
)
...
...
@@ -26,7 +31,7 @@ options.add_argument('disable-dev-shm-usage')
options
.
add_argument
(
'lang=ko_KR'
)
options
.
add_argument
(
'user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47'
)
driver
=
webdriver
.
Chrome
(
r'C:\Users\
Admin\Desktop\OSS\Todays_Issue
\chromedriver.exe'
,
options
=
options
)
driver
=
webdriver
.
Chrome
(
r'C:\Users\
E_N__\Desktop
\chromedriver.exe'
,
options
=
options
)
driver
.
get
(
'about:blank'
)
driver
.
execute_script
(
"Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});"
)
...
...
@@ -46,39 +51,43 @@ yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
swt
=
True
page
=
1
post_df
=
pd
.
DataFrame
(
columns
=
[
'title'
,
'content'
])
while
swt
:
if
not
posts
:
driver
.
find_element_by_xpath
(
'//*[@id="submenu"]/div/div[2]/ul/li[1]/a'
)
.
click
(
)
Click
(
'//*[@id="submenu"]/div/div[2]/ul/li[1]/a'
)
else
:
if
page
==
1
:
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a'
)
.
click
(
)
Click
(
'//*[@id="container"]/div[2]/div[2]/a'
)
page
+=
1
elif
page
==
2
:
element
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a[2]'
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
Click
(
'//*[@id="container"]/div[2]/div[2]/a[2]'
)
page
+=
1
else
:
element
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/div[2]/a[3]'
)
driver
.
execute_script
(
"arguments[0].click();"
,
element
)
sleeptime
()
Click
(
'//*[@id="container"]/div[2]/div[2]/a[3]'
)
sleeptime
()
html
=
driver
.
page_source
soup
=
BeautifulSoup
(
html
,
'html.parser'
)
TitleList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > h2'
)
ContentList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > p'
)
DateList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > time'
)
ContentList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > p'
)
idx
=
1
for
post
in
zip
(
TitleList
,
ContentList
,
DateList
):
posts
.
append
([
post
[
0
]
.
text
,
post
[
1
]
.
text
])
if
post
[
2
]
.
text
==
yesterday
:
Click
(
'//*[@id="container"]/div[2]/article[{}]'
.
format
(
idx
))
content
=
driver
.
find_element_by_xpath
(
'//*[@id="container"]/div[2]/article/a/p'
)
.
text
sleeptime
()
idx
+=
1
post_df
=
post_df
.
append
(
pd
.
DataFrame
([
post
[
0
]
.
text
,
content
],
columns
=
[
'title'
,
'content'
]))
if
post
[
2
]
.
text
<
yesterday
:
swt
=
False
break
BASE_DIR
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
with
open
(
os
.
path
.
join
(
BASE_DIR
+
'/'
,
'data.csv'
),
'w+'
,
encoding
=
'utf-8-sig'
,
newline
=
''
)
as
file
:
writer
=
csv
.
writer
(
file
)
for
idx
in
range
(
len
(
posts
)):
writer
.
writerow
(
posts
[
idx
])
\ No newline at end of file
post_df
.
to_csv
(
'data.csv'
,
mode
=
'w'
,
encoding
=
'utf-8-sig'
)
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'w+', encoding='utf-8-sig', newline='') as file:
# writer = csv.writer(file)
# for idx in range(len(posts)):
# writer.writerow(posts[idx])
\ No newline at end of file
...
...
Please
register
or
login
to post a comment