박은주 / Todays_Issue

Commit 67add56ec023f8ad07ad750e0e52b75a7cb93bb7
1 parent 489e0a3e
Authored by 박은주, 2021-06-04 09:31:33 +0900

Crawler Modified

Showing 3 changed files with 101 additions and 109 deletions:
.gitignore
GetTopic.py
content.py

.gitignore (view file @ 67add56)

@@ -15,7 +15,6 @@ chromedriver.exe
 *.csv
 *.xml
 *.json
 *.png
 *.org
 /KoreanSentimentAnalyzer/.gitignore

GetTopic.py (view file @ 67add56)

@@ -2,9 +2,6 @@ import os
 import csv
 import re
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
 from konlpy.tag import Okt
 from konlpy.tag import Komoran
@@ -13,40 +10,30 @@ from textrank import KeywordSummarizer
 okt = Okt()
 def Okt_tokenizer(sent):
     words = okt.nouns(sent)
     # words = okt.pos(sent, join=True, stem=True)
     # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
     return words
 
 komoran = Komoran()
 def komoran_tokenizer(sent):
     # words = []
     # for sentence in sent:
     #     words += komoran.pos(sentence, join=True)
     #     print("check : ", komoran.pos(sentence, join=True))
     # words = [komoran.pos(sentence, join=True) for sentence in sent]
     words = komoran.pos(sent, join=True)
     words = [w for w in words if ('/NNG' in w or '/NNP' in w)]
     return words
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-posts = []
-with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
-    reader = csv.reader(db)
-    for data in reader:
-        data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
-        data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
-        posts.append(data[0] + data[1])
-
-# tfidf_vectorizer = TfidfVectorizer()
-# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
-# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
-keyword_extractor = KeywordSummarizer(
-    # tokenize=Okt_tokenizer,
-    tokenize=komoran_tokenizer,
-    window=-1,
-    verbose=False
-)
-keywords = keyword_extractor.summarize(posts, topk=30)
-print(keywords)
\ No newline at end of file
+def GetKeywords():
+    posts = []
+    with open(os.path.join(BASE_DIR + '/', 'data.csv'), 'r', encoding='utf-8') as db:
+        reader = csv.reader(db)
+        for data in reader:
+            data[0] = re.sub(pattern='[^\w\s]', repl='', string=data[0]).replace('\n', '')
+            data[1] = re.sub(pattern='[^\w\s]', repl='', string=data[1]).replace('\n', '')
+            posts.append(data[0] + data[1])
+
+    keyword_extractor = KeywordSummarizer(
+        tokenize=komoran_tokenizer,
+        window=-1,
+        verbose=False
+    )
+    keywords = keyword_extractor.summarize(posts, topk=30)
+    return keywords
\ No newline at end of file
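
Since GetKeywords() replaces the old module-level script, this file no longer prints anything on its own; a caller has to import and invoke it. A minimal usage sketch, assuming konlpy and lovit's textrank package (the same imports GetTopic.py already uses); the sample posts and the min_count option are illustrative assumptions, not part of the commit:

    from konlpy.tag import Komoran
    from textrank import KeywordSummarizer  # lovit/textrank

    komoran = Komoran()

    def komoran_tokenizer(sent):
        # Komoran tags tokens like '학교/NNG'; keep common (NNG) and proper (NNP) nouns.
        words = komoran.pos(sent, join=True)
        return [w for w in words if '/NNG' in w or '/NNP' in w]

    # Invented stand-ins for rows of data.csv (title and content concatenated).
    posts = [
        '오늘 학교 식당 메뉴가 정말 좋았다',
        '학교 축제 일정이 곧 발표된다',
        '기말고사 기간이라 학교 도서관이 붐빈다',
    ]

    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenizer,
        min_count=1,   # assumed textrank option (defaults to 2); lowered so the toy corpus qualifies
        window=-1,     # -1 treats all words in a sentence as co-occurring, matching GetTopic.py
        verbose=False,
    )
    print(keyword_extractor.summarize(posts, topk=5))  # e.g. [('학교/NNG', <rank>), ...]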

content.py (view file @ 67add56)

@@ -10,6 +10,8 @@ from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 
+BASE_DIR = os.path.dirname(os.path.realpath(__file__))
+
 def sleeptime():
     time.sleep(random.randint(1, 3))
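
A small aside before the main hunk: the BASE_DIR added here uses os.path.realpath, whereas GetTopic.py builds its BASE_DIR with os.path.abspath. The two agree unless the script is reached through a symlink; a short sketch with hypothetical paths:

    import os

    # abspath only normalizes the path string; realpath also resolves symlinks.
    # Suppose /opt/app/content.py is a symlink to /srv/project/content.py:
    #   os.path.abspath('/opt/app/content.py')   -> '/opt/app/content.py'
    #   os.path.realpath('/opt/app/content.py')  -> '/srv/project/content.py'
    # With realpath, BASE_DIR + '/chromedriver.exe' is looked up next to the
    # real file rather than next to the symlink.
    BASE_DIR = os.path.dirname(os.path.realpath(__file__))
    print(BASE_DIR)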
@@ -18,83 +20,87 @@ def Click(xpath):
     driver.execute_script("arguments[0].click();", element)
     sleeptime()
 
-login_info = {
-    'userID' : 'id',
-    'userpw' : '********'
-}
-
-options = webdriver.ChromeOptions()
-# options.add_argument('headless')
-options.add_argument('no-sandbox')
-options.add_argument('window-size=1920x1080')
-options.add_argument('disable-gpu')
-options.add_argument('disable-dev-shm-usage')
-options.add_argument('lang=ko_KR')
-options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
-driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
-driver.get('about:blank')
-driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
-sleeptime()
-driver.get('https://everytime.kr/login')
-sleeptime()
-driver.find_element_by_name('userid').send_keys(login_info['userID'])
-driver.find_element_by_name('password').send_keys(login_info['userpw'])
-driver.find_element_by_class_name('submit').click()
-sleeptime()
-
-# 국제캠 자게 (international campus free board)
-sleeptime()
-yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
-print(yesterday)
-swt = True
-page = 0
-post_df = pd.DataFrame(columns=['title', 'content'])
-while swt:
-    if page < 1:
-        Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
-        page += 1
-    else:
-        if page == 1:
-            Click('//*[@id="container"]/div[2]/div[2]/a')
-            page += 1
-        elif page == 2:
-            Click('//*[@id="container"]/div[2]/div[2]/a[2]')
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'html.parser')
-    TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
-    DateList = soup.select('#container > div.wrap.articles > article > a > time')
-    for post in zip(TitleList, DateList):
-        title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
-        try:
-            Click("//h2[contains(text(), '{}')]".format(title))
-        except NoSuchElementException:
-            continue
-        content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
-        driver.back()
-        sleeptime()
-        if not (post_df['title'] == title).any():
-            # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
-            content = re.sub(pattern='[^\w\s]', repl='', string=content)
-            post_df = post_df.append(pd.DataFrame([[title, content]], columns=['title', 'content']))
-            # print("{0}. {1} : {2}".format(idx, title, content))
-        print(post[1].text)
-        if post[1].text < yesterday:
-            break
-post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig')
-print("CVS file saved")
-post_df.to_json('data.json', orient='records', encoding='utf-8-sig')
-print("JSON file saved")
\ No newline at end of file
+def GetData():
+    login_info = {
+        'userID' : 'qdw0313',
+        'userpw' : 'fejUfrQxHWwtcGcP0'
+    }
+
+    options = webdriver.ChromeOptions()
+    options.add_argument('headless')
+    options.add_argument('no-sandbox')
+    options.add_argument('window-size=1920x1080')
+    options.add_argument('disable-gpu')
+    options.add_argument('disable-dev-shm-usage')
+    options.add_argument('lang=ko_KR')
+    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47')
+    # driver = webdriver.Chrome(r'C:\Users\E_N__\Desktop\chromedriver.exe', options=options)
+    driver = webdriver.Chrome(BASE_DIR + '/chromedriver.exe', options=options)
+    driver.get('about:blank')
+    driver.execute_script("Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});")
+    sleeptime()
+    driver.get('https://everytime.kr/login')
+    sleeptime()
+    driver.find_element_by_name('userid').send_keys(login_info['userID'])
+    driver.find_element_by_name('password').send_keys(login_info['userpw'])
+    driver.find_element_by_class_name('submit').click()
+    sleeptime()
+
+    # 국제캠 자게 (international campus free board)
+    sleeptime()
+    yesterday = (datetime.today() - timedelta(1)).strftime('%m/%d %H:%M')
+    print(yesterday)
+    swt = True
+    page = 0
+    post_df = pd.DataFrame(columns=['title', 'content'])
+    while swt:
+        if page < 1:
+            Click('//*[@id="submenu"]/div/div[2]/ul/li[1]/a')
+            page += 1
+        else:
+            if page == 1:
+                Click('//*[@id="container"]/div[2]/div[2]/a')
+                page += 1
+            elif page == 2:
+                Click('//*[@id="container"]/div[2]/div[2]/a[2]')
+                page += 1
+            else:
+                Click('//*[@id="container"]/div[2]/div[2]/a[3]')
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'html.parser')
+        TitleList = soup.select('#container > div.wrap.articles > article > a > h2')
+        DateList = soup.select('#container > div.wrap.articles > article > a > time')
+        for post in zip(TitleList, DateList):
+            title = re.sub(pattern='[^\w\s]', repl='', string=post[0].text)
+            try:
+                Click("//h2[contains(text(), '{}')]".format(title))
+            except NoSuchElementException:
+                continue
+            content = driver.find_element_by_xpath('//*[@id="container"]/div[2]/article/a/p').text
+            driver.back()
+            sleeptime()
+            if not (post_df['title'] == title).any():
+                # Click('//*[@id="container"]/div[2]/article[{}]'.format(idx))
+                content = re.sub(pattern='[^\w\s]', repl='', string=content)
+                post_df = post_df.append(pd.DataFrame([[title, content]], columns=['title', 'content']))
+                # print("{0}. {1} : {2}".format(idx, title, content))
+            print(post[1].text)
+            print(yesterday < "06/02 16:35")
+            exit()
+            if post[1].text <= yesterday:
+                break
+    post_df.to_csv('data.csv', mode='w', encoding='utf-8-sig', index=False)
+    print("CVS file saved")
+    with open('data.json', 'w', encoding='utf-8') as file:
+        post_df.to_json(file, force_ascii=False)
+    print("JSON file saved")
\ No newline at end of file
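
Two portability notes on GetData(), relevant whenever this crawler is next touched: the find_element_by_* helpers it calls were removed in Selenium 4.3, and DataFrame.append was removed in pandas 2.0. A sketch of the modern equivalents, with the everytime.kr selectors copied from the diff and placeholder credentials:

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # Selenium 4.6+ locates a matching chromedriver via Selenium Manager,
    # so no explicit chromedriver.exe path is needed.
    driver = webdriver.Chrome(options=options)

    driver.get('https://everytime.kr/login')
    driver.find_element(By.NAME, 'userid').send_keys('user')        # was: find_element_by_name('userid')
    driver.find_element(By.NAME, 'password').send_keys('password')  # placeholder credentials
    driver.find_element(By.CLASS_NAME, 'submit').click()            # was: find_element_by_class_name('submit')

    # pandas 2.0+: DataFrame.append is gone; build a row and concat instead.
    post_df = pd.DataFrame(columns=['title', 'content'])
    row = pd.DataFrame([['some title', 'some content']], columns=['title', 'content'])
    post_df = pd.concat([post_df, row], ignore_index=True)

    driver.quit()

Note also two behaviors visible in the committed code: the print(yesterday < "06/02 16:35") and exit() lines inside the scrape loop stop the process on the first post, so the to_csv and to_json calls below them are unreachable as committed; and post[1].text <= yesterday compares zero-padded '%m/%d %H:%M' strings lexicographically, which matches chronological order within a calendar year but inverts across the year boundary ('12/31 23:59' sorts after '01/01 00:10').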