Toggle navigation
Toggle navigation
This project
Loading...
Sign in
박은주
/
Todays_Issue
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
박은주
2021-06-03 14:03:54 +0900
Browse Files
Options
Browse Files
Download
Plain Diff
Commit
489e0a3e184e2b2797600f3a3a2bb8efc577f466
489e0a3e
2 parents
0bf5691f
659b953b
수정: 전문 크롤링
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
52 additions
and
19 deletions
.gitignore
GetTopic.py
README.md
content.py
.gitignore
View file @
489e0a3
...
...
@@ -20,3 +20,7 @@ chromedriver.exe
/KoreanSentimentAnalyzer/.gitignore
/KoreanSentimentAnalyzer/.git/
/textrank/
/textrank.egg-info/
/build/
/dist/
...
...
GetTopic.py
View file @
489e0a3
import
os
import
csv
import
re
from
sklearn.metrics.pairwise
import
cosine_similarity
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.cluster
import
KMeans
from
konlpy.tag
import
Okt
from
konlpy.tag
import
Komoran
from
textrank
import
KeywordSummarizer
okt
=
Okt
()
def DocToNouns(docs):
    """Extract the nouns of every document with the module-level Okt tagger.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one dict per document, in input order. Each dict has
        ``'id'`` (the document's zero-based position in ``docs``) and
        ``'nouns'`` (the result of ``okt.nouns`` joined by single spaces).
    """
    results = []
    for position, text in enumerate(docs):
        noun_string = ' '.join(okt.nouns(text))
        results.append({'id': position, 'nouns': noun_string})
    return results
def Okt_tokenizer(sent):
    """Tokenize a sentence into nouns using the module-level Okt tagger.

    Args:
        sent: Sentence string to tokenize.

    Returns:
        The value returned by ``okt.nouns(sent)``.
    """
    # A disabled alternative used okt.pos(sent, join=True, stem=True) and kept
    # only tokens tagged /Noun, /Verb or /Adjective.
    return okt.nouns(sent)
komoran
=
Komoran
()
def komoran_tokenizer(sent):
    """Tokenize a sentence with Komoran, keeping only NNG/NNP-tagged tokens.

    Args:
        sent: Sentence string to tag.

    Returns:
        A list of the strings produced by ``komoran.pos(sent, join=True)``
        that contain ``'/NNG'`` or ``'/NNP'``, in their original order.
    """
    kept_markers = ('/NNG', '/NNP')
    tagged = komoran.pos(sent, join=True)
    return [
        token
        for token in tagged
        if any(marker in token for marker in kept_markers)
    ]
BASE_DIR
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
posts
=
[]
with
open
(
os
.
path
.
join
(
BASE_DIR
+
'/'
,
'data.csv'
),
'r'
,
encoding
=
'utf-8
-sig
'
)
as
db
:
with
open
(
os
.
path
.
join
(
BASE_DIR
+
'/'
,
'data.csv'
),
'r'
,
encoding
=
'utf-8'
)
as
db
:
reader
=
csv
.
reader
(
db
)
for
data
in
reader
:
posts
.
append
(
data
)
data
[
0
]
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
data
[
0
])
.
replace
(
'
\n
'
,
''
)
data
[
1
]
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
data
[
1
])
.
replace
(
'
\n
'
,
''
)
posts
.
append
(
data
[
0
]
+
data
[
1
])
# tfidf_vectorizer = TfidfVectorizer()
# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)
keyword_extractor
=
KeywordSummarizer
(
# tokenize=Okt_tokenizer,
tokenize
=
komoran_tokenizer
,
window
=
-
1
,
verbose
=
False
)
nouns
=
DocToNouns
(
posts
)
print
(
nouns
)
\ No newline at end of file
keywords
=
keyword_extractor
.
summarize
(
posts
,
topk
=
30
)
print
(
keywords
)
\ No newline at end of file
...
...
README.md
View file @
489e0a3
...
...
@@ -10,7 +10,12 @@
*
konlpy
*
jpype
## Package
> TextRank Package https://github.com/lovit/textrank/ <br>
> Sentiment Analyzer https://github.com/mrlee23/KoreanSentimentAnalyzer
## WORK
[
] everytime.kr(경희대)로부터 24시간 이내의 이슈를 정리 ; 주기능
[
] 질문 입력시 해당하는 게시글 검색 ; 챗봇
\ No newline at end of file
-
[
]
everytime.kr(경희대)로부터 24시간 이내의 이슈 토픽 정리 ; 주기능
<br>
-
[
]
질문 입력시 해당하는 게시글 검색
<br>
-
[
]
지난 24시간 이내의 게시판 분위기 정리
\ No newline at end of file
...
...
content.py
View file @
489e0a3
...
...
@@ -19,12 +19,13 @@ def Click(xpath):
sleeptime
()
login_info
=
{
'userID'
:
'qdw0313'
,
'userpw'
:
'fejUfrQxHWwtcGcP0'
'userID'
:
'id'
,
'userpw'
:
'********'
}
options
=
webdriver
.
ChromeOptions
()
options
.
add_argument
(
'headless'
)
#
options.add_argument('headless')
options
.
add_argument
(
'no-sandbox'
)
options
.
add_argument
(
'window-size=1920x1080'
)
options
.
add_argument
(
'disable-gpu'
)
...
...
@@ -32,7 +33,7 @@ options.add_argument('disable-dev-shm-usage')
options
.
add_argument
(
'lang=ko_KR'
)
options
.
add_argument
(
'user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Safari/537.36 Vivaldi/1.96.1147.47'
)
driver
=
webdriver
.
Chrome
(
r'C:\Users\
Admin\Desktop\OSS\Todays_Issue
\chromedriver.exe'
,
options
=
options
)
driver
=
webdriver
.
Chrome
(
r'C:\Users\
E_N__\Desktop
\chromedriver.exe'
,
options
=
options
)
driver
.
get
(
'about:blank'
)
driver
.
execute_script
(
"Object.defineProperty(navigator, 'plugins', {get: function() {return[1, 2, 3, 4, 5];},});"
)
...
...
@@ -72,10 +73,7 @@ while swt:
TitleList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > h2'
)
DateList
=
soup
.
select
(
'#container > div.wrap.articles > article > a > time'
)
# ContentList = soup.select('#container > div.wrap.articles > article > a > p')
# idx = 1
# for post in zip(TitleList, ContentList, DateList):
for
post
in
zip
(
TitleList
,
DateList
):
title
=
re
.
sub
(
pattern
=
'[^
\
w
\
s]'
,
repl
=
''
,
string
=
post
[
0
]
.
text
)
try
:
...
...
@@ -94,7 +92,6 @@ while swt:
# print("{0}. {1} : {2}".format(idx, title, content))
print
(
post
[
1
]
.
text
)
if
post
[
1
]
.
text
<
yesterday
:
swt
=
False
break
post_df
.
to_csv
(
'data.csv'
,
mode
=
'w'
,
encoding
=
'utf-8-sig'
)
...
...
Please
register
or
login
to post a comment