박은주 / Todays_Issue
Commit fc372ad628e1738d9d78e77b0367775a099b6f06 (fc372ad6)
1 parent b17f3f76
Authored by 박은주, 2021-06-02 23:46:53 +0900

Extract the main topics (up to the top 30) using TextRank
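For context, TextRank-style keyword extraction builds a word co-occurrence graph and ranks words with PageRank; the top-ranked words become the topics. A minimal illustrative sketch of that ranking step, with a hypothetical `graph` of co-occurrence sets (this is not the textrank package's internals):

# Illustrative PageRank over an unweighted co-occurrence graph.
# graph: {word: set of co-occurring words}; assumes every word has >= 1 neighbour.
def pagerank(graph, damping=0.85, iters=30):
    ranks = {w: 1.0 for w in graph}
    for _ in range(iters):
        # Each word redistributes its rank equally among its neighbours
        ranks = {
            w: (1 - damping) + damping * sum(
                ranks[v] / len(graph[v]) for v in graph if w in graph[v]
            )
            for w in graph
        }
    return ranks

g = {'토픽': {'추출'}, '추출': {'토픽', '키워드'}, '키워드': {'추출'}}
print(sorted(pagerank(g).items(), key=lambda kv: -kv[1]))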
Showing 2 changed files with 41 additions and 10 deletions
.gitignore (view file @ fc372ad)
@@ -13,4 +13,8 @@ chromedriver.exe
 /.idea/
 *.iml
 *.csv
-*.xml
\ No newline at end of file
+*.xml
+/textrank/
+/textrank.egg-info/
+/build/
+/dist/
\ No newline at end of file
GetTopic.py (view file @ fc372ad)
import os
import csv
import re

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from konlpy.tag import Okt
from konlpy.tag import Komoran
from textrank import KeywordSummarizer

okt = Okt()

def DocToNouns(docs):
    # Map each document to an id and the space-joined nouns extracted by Okt
    return [{
        'id': i,
        'nouns': ' '.join(okt.nouns(doc)),
    } for i, doc in enumerate(docs)]

def Okt_tokenizer(sent):
    words = okt.nouns(sent)
    # words = okt.pos(sent, join=True, stem=True)
    # words = [w for w in words if('/Noun' in w or '/Verb' in w or '/Adjective' in w)]
    return words

komoran = Komoran()

def komoran_tokenizer(sent):
    # words = []
    # for sentence in sent:
    #     words += komoran.pos(sentence, join=True)
    #     print("check : ", komoran.pos(sentence, join=True))
    # words = [komoran.pos(sentence, join=True) for sentence in sent]
    # Keep only common nouns (NNG) and proper nouns (NNP)
    words = komoran.pos(sent, join=True)
    words = [w for w in words if ('/NNG' in w or '/NNP' in w)]
    return words

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

posts = []
with open(os.path.join(BASE_DIR, 'data.csv'), 'r', encoding='utf-8-sig') as db:
    reader = csv.reader(db)
    for data in reader:
        # Strip punctuation and newlines from the title (data[0]) and body (data[1]),
        # then store each post as a single string
        data[0] = re.sub(pattern=r'[^\w\s]', repl='', string=data[0]).replace('\n', '')
        data[1] = re.sub(pattern=r'[^\w\s]', repl='', string=data[1]).replace('\n', '')
        posts.append(data[0] + data[1])

# tfidf_vectorizer = TfidfVectorizer()
# title_vector = tfidf_vectorizer.fit_transform(noun['title'] for noun in nouns)
# content_vector = tfidf_vectorizer.fit_transform(noun['content'] for noun in nouns)

keyword_extractor = KeywordSummarizer(
    # tokenize=Okt_tokenizer,
    tokenize=komoran_tokenizer,
    window=-1,      # -1: every pair of words in a sentence counts as a co-occurrence
    verbose=False,
)

keywords = keyword_extractor.summarize(posts, topk=30)
print(keywords)
\ No newline at end of file
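If the `textrank` import here is lovit's textrank package (which matches the `KeywordSummarizer` name), `summarize(posts, topk=30)` should return (word, rank) pairs, with each word still carrying the POS tag produced by the tokenizer. A hedged sketch of reading the result:

# Assumes summarize() yields (word, rank) pairs such as ('백신/NNG', 2.31),
# as in lovit's textrank package; adjust if the return shape differs.
for word, rank in keywords:
    print(f'{word}\t{rank:.4f}')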