Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2021-1-capstone-design2
/
2015104194
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
GyuhoLee
2021-05-25 19:15:57 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
0c3fc8179d9937f4dae76620cf44a2e7c4b715e7
0c3fc817
1 parent
a95a62fb
[Add] data preprocessing code
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
46 additions
and
0 deletions
src/pickle_to_csv.py
src/pickle_to_csv.py
0 → 100644
View file @
0c3fc81
import
re
,
csv
,
pickle
from
song
import
*
from
PyKomoran
import
*
from
textrank
import
KeywordSummarizer
def komoran_tokenize(sent):
    """Return the tokens of a tagged sentence that carry a wanted POS tag.

    ``sent`` is split on whitespace and a token is kept when it contains
    ``/NNP``, ``/NNG`` or ``/SL`` (in Komoran's tag set: proper noun,
    common noun and foreign-language word). The original token order is
    preserved and the tag suffix is left attached.
    """
    wanted_tags = ('/NNP', '/NNG', '/SL')
    return [
        token
        for token in sent.split()
        if any(tag in token for tag in wanted_tags)
    ]
# Collect the records stored in 1112.pickle, 1314.pickle, 1516.pickle,
# 1718.pickle and 1920.pickle (the range advances by 202) into one list.
data = []
for batch_id in range(1112, 2122, 202):
    pickle_path = str(batch_id) + '.pickle'
    with open(pickle_path, 'rb') as f:
        # Each pickle holds a list of records; append them in file order.
        data += pickle.load(f)
# Clean every loaded record, attach up to five lyric keywords, and export the
# result to data.csv (one row per record), saving each record's image as well.
#
# NOTE(review): the indentation of this script was lost in the view this edit
# was made from. Writing the row and saving the image for *every* record (not
# only those that have lyrics) is assumed -- confirm against the repository.
#
# The context manager guarantees data.csv is flushed and closed even when
# tokenizing, keyword extraction or saveImg() raises part-way through.
with open('data.csv', 'w', newline='', encoding='UTF-8') as f:
    wr = csv.writer(f)
    komoran = Komoran('STABLE')
    for record in data:
        # Clean the title: drop everything from the first '(' onwards.
        idx = record.title.find('(')
        if idx != -1:
            record.title = record.title[:idx]

        # Clean the lyrics and extract keywords. The record titled
        # '거꾸로 걷는다' is skipped explicitly; the reason is not visible here.
        if record.lyrics != '' and record.title != '거꾸로 걷는다':
            # Tag each lyric line; lines that produce no tagged text are dropped.
            sents = []
            for text in record.lyrics.split('\n'):
                tokened_text = komoran.get_plain_text(text)
                if tokened_text != '':
                    sents.append(tokened_text)

            keyword_extractor = KeywordSummarizer(
                tokenize=komoran_tokenize,
                window=-1,
                verbose=False,
            )
            if sents:
                keywords = keyword_extractor.summarize(sents, topk=5)
                # Each result's first element is a 'word/TAG' token; keep only
                # the word. partition() returns the whole token when no '/' is
                # present instead of silently chopping off its last character.
                record.keywords = [
                    entry[0].partition('/')[0] for entry in keywords
                ]

        wr.writerow(record.getRow())
        record.saveImg()
\ No newline at end of file
Please
register
or
login
to post a comment