Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2020-2-capstone-design1
/
HCG_project
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
GyuhoLee
2020-11-08 21:08:11 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
843289638d9cecd4f489eb0c9fdbe16337af78dd
84328963
1 parent
ff34ecea
[Add] 단어 관련 함수들(그래프화, 중요도)
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
101 additions
and
0 deletions
src/textrank/word.py
src/textrank/word.py
0 → 100644
View file @
8432896
from
collections
import
defaultdict
from
scipy.sparse
import
csr_matrix
from
.utils
import
scan_vocabulary
from
.utils
import
tokenize_sents
def word_graph(sents, tokenize=None, min_count=2, window=2,
               min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency, forwarded to scan_vocabulary.
        Only used when vocab_to_idx is None.
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from sents first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to rows and columns
    """
    if vocab_to_idx is not None:
        # Recover the index -> word list by ordering the words by their index.
        idx_to_vocab = sorted(vocab_to_idx, key=vocab_to_idx.get)
    else:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    tokenized = tokenize_sents(sents, tokenize)
    graph = cooccurrence(
        tokenized, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count windowed word co-occurrences and return them as a sparse matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper. Tokens that are not keys of this
        mapper are dropped before the window is applied.
    window : int
        Co-occurrence window size. If <= 0, every pair of in-vocabulary
        tokens inside the same sentence is counted.
    min_cooccurrence : int
        Minimum cooccurrence frequency. Pairs counted fewer times are
        discarded.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)
    """
    counter = defaultdict(int)
    use_whole_sentence = window <= 0

    # Tracked separately from the loop variable so that the final progress
    # message also works when `tokens` is empty.
    n_sents = 0
    for s, tokens_i in enumerate(tokens):
        n_sents = s + 1
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')

        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if use_whole_sentence:
                b, e = 0, n
            else:
                # NOTE: range(b, e) excludes position i + window, so the scan
                # reaches `window` positions back but only `window - 1`
                # positions forward. Both directions are incremented below,
                # so the resulting counts are still symmetric.
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                u = vocabs[j]
                counter[(v, u)] += 1
                counter[(u, v)] += 1

    # Drop pairs that co-occur too rarely.
    counter = {
        pair: count for pair, count in counter.items()
        if count >= min_cooccurrence
    }
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting from {} sents was done'.format(n_sents))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a {(row, col): value} dict into a sparse matrix.

    Arguments
    ---------
    d : dict
        key : (i,j) tuple
        value : float value
    n_rows : int
        Number of rows of the returned matrix
    n_cols : int
        Number of columns of the returned matrix

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_rows, n_cols)
    """
    # Keys and values of a dict iterate in the same order, so the three
    # lists stay aligned entry by entry.
    row_idx = [i for i, _ in d]
    col_idx = [j for _, j in d]
    values = list(d.values())
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))
\ No newline at end of file
Please
register
or
login
to post a comment