[Add] 단어 관련 함수들(그래프화, 중요도)

GyuhoLee
Commit 843289638d9cecd4f489eb0c9fdbe16337af78dd 84328963 1 parent ff34ecea
Showing 1 changed file with 101 additions and 0 deletions
src/textrank/word.py
--- a/src/textrank/word.py 0 → 100644
View file @8432896
+++ b/src/textrank/word.py 0 → 100644
View file @8432896
+ from collections import defaultdict
+ from scipy.sparse import csr_matrix
+ 
+ from .utils import scan_vocabulary
+ from .utils import tokenize_sents
+ 
+ 
def word_graph(sents, tokenize=None, min_count=2, window=2,
    min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from a list of sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency. Only forwarded to scan_vocabulary, i.e.
        only used when vocab_to_idx is None.
    window : int
        Co-occurrence window size (forwarded to cooccurrence)
    min_cooccurrence : int
        Minimum co-occurrence frequency (forwarded to cooccurrence)
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from sents first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Words ordered by their index in vocab_to_idx, matching the
        rows and columns of the graph
    """
    if vocab_to_idx is not None:
        # A mapping was supplied: recover the word list by ordering the
        # words on their assigned index.
        idx_to_vocab = sorted(vocab_to_idx, key=vocab_to_idx.get)
    else:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)

    tokenized = tokenize_sents(sents, tokenize)
    graph = cooccurrence(
        tokenized, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab
+ 
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count windowed word co-occurrences and return them as a sparse matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper. Tokens that are not keys of this
        mapper are dropped before the window is applied.
    window : int
        Co-occurrence window size. If window <= 0, every pair of
        in-vocabulary tokens within a sentence is counted.
    min_cooccurrence : int
        Minimum co-occurrence frequency; pairs counted fewer times are
        removed from the result.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)
    """
    counter = defaultdict(int)
    # Tracked separately from the loop variable: with an empty `tokens`
    # the loop variable is never bound, and the final verbose message
    # used to raise UnboundLocalError.
    n_sents = 0
    for s, tokens_i in enumerate(tokens):
        n_sents = s + 1
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                # NOTE(review): range(b, e) excludes position i + window,
                # so a pair exactly `window` apart is only reached from
                # its later token, while closer pairs are reached from
                # both sides. Kept as-is to preserve existing edge weights.
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                # Both directions are incremented, so the matrix is symmetric.
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting from {} sents was done'.format(n_sents))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
+ 
def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a {(row, col): value} dict into a sparse matrix.

    Arguments
    ---------
    d : dict
        key : (i,j) tuple of row index and column index
        value : float value stored at that position
    n_rows : int
        Number of rows of the resulting matrix
    n_cols : int
        Number of columns of the resulting matrix

    Returns
    -------
    scipy.sparse.csr_matrix
        shape = (n_rows, n_cols)
    """
    # Keys and values of a dict iterate in the same order, so the three
    # lists stay aligned entry by entry.
    row_idx = [i for i, _ in d]
    col_idx = [j for _, j in d]
    values = list(d.values())
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_rows, n_cols))
\ No newline at end of file