GyuhoLee

[Add] 단어 관련 함수들(그래프화, 중요도)

1 +from collections import defaultdict
2 +from scipy.sparse import csr_matrix
3 +
4 +from .utils import scan_vocabulary
5 +from .utils import tokenize_sents
6 +
7 +
def word_graph(sents, tokenize=None, min_count=2, window=2,
    min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency (used only when vocab_to_idx is None)
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum co-occurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from `sents` first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to rows and columns
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        # Recover the index-ordered word list from the supplied mapper.
        by_index = sorted(vocab_to_idx.items(), key=lambda item: item[1])
        idx_to_vocab = [word for word, _ in by_index]

    tokenized = tokenize_sents(sents, tokenize)
    graph = cooccurrence(tokenized, vocab_to_idx, window, min_cooccurrence, verbose)
    return graph, idx_to_vocab
42 +
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Count word co-occurrences within a sliding window over each sentence.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    window : int
        Co-occurrence window size. If `window <= 0`, the whole sentence
        is used as one window.
    min_cooccurrence : int
        Minimum co-occurrence frequency; rarer pairs are dropped.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)

    Note
    ----
    Both (v, w) and (w, v) are incremented for every pair found, so the
    resulting matrix is symmetric; pairs closer than `window` are counted
    from both endpoints.
    """
    counter = defaultdict(int)
    n_sents = 0  # tracked separately so the summary below works for empty input
    for s, tokens_i in enumerate(tokens):
        n_sents = s + 1
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        # Map tokens to indices, silently skipping out-of-vocabulary words.
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k:v for k,v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        # BUGFIX: previously printed `s + 1`, which raised NameError when
        # `tokens` was empty (loop never ran and `s` was unbound).
        print('\rword cooccurrence counting from {} sents was done'.format(n_sents))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
84 +
def dict_to_mat(d, n_rows, n_cols):
    """
    Convert a sparse dict into a CSR matrix.

    Arguments
    ---------
    d : dict
        key : (i, j) tuple
        value : float value

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (n_rows, n_cols) with d's values at d's keys.
    """
    # dicts preserve insertion order, so keys and values stay aligned.
    rows = [i for i, _ in d]
    cols = [j for _, j in d]
    data = list(d.values())
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
...\ No newline at end of file ...\ No newline at end of file