Showing
1 changed file
with
101 additions
and
0 deletions
src/textrank/word.py
0 → 100644
1 | +from collections import defaultdict | ||
2 | +from scipy.sparse import csr_matrix | ||
3 | + | ||
4 | +from .utils import scan_vocabulary | ||
5 | +from .utils import tokenize_sents | ||
6 | + | ||
7 | + | ||
def word_graph(sents, tokenize=None, min_count=2, window=2,
               min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """Build a word co-occurrence graph from raw sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency (used only when the vocabulary is scanned here)
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum co-occurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, the vocabulary is scanned from `sents` first.
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to rows and columns of the graph
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        # Recover the index-ordered word list from the supplied mapper.
        idx_to_vocab = sorted(vocab_to_idx, key=vocab_to_idx.get)

    tokens = tokenize_sents(sents, tokenize)
    mat = cooccurrence(tokens, vocab_to_idx, window, min_cooccurrence, verbose)
    return mat, idx_to_vocab
42 | + | ||
def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """Count symmetric word co-occurrence and return it as a sparse matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper; out-of-vocabulary tokens are skipped
    window : int
        Co-occurrence window size.
        If `window <= 0`, all word pairs in a sentence co-occur.
    min_cooccurrence : int
        Minimum co-occurrence frequency; rarer pairs are dropped
    verbose : Boolean
        If True, verbose mode on

    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)
    """
    counter = defaultdict(int)
    # Start at -1 so the final verbose message prints "0 sents" instead of
    # raising UnboundLocalError when `tokens` is empty.
    s = -1
    for s, tokens_i in enumerate(tokens):
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                # NOTE(review): the right edge excludes index i + window, so a
                # pair at exact distance `window` is counted once per sentence
                # while closer pairs are counted twice (once from each side).
                # Kept as-is to preserve the library's original counts.
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                # Increment both directions so the matrix stays symmetric.
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k: v for k, v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting from {} sents was done'.format(s + 1))
    return dict_to_mat(counter, n_vocabs, n_vocabs)
84 | + | ||
def dict_to_mat(d, n_rows, n_cols):
    """Convert a {(row, col): value} dict into a CSR sparse matrix.

    Arguments
    ---------
    d : dict
        key : (i, j) tuple
        value : float value
    n_rows : int
        Number of rows of the result matrix
    n_cols : int
        Number of columns of the result matrix

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    if d:
        # Unzip {(i, j): v} into parallel row / col / value sequences.
        keys, data = zip(*d.items())
        rows, cols = zip(*keys)
    else:
        rows, cols, data = [], [], []
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment