Showing 18 changed files with 603 additions and 0 deletions
src/textrank/__init__.py
0 → 100644
src/textrank/__pycache__/rank.cpython-37.pyc
0 → 100644
src/textrank/__pycache__/rank.cpython-38.pyc
0 → 100644
src/textrank/__pycache__/word.cpython-37.pyc
0 → 100644
src/textrank/__pycache__/word.cpython-38.pyc
0 → 100644
src/textrank/rank.py
0 → 100644
import numpy as np
from sklearn.preprocessing import normalize

def pagerank(x, df=0.85, max_iter=30, bias=None):
    """
    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        shape = (n vertex, n vertex)
    df : float
        Damping factor, 0 < df < 1
    max_iter : int
        Maximum number of iterations
    bias : numpy.ndarray or None
        If None, equal bias is used
    Returns
    -------
    R : numpy.ndarray
        PageRank vector. shape = (n vertex, 1)
    """

    assert 0 < df < 1

    # initialize: column-normalize the graph and start from a uniform rank vector
    A = normalize(x, axis=0, norm='l1')
    R = np.ones(A.shape[0]).reshape(-1, 1)

    # check bias: rescale a user-given bias so that it sums to the number of vertices
    if bias is None:
        bias = (1 - df) * np.ones(A.shape[0]).reshape(-1, 1)
    else:
        bias = bias.reshape(-1, 1)
        bias = A.shape[0] * bias / bias.sum()
        assert bias.shape[0] == A.shape[0]
        bias = (1 - df) * bias

    # power iteration
    for _ in range(max_iter):
        R = df * (A * R) + bias

    return R
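Below is a minimal usage sketch (not part of this commit) for the pagerank function above, assuming src/ is on the Python path; the toy graph and bias values are purely illustrative.

    import numpy as np
    from scipy.sparse import csr_matrix
    from textrank.rank import pagerank

    # toy 3-node graph: edges 0->1, 0->2, 1->2, 2->0 with unit weights
    rows = [0, 0, 1, 2]
    cols = [1, 2, 2, 0]
    data = [1.0, 1.0, 1.0, 1.0]
    g = csr_matrix((data, (rows, cols)), shape=(3, 3))

    R = pagerank(g, df=0.85, max_iter=30)   # uniform bias; R has shape (3, 1)
    print(R.reshape(-1))                    # one rank value per vertex

    # non-uniform bias, e.g. to favour vertex 2
    bias = np.array([0.1, 0.1, 0.8])
    R_biased = pagerank(g, df=0.85, max_iter=30, bias=bias)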
src/textrank/sentence.py
0 → 100644
from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

from .utils import scan_vocabulary
from .utils import tokenize_sents


def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
               similarity=None, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    min_count : int
        Minimum term frequency
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        similarity(s1, s2) returns float
        s1 and s2 are list of str.
        available similarity = [callable, 'cosine', 'textrank']
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """

    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]

    # a user-defined similarity(s1, s2) is computed pair by pair in pure python
    if callable(similarity):
        tokens = tokenize_sents(sents, tokenize)
        return graph_with_python_sim(tokens, verbose, similarity, min_sim)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x

def vectorize_sents(sents, tokenize, vocab_to_idx):
    rows, cols, data = [], [], []
    for i, sent in enumerate(sents):
        counter = Counter(tokenize(sent))
        for token, count in counter.items():
            j = vocab_to_idx.get(token, -1)
            if j == -1:
                continue
            rows.append(i)
            cols.append(j)
            data.append(count)
    n_rows = len(sents)
    n_cols = len(vocab_to_idx)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    n_rows = x.shape[0]
    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):
        # batch slicing keeps the dense (batch, n_rows) similarity block small
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx+1) * batch_size))
        psim = 1 - pairwise_distances(x[b:e], x, metric='cosine')
        rows, cols = np.where(psim >= min_sim)
        data = psim[rows, cols]
        mat.append(csr_matrix((data, (rows, cols)), shape=(e-b, n_rows)))
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(b, n_rows), end='')
    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating cosine sentence similarity was done with {} sents'.format(n_rows))
    return mat

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    n_rows, n_cols = x.shape

    # Boolean matrix
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Inverse sentence length; very short sentences get a huge pseudo-length
    # so their similarity to other sentences becomes negligible
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx+1) * batch_size))

        # dot product counts the shared terms between sentence pairs
        inner = z[b:e,:] * z.transpose()

        # sentence len[i,j] = size[i] + size[j]
        norm = size[b:e].reshape(-1,1) + size.reshape(1,-1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        # append
        mat.append(csr_matrix((data, (rows, cols)), shape=(e-b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating textrank sentence similarity was done with {} sents'.format(n_rows))

    return mat

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    if similarity == 'cosine':
        similarity = cosine_sent_sim
    elif callable(similarity):
        similarity = similarity
    else:
        similarity = textrank_sent_sim

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, tokens_j in enumerate(tokens):
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    if verbose:
        print('\rsentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

def textrank_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base

def cosine_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)
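A short, hypothetical example of building a sentence graph with sent_graph; the whitespace tokenizer and toy sentences stand in for a real tokenizer and corpus.

    from textrank.sentence import sent_graph

    tokenizer = lambda sent: sent.split()   # stand-in for a real tokenizer
    sents = [
        'the cat sat on the mat',
        'the cat lay on the mat',
        'dogs chase cats in the park',
    ]

    # TextRank-style similarity (default): shared terms / (log |s1| + log |s2|)
    g = sent_graph(sents, tokenize=tokenizer, min_count=1, min_sim=0.1)

    # cosine similarity over the same bag-of-words vectors
    g_cos = sent_graph(sents, tokenize=tokenizer, min_count=1, min_sim=0.1, similarity='cosine')

    print(g.shape)   # (3, 3) scipy.sparse.csr_matrix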
src/textrank/summarizer.py
0 → 100644
import numpy as np
from .rank import pagerank
from .sentence import sent_graph
from .word import word_graph


class KeywordSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means two words cooccur if they appear in the same sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
                 window=-1, min_cooccurrence=2, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
        Returns
        -------
        None
        """

        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use the summarize function')
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents)
        return self.keywords(topk)


class KeysentenceSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in the sentence graph
    similarity : callable or str
        available similarity = [callable, 'cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
                 min_sim=0.3, similarity=None, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key sentences to be selected
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)
        Usage
        -----
        >>> from textrank import KeysentenceSummarizer
        >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
        self.train_textrank(sents, bias)
        idxs = self.R.argsort()[-topk:]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
        return keysents
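A hypothetical end-to-end example for the two summarizers. It assumes textrank/__init__.py re-exports both classes (the KeysentenceSummarizer import appears in the docstring above); the tokenizer, sentences, and bias values are illustrative only.

    import numpy as np
    from textrank import KeywordSummarizer, KeysentenceSummarizer

    tokenizer = lambda sent: sent.split()   # stand-in for a real tokenizer
    sents = [
        'deep learning models need large data',
        'large data makes deep models strong',
        'regularization helps models trained on small data',
        'textrank summarizes documents without training data',
    ]

    # keywords: words ranked by PageRank over the co-occurrence graph
    keyword_summarizer = KeywordSummarizer(tokenize=tokenizer, min_count=1,
                                           window=-1, min_cooccurrence=1)
    keywords = keyword_summarizer.summarize(sents, topk=5)    # [(word, rank), ...]

    # key sentences, biased toward the beginning of the document
    bias = np.ones(len(sents))
    bias[0] = 5.0
    sentence_summarizer = KeysentenceSummarizer(tokenize=tokenizer, min_count=1, min_sim=0.1)
    keysents = sentence_summarizer.summarize(sents, topk=2, bias=bias)   # [(idx, rank, sent), ...]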
src/textrank/utils.py
0 → 100644
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np


def scan_vocabulary(sents, tokenize=None, min_count=2):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency
    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list
    vocab_to_idx : dict
        Vocabulary to index mapper.
    """
    counter = Counter(w for sent in sents for w in tokenize(sent))
    counter = {w:c for w,c in counter.items() if c >= min_count}
    # sort by frequency (descending) so that index 0 is the most frequent term
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
    vocab_to_idx = {vocab:idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

def tokenize_sents(sents, tokenize):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)
    Returns
    -------
    tokenized sentence list : list of list of str
    """
    if tokenize is not None:
        return [tokenize(sent) for sent in sents]
    else:
        # if no tokenizer is given, the input is assumed to be tokenized already
        return sents

def vectorize(tokens, vocab_to_idx):
    """
    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms)
    """
    rows, cols, data = [], [], []
    for i, tokens_i in enumerate(tokens):
        for t, c in Counter(tokens_i).items():
            j = vocab_to_idx.get(t, -1)
            if j == -1:
                continue
            rows.append(i)
            cols.append(j)
            data.append(c)
    n_sents = len(tokens)
    n_terms = len(vocab_to_idx)
    x = csr_matrix((data, (rows, cols)), shape=(n_sents, n_terms))
    return x
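A small, illustrative example of the helpers in utils.py; the toy tokens are arbitrary.

    from textrank.utils import scan_vocabulary, tokenize_sents, vectorize

    tokenizer = lambda sent: sent.split()
    sents = ['a b a c', 'b c b d', 'a d a b']

    idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize=tokenizer, min_count=2)
    tokens = tokenize_sents(sents, tokenizer)
    x = vectorize(tokens, vocab_to_idx)   # csr_matrix of shape (n_sents, n_terms)
    print(idx_to_vocab)                   # terms with frequency >= 2, most frequent first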
src/textrank/word.py
0 → 100644
from collections import defaultdict
from scipy.sparse import csr_matrix

from .utils import scan_vocabulary
from .utils import tokenize_sents


def word_graph(sents, tokenize=None, min_count=2, window=2,
               min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to the rows and columns
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]

    tokens = tokenize_sents(sents, tokenize)
    g = cooccurrence(tokens, vocab_to_idx, window, min_cooccurrence, verbose)
    return g, idx_to_vocab

def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)
    """
    counter = defaultdict(int)
    for s, tokens_i in enumerate(tokens):
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            # window <= 0 means the whole sentence is used as the context window
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                # count the pair in both directions so the graph is symmetric
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k:v for k,v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting from {} sents was done'.format(s+1))
    return dict_to_mat(counter, n_vocabs, n_vocabs)

def dict_to_mat(d, n_rows, n_cols):
    """
    Arguments
    ---------
    d : dict
        key : (i, j) tuple
        value : float value
    Returns
    -------
    scipy.sparse.csr_matrix
    """
    rows, cols, data = [], [], []
    for (i, j), v in d.items():
        rows.append(i)
        cols.append(j)
        data.append(v)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
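A brief, illustrative example of word_graph; the window and threshold values are only for demonstration.

    from textrank.word import word_graph

    tokenizer = lambda sent: sent.split()
    sents = ['a b c a', 'b c d b', 'c d a c']

    # co-occurrence graph over a +/-2 token window, keeping pairs counted at least twice
    g, idx_to_vocab = word_graph(sents, tokenize=tokenizer, min_count=2,
                                 window=2, min_cooccurrence=2)
    print(g.shape, idx_to_vocab)   # (n_vocabs, n_vocabs) csr_matrix and the matching word list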