Showing 1 changed file with 68 additions and 0 deletions
src/textrank/utils.py
0 → 100644
+from collections import Counter
+
+from scipy.sparse import csr_matrix
+
+
+def scan_vocabulary(sents, tokenize=None, min_count=2):
+    """
+    Arguments
+    ---------
+    sents : list of str
+        Sentence list
+    tokenize : callable or None
+        tokenize(str) returns list of str.
+        If None, whitespace tokenization is used.
+    min_count : int
+        Minimum term frequency
+
+    Returns
+    -------
+    idx_to_vocab : list of str
+        Vocabulary list, sorted by term frequency in descending order
+    vocab_to_idx : dict
+        Vocabulary to index mapper
+    """
+    # Fall back to whitespace tokenization when no tokenizer is given,
+    # since the default argument is None
+    if tokenize is None:
+        tokenize = lambda sent: sent.split()
+    counter = Counter(w for sent in sents for w in tokenize(sent))
+    # Keep only terms that occur at least `min_count` times
+    counter = {w: c for w, c in counter.items() if c >= min_count}
+    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x: -x[1])]
+    vocab_to_idx = {vocab: idx for idx, vocab in enumerate(idx_to_vocab)}
+    return idx_to_vocab, vocab_to_idx
+
+
+def tokenize_sents(sents, tokenize):
+    """
+    Arguments
+    ---------
+    sents : list of str
+        Sentence list
+    tokenize : callable
+        tokenize(sent) returns list of str (word sequence)
+
+    Returns
+    -------
+    tokenized sentence list : list of list of str
+    """
+    return [tokenize(sent) for sent in sents]
+
+
+def vectorize(tokens, vocab_to_idx):
+    """
+    Arguments
+    ---------
+    tokens : list of list of str
+        Tokenized sentence list
+    vocab_to_idx : dict
+        Vocabulary to index mapper
+
+    Returns
+    -------
+    sentence bow : scipy.sparse.csr_matrix
+        shape = (n_sents, n_terms)
+    """
+    rows, cols, data = [], [], []
+    for i, tokens_i in enumerate(tokens):
+        # Count term frequencies within each sentence,
+        # skipping out-of-vocabulary terms
+        for t, c in Counter(tokens_i).items():
+            j = vocab_to_idx.get(t, -1)
+            if j == -1:
+                continue
+            rows.append(i)
+            cols.append(j)
+            data.append(c)
+    n_sents = len(tokens)
+    n_terms = len(vocab_to_idx)
+    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_terms))
\ No newline at end of file
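
For context (not part of the diff): the three helpers compose into a small bag-of-words pipeline. A minimal usage sketch, assuming the package is importable as `textrank` (e.g. with `src` on `PYTHONPATH`) and using a plain whitespace tokenizer on toy sentences:

from textrank.utils import scan_vocabulary, tokenize_sents, vectorize

sents = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "the cat chased the dog",
]

# Whitespace tokenizer; any callable mapping str -> list of str works
tokenize = lambda sent: sent.split()

# Build the vocabulary from terms occurring at least twice
idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize=tokenize, min_count=2)

# Tokenize every sentence, then encode as a sparse sentence-term count matrix
tokens = tokenize_sents(sents, tokenize)
x = vectorize(tokens, vocab_to_idx)

print(idx_to_vocab)  # most frequent terms first, e.g. 'the' leads
print(x.shape)       # (3, len(idx_to_vocab))
print(x.toarray())   # dense view of the per-sentence term counts

Terms below `min_count` (here `mat`, `log`, and `chased`) are dropped by `scan_vocabulary`, and `vectorize` silently skips them when building the matrix, so the columns stay aligned with `idx_to_vocab`.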