GyuhoLee

[Add] 자주 쓰이는 함수들(Matrix)

1 +from collections import Counter
2 +from scipy.sparse import csr_matrix
3 +import numpy as np
4 +
5 +
def scan_vocabulary(sents, tokenize=None, min_count=2):
    """Build a frequency-filtered vocabulary from a list of sentences.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable or None
        tokenize(str) returns list of str.
        If None, plain whitespace splitting (str.split) is used.
        (The original crashed with TypeError when left as None.)
    min_count : int
        Minimum term frequency; rarer terms are dropped.

    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list, ordered by decreasing frequency.
        Ties are broken alphabetically so index assignment is
        deterministic across runs.
    vocab_to_idx : dict
        Vocabulary to index mapper.
    """
    if tokenize is None:
        # Default tokenizer: whitespace split, so the documented
        # default signature actually works.
        tokenize = str.split
    counter = Counter(w for sent in sents for w in tokenize(sent))
    counter = {w: c for w, c in counter.items() if c >= min_count}
    # Sort by descending count, then lexicographically for determinism.
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x: (-x[1], x[0]))]
    vocab_to_idx = {vocab: idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx
28 +
def tokenize_sents(sents, tokenize):
    """Apply `tokenize` to each sentence, preserving order.

    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)

    Returns
    -------
    list of list of str
        One token sequence per input sentence.
    """
    return list(map(tokenize, sents))
42 +
def vectorize(tokens, vocab_to_idx):
    """Encode tokenized sentences as a sparse bag-of-words matrix.

    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper

    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms); entry (i, j) is the count of term j
        in sentence i. Out-of-vocabulary tokens are silently skipped.
    """
    rows = []
    cols = []
    data = []
    for row_idx, sent_tokens in enumerate(tokens):
        for term, count in Counter(sent_tokens).items():
            # Skip tokens pruned from the vocabulary (e.g. by min_count).
            if term not in vocab_to_idx:
                continue
            rows.append(row_idx)
            cols.append(vocab_to_idx[term])
            data.append(count)
    shape = (len(tokens), len(vocab_to_idx))
    return csr_matrix((data, (rows, cols)), shape=shape)
...\ No newline at end of file ...\ No newline at end of file