GyuhoLee

[Add] textrank algorithm code

__name__ = 'textrank'
__author__ = 'GyuhoLee'
__version__ = '0.0.1'

from .summarizer import KeywordSummarizer
from .summarizer import KeysentenceSummarizer
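
A minimal usage sketch of the public interface exported above, assuming the package is importable as textrank; the whitespace tokenizer and toy corpus are made up for illustration:

from textrank import KeywordSummarizer

# hypothetical tokenizer and corpus, for illustration only
tokenize = lambda sent: sent.split()
sents = [
    'the cat sat on the mat',
    'the dog sat on the log',
    'the cat chased the dog',
]

summarizer = KeywordSummarizer(tokenize=tokenize, min_count=1, min_cooccurrence=1)
for word, rank in summarizer.summarize(sents, topk=5):
    print(word, rank)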
(No text preview is available for six other files added in this commit.)
import numpy as np
from sklearn.preprocessing import normalize

def pagerank(x, df=0.85, max_iter=30, bias=None):
    """
    Arguments
    ---------
    x : scipy.sparse.csr_matrix
        shape = (n vertex, n vertex)
    df : float
        Damping factor, 0 < df < 1
    max_iter : int
        Maximum number of iterations
    bias : numpy.ndarray or None
        If None, equal bias is used
    Returns
    -------
    R : numpy.ndarray
        PageRank vector. shape = (n vertex, 1)
    """

    assert 0 < df < 1

    # initialize: column-normalize the adjacency matrix and start from a uniform rank vector
    A = normalize(x, axis=0, norm='l1')
    R = np.ones(A.shape[0]).reshape(-1, 1)

    # check bias: rescale a user-given bias so that it sums to the number of vertices
    if bias is None:
        bias = (1 - df) * np.ones(A.shape[0]).reshape(-1, 1)
    else:
        bias = bias.reshape(-1, 1)
        bias = A.shape[0] * bias / bias.sum()
        assert bias.shape[0] == A.shape[0]
        bias = (1 - df) * bias

    # power iteration: R = df * (A R) + (1 - df) * bias
    for _ in range(max_iter):
        R = df * (A * R) + bias

    return R
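
The function above implements the biased PageRank update R = df * (A R) + (1 - df) * bias, with A the column-normalized adjacency matrix. A minimal sketch of calling it on a toy 3-node graph; the matrix and the module path in the import are assumptions for illustration:

import numpy as np
from scipy.sparse import csr_matrix
from textrank.rank import pagerank   # assuming the module above is saved as textrank/rank.py

# hypothetical 3-node undirected graph: edges 0-1 and 1-2
adj = csr_matrix(np.array([
    [0, 1, 0],
    [1, 0, 1],
    [0, 1, 0],
], dtype=float))

R = pagerank(adj, df=0.85, max_iter=30)
print(R.reshape(-1))   # node 1, the hub, receives the largest rank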
from collections import Counter
import math
import numpy as np
import scipy as sp
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

from .utils import scan_vocabulary
from .utils import tokenize_sents


def sent_graph(sents, tokenize=None, min_count=2, min_sim=0.3,
               similarity=None, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str
    min_count : int
        Minimum term frequency
    min_sim : float
        Minimum similarity between sentences
    similarity : callable or str
        similarity(s1, s2) returns float
        s1 and s2 are list of str.
        available similarity = [callable, 'cosine', 'textrank']
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    sentence similarity graph : scipy.sparse.csr_matrix
        shape = (n sents, n sents)
    """

    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]

    # a user-defined similarity function is handled by the pure-Python graph builder
    if callable(similarity):
        tokens = tokenize_sents(sents, tokenize)
        return graph_with_python_sim(tokens, verbose, similarity, min_sim)

    x = vectorize_sents(sents, tokenize, vocab_to_idx)
    if similarity == 'cosine':
        x = numpy_cosine_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    else:
        x = numpy_textrank_similarity_matrix(x, min_sim, verbose, batch_size=1000)
    return x

def vectorize_sents(sents, tokenize, vocab_to_idx):
    rows, cols, data = [], [], []
    for i, sent in enumerate(sents):
        counter = Counter(tokenize(sent))
        for token, count in counter.items():
            j = vocab_to_idx.get(token, -1)
            if j == -1:
                continue
            rows.append(i)
            cols.append(j)
            data.append(count)
    n_rows = len(sents)
    n_cols = len(vocab_to_idx)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

def numpy_cosine_similarity_matrix(x, min_sim=0.3, verbose=True, batch_size=1000):
    n_rows = x.shape[0]
    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):
        # slice a batch of rows and compute cosine similarity against all sentences
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))
        psim = 1 - pairwise_distances(x[b:e], x, metric='cosine')
        rows, cols = np.where(psim >= min_sim)
        data = psim[rows, cols]
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))
        if verbose:
            print('\rcalculating cosine sentence similarity {} / {}'.format(b, n_rows), end='')
    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating cosine sentence similarity was done with {} sents'.format(n_rows))
    return mat

def numpy_textrank_similarity_matrix(x, min_sim=0.3, verbose=True, min_length=1, batch_size=1000):
    n_rows, n_cols = x.shape

    # Boolean matrix (which terms occur in which sentence)
    rows, cols = x.nonzero()
    data = np.ones(rows.shape[0])
    z = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    # Inverse sentence length; too short sentences get a huge length so their similarity vanishes
    size = np.asarray(x.sum(axis=1)).reshape(-1)
    size[np.where(size <= min_length)] = 10000
    size = np.log(size)

    mat = []
    for bidx in range(math.ceil(n_rows / batch_size)):

        # slicing
        b = int(bidx * batch_size)
        e = min(n_rows, int((bidx + 1) * batch_size))

        # dot product: number of terms shared by sentence i and sentence j
        inner = z[b:e, :] * z.transpose()

        # sentence length norm[i,j] = 1 / (size[i] + size[j])
        norm = size[b:e].reshape(-1, 1) + size.reshape(1, -1)
        norm = norm ** (-1)
        norm[np.where(norm == np.inf)] = 0

        # normalize and keep entries above the minimum similarity
        sim = inner.multiply(norm).tocsr()
        rows, cols = (sim >= min_sim).nonzero()
        data = np.asarray(sim[rows, cols]).reshape(-1)

        # append
        mat.append(csr_matrix((data, (rows, cols)), shape=(e - b, n_rows)))

        if verbose:
            print('\rcalculating textrank sentence similarity {} / {}'.format(b, n_rows), end='')

    mat = sp.sparse.vstack(mat)
    if verbose:
        print('\rcalculating textrank sentence similarity was done with {} sents'.format(n_rows))

    return mat

def graph_with_python_sim(tokens, verbose, similarity, min_sim):
    if similarity == 'cosine':
        similarity = cosine_sent_sim
    elif callable(similarity):
        similarity = similarity
    else:
        similarity = textrank_sent_sim

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i, tokens_i in enumerate(tokens):
        if verbose and i % 1000 == 0:
            print('\rconstructing sentence graph {} / {} ...'.format(i, n_sents), end='')
        for j, tokens_j in enumerate(tokens):
            if i >= j:
                continue
            sim = similarity(tokens_i, tokens_j)
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
            # store the symmetric entry as well, matching the numpy-based builders
            rows.append(j)
            cols.append(i)
            data.append(sim)
    if verbose:
        print('\rsentence graph was constructed from {} sents'.format(n_sents))
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

def textrank_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    n1 = len(s1)
    n2 = len(s2)
    if (n1 <= 1) or (n2 <= 1):
        return 0
    common = len(set(s1).intersection(set(s2)))
    base = math.log(n1) + math.log(n2)
    return common / base

def cosine_sent_sim(s1, s2):
    """
    Arguments
    ---------
    s1, s2 : list of str
        Tokenized sentences
    Returns
    -------
    Sentence similarity : float
        Non-negative number
    """
    if (not s1) or (not s2):
        return 0

    s1 = Counter(s1)
    s2 = Counter(s2)
    norm1 = math.sqrt(sum(v ** 2 for v in s1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in s2.values()))
    prod = 0
    for k, v in s1.items():
        prod += v * s2.get(k, 0)
    return prod / (norm1 * norm2)
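
The TextRank sentence similarity implemented above is the number of words shared by two sentences divided by (log |s1| + log |s2|). A short sketch comparing the two pure-Python similarity functions and building a sentence graph; the toy sentences, the whitespace tokenizer, and the module path in the import are assumptions for illustration:

from textrank.sentence import sent_graph, textrank_sent_sim, cosine_sent_sim

s1 = 'the cat sat on the mat'.split()
s2 = 'the cat chased the dog'.split()
print(textrank_sent_sim(s1, s2))   # 2 shared words / (log 6 + log 5)
print(cosine_sent_sim(s1, s2))     # bag-of-words cosine

sents = ['the cat sat on the mat', 'the cat chased the dog', 'a dog barked']
g = sent_graph(sents, tokenize=lambda s: s.split(), min_count=1, min_sim=0.1)
print(g.todense())                 # (3, 3) sparse sentence similarity matrix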
import numpy as np
from .rank import pagerank
from .sentence import sent_graph
from .word import word_graph


class KeywordSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words to be used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
                 window=-1, min_cooccurrence=2, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
        Returns
        -------
        None
        """

        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents)
        return self.keywords(topk)


class KeysentenceSummarizer:
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words to be used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in the sentence graph
    similarity : callable or str
        available similarity = [callable, 'cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
                 min_sim=0.3, similarity=None, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key sentences to be selected
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)
        Usage
        -----
        >>> from textrank import KeysentenceSummarizer
        >>> summarizer = KeysentenceSummarizer(tokenize=tokenizer, min_sim=0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
        self.train_textrank(sents, bias)
        idxs = self.R.argsort()[-topk:]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
        return keysents
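
KeysentenceSummarizer accepts a per-sentence bias array, which pagerank rescales and mixes in as the (1 - df) teleport term, so individual sentences can be favored, for example to add a lead-sentence preference. A sketch with a hypothetical corpus, tokenizer, and bias weighting:

import numpy as np
from textrank import KeysentenceSummarizer   # exported by the package __init__ above

sents = [
    'the cat sat on the mat',
    'the cat chased the dog',
    'a dog barked at the cat',
    'the mat was red',
]
summarizer = KeysentenceSummarizer(tokenize=lambda s: s.split(), min_count=1, min_sim=0.1)

# unbiased ranking
for idx, rank, sent in summarizer.summarize(sents, topk=2):
    print(idx, rank, sent)

# favor the first sentence (hypothetical weighting)
bias = np.array([3.0, 1.0, 1.0, 1.0])
for idx, rank, sent in summarizer.summarize(sents, topk=2, bias=bias):
    print(idx, rank, sent)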
from collections import Counter
from scipy.sparse import csr_matrix
import numpy as np


def scan_vocabulary(sents, tokenize=None, min_count=2):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str.
        If None, whitespace splitting is used.
    min_count : int
        Minimum term frequency
    Returns
    -------
    idx_to_vocab : list of str
        Vocabulary list
    vocab_to_idx : dict
        Vocabulary to index mapper.
    """
    if tokenize is None:
        # fall back to whitespace tokenization when no tokenizer is given
        tokenize = lambda sent: sent.split()
    counter = Counter(w for sent in sents for w in tokenize(sent))
    counter = {w:c for w,c in counter.items() if c >= min_count}
    idx_to_vocab = [w for w, _ in sorted(counter.items(), key=lambda x:-x[1])]
    vocab_to_idx = {vocab:idx for idx, vocab in enumerate(idx_to_vocab)}
    return idx_to_vocab, vocab_to_idx

def tokenize_sents(sents, tokenize):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(sent) returns list of str (word sequence)
    Returns
    -------
    tokenized sentence list : list of list of str
    """
    if tokenize is not None:
        return [tokenize(sent) for sent in sents]
    else:
        return sents

def vectorize(tokens, vocab_to_idx):
    """
    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    Returns
    -------
    sentence bow : scipy.sparse.csr_matrix
        shape = (n_sents, n_terms)
    """
    rows, cols, data = [], [], []
    for i, tokens_i in enumerate(tokens):
        for t, c in Counter(tokens_i).items():
            j = vocab_to_idx.get(t, -1)
            if j == -1:
                continue
            rows.append(i)
            cols.append(j)
            data.append(c)
    n_sents = len(tokens)
    n_terms = len(vocab_to_idx)
    x = csr_matrix((data, (rows, cols)), shape=(n_sents, n_terms))
    return x
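
A quick sketch of the helpers above: scan_vocabulary keeps terms above the frequency threshold and orders them by count, and vectorize turns tokenized sentences into a sparse bag-of-words matrix. The toy data and the module path in the import are assumptions for illustration:

from textrank.utils import scan_vocabulary, tokenize_sents, vectorize

sents = ['the cat sat', 'the cat ran', 'a dog ran']
tokenize = lambda s: s.split()

idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count=2)
print(idx_to_vocab)     # e.g. ['the', 'cat', 'ran'] - only words occurring at least twice

tokens = tokenize_sents(sents, tokenize)
x = vectorize(tokens, vocab_to_idx)
print(x.shape)          # (3 sentences, 3 surviving terms)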
from collections import defaultdict
from scipy.sparse import csr_matrix

from .utils import scan_vocabulary
from .utils import tokenize_sents


def word_graph(sents, tokenize=None, min_count=2, window=2,
               min_cooccurrence=2, vocab_to_idx=None, verbose=False):
    """
    Arguments
    ---------
    sents : list of str
        Sentence list
    tokenize : callable
        tokenize(str) returns list of str
    min_count : int
        Minimum term frequency
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    vocab_to_idx : dict
        Vocabulary to index mapper.
        If None, this function scans the vocabulary first.
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    co-occurrence word graph : scipy.sparse.csr_matrix
    idx_to_vocab : list of str
        Word list corresponding to rows and columns
    """
    if vocab_to_idx is None:
        idx_to_vocab, vocab_to_idx = scan_vocabulary(sents, tokenize, min_count)
    else:
        idx_to_vocab = [vocab for vocab, _ in sorted(vocab_to_idx.items(), key=lambda x:x[1])]

    tokens = tokenize_sents(sents, tokenize)
    g = cooccurrence(tokens, vocab_to_idx, window, min_cooccurrence, verbose)
    return g, idx_to_vocab

def cooccurrence(tokens, vocab_to_idx, window=2, min_cooccurrence=2, verbose=False):
    """
    Arguments
    ---------
    tokens : list of list of str
        Tokenized sentence list
    vocab_to_idx : dict
        Vocabulary to index mapper
    window : int
        Co-occurrence window size
    min_cooccurrence : int
        Minimum cooccurrence frequency
    verbose : Boolean
        If True, verbose mode on
    Returns
    -------
    co-occurrence matrix : scipy.sparse.csr_matrix
        shape = (n_vocabs, n_vocabs)
    """
    counter = defaultdict(int)
    for s, tokens_i in enumerate(tokens):
        if verbose and s % 1000 == 0:
            print('\rword cooccurrence counting {}'.format(s), end='')
        vocabs = [vocab_to_idx[w] for w in tokens_i if w in vocab_to_idx]
        n = len(vocabs)
        for i, v in enumerate(vocabs):
            # window <= 0 means the whole sentence is one co-occurrence window
            if window <= 0:
                b, e = 0, n
            else:
                b = max(0, i - window)
                e = min(i + window, n)
            for j in range(b, e):
                if i == j:
                    continue
                counter[(v, vocabs[j])] += 1
                counter[(vocabs[j], v)] += 1
    counter = {k:v for k,v in counter.items() if v >= min_cooccurrence}
    n_vocabs = len(vocab_to_idx)
    if verbose:
        print('\rword cooccurrence counting was done with {} sents'.format(s + 1))
    return dict_to_mat(counter, n_vocabs, n_vocabs)

def dict_to_mat(d, n_rows, n_cols):
    """
    Arguments
    ---------
    d : dict
        key : (i,j) tuple
        value : float value
    Returns
    -------
    scipy.sparse.csr_matrix
    """
    rows, cols, data = [], [], []
    for (i, j), v in d.items():
        rows.append(i)
        cols.append(j)
        data.append(v)
    return csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))
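
word_graph above returns the co-occurrence matrix together with the vocabulary labelling its rows and columns; this is the graph KeywordSummarizer passes to pagerank. A small sketch with a hypothetical corpus, tokenizer, and module path:

from textrank.word import word_graph

sents = ['the cat sat on the mat', 'the cat chased the dog', 'the dog sat on the mat']
g, idx_to_vocab = word_graph(sents, tokenize=lambda s: s.split(),
                             min_count=2, window=2, min_cooccurrence=1)

print(g.shape)          # (n_vocabs, n_vocabs)
print(idx_to_vocab)     # words kept after the min_count filter, ordered by frequency
print(g.todense())      # symmetric co-occurrence counts within the 2-word window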