Showing 1 changed file with 190 additions and 0 deletions.
src/textrank/summarizer.py
0 → 100644
1 | +import numpy as np | ||
2 | +from .rank import pagerank | ||
3 | +from .sentence import sent_graph | ||
4 | +from .word import word_graph | ||
5 | + | ||
6 | + | ||
class KeywordSummarizer:
    """TextRank-based keyword extractor.

    Builds a word cooccurrence graph from tokenized sentences and ranks
    words with PageRank.

    Arguments
    ---------
    sents : list of str
        Sentence list. When given, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means two words cooccur if they appear in the same sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Build the word graph from `sents` and run PageRank over it.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term

        Returns
        -------
        None
        """
        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        # R holds one PageRank score per vocabulary word
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """Return the `topk` highest-ranked words of the trained model.

        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank), sorted by rank in
            descending order

        Raises
        ------
        RuntimeError
            If the model has not been trained yet
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # Sort descending first, then cut. The former `argsort()[-topk:]`
        # returned ALL words when topk == 0, because [-0:] == [0:].
        idxs = self.R.argsort()[::-1][:topk]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in idxs]
        return keywords

    def summarize(self, sents, topk=30, bias=None):
        """Train TextRank on `sents` and return the top keywords.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        bias : None or numpy.ndarray
            PageRank bias term, forwarded to `train_textrank`
            (mirrors KeysentenceSummarizer.summarize)

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents, bias)
        return self.keywords(topk)
99 | + | ||
100 | + | ||
class KeysentenceSummarizer:
    """TextRank-based key-sentence extractor.

    Builds a sentence similarity graph and ranks sentences with PageRank.

    Arguments
    ---------
    sents : list of str
        Sentence list. When given, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Build the sentence graph from `sents` and run PageRank over it.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        # R holds one PageRank score per input sentence
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """Train TextRank on `sents` and return the top key-sentences.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence),
            sorted by rank in descending order

        Raises
        ------
        ValueError
            If `bias` is neither None nor an ndarray of shape (n_sents,)

        Usage
        -----
        >>> from textrank import KeysentenceSummarizer
        >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        # Validate the bias before spending time on graph construction
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        self.train_textrank(sents, bias)
        # Sort descending first, then cut. The former `argsort()[-topk:]`
        # returned ALL sentences when topk == 0, because [-0:] == [0:].
        idxs = self.R.argsort()[::-1][:topk]
        keysents = [(idx, self.R[idx], sents[idx]) for idx in idxs]
        return keysents
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment