Showing
1 changed file
with
190 additions
and
0 deletions
src/textrank/summarizer.py
0 → 100644
| 1 | +import numpy as np | ||
| 2 | +from .rank import pagerank | ||
| 3 | +from .sentence import sent_graph | ||
| 4 | +from .word import word_graph | ||
| 5 | + | ||
| 6 | + | ||
class KeywordSummarizer:
    """
    Keyword extractor: ranks words by running PageRank over a word graph.

    Arguments
    ---------
    sents : list of str or None
        Sentence list. If it is not None, TextRank is trained in the constructor.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words will be used to construct word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the word graph from `sents` and store the PageRank scores.

        Sets `self.idx_to_vocab` (returned by `word_graph`) and `self.R`
        (the flattened PageRank result).

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term

        Returns
        -------
        None
        """

        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Return the highest ranked words of the trained model.

        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank.
            A value of 0 or less selects nothing.

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank), sorted by descending rank

        Raises
        ------
        RuntimeError
            If TextRank has not been trained yet
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # Guard explicitly: a slice of [-0:] is [0:] and would return every
        # word instead of none.
        if topk <= 0:
            return []
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30, bias=None):
        """
        Train TextRank on `sents` and return the top keywords.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        bias : None or numpy.ndarray
            PageRank bias term, forwarded unchanged to train_textrank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents, bias)
        return self.keywords(topk)
| 99 | + | ||
| 100 | + | ||
class KeysentenceSummarizer:
    """
    Key-sentence extractor: ranks sentences by running PageRank over a sentence graph.

    Arguments
    ---------
    sents : list of str or None
        Sentence list. If it is not None, TextRank is trained in the constructor.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words will be used to construct sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the sentence graph from `sents` and store the PageRank scores
        in `self.R` (flattened to one dimension).

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        None
        """
        g = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def _top_sentences(self, sents, topk):
        """
        Pick the `topk` highest ranked sentences using the scores in `self.R`.

        Arguments
        ---------
        sents : list of str
            Sentence list, indexed the same way as `self.R`
        topk : int
            Number of key-sentences to be selected.
            A value of 0 or less selects nothing.

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence),
            sorted by descending rank
        """
        # Guard explicitly: a slice of [-0:] is [0:] and would return every
        # sentence instead of none.
        if topk <= 0:
            return []
        idxs = self.R.argsort()[-topk:]
        return [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]

    def summarize(self, sents, topk=30, bias=None):
        """
        Train TextRank on `sents` and return the top key-sentences.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)

        Raises
        ------
        ValueError
            If bias is neither None nor a numpy.ndarray, or its shape is not (n_sents,)

        Usage
        -----
            >>> from textrank import KeysentenceSummarizer
            >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
            >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        # Validate bias before the graph is built and ranked.
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        self.train_textrank(sents, bias)
        return self._top_sentences(sents, topk)
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment