GyuhoLee

[Add] 클래스화한 Summarizer 추가

1 +import numpy as np
2 +from .rank import pagerank
3 +from .sentence import sent_graph
4 +from .word import word_graph
5 +
6 +
class KeywordSummarizer:
    """TextRank-based keyword extractor.

    Builds a word cooccurrence graph from tokenized sentences and ranks
    words with PageRank.

    Arguments
    ---------
    sents : list of str
        Sentence list. If not None, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words that will be used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """

    def __init__(self, sents=None, tokenize=None, min_count=2,
                 window=-1, min_cooccurrence=2, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        # Eagerly train when sentences are provided at construction time.
        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Build the word graph from `sents` and run PageRank.

        Stores the rank vector in `self.R` and the index-to-word mapping
        in `self.idx_to_vocab`.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term

        Returns
        -------
        None
        """
        g, self.idx_to_vocab = word_graph(sents,
            self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        # Flatten to a 1-D rank vector indexed by word id.
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """Return the top-ranked words in descending rank order.

        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)

        Raises
        ------
        RuntimeError
            If the model has not been trained yet.
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        # argsort is ascending; take the last topk then reverse for descending rank.
        idxs = self.R.argsort()[-topk:]
        keywords = [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]
        return keywords

    def summarize(self, sents, topk=30, bias=None):
        """Train TextRank on `sents` and return the top keywords.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        bias : None or numpy.ndarray
            PageRank bias term, forwarded to `train_textrank`. Added for
            consistency with KeysentenceSummarizer.summarize; the default
            None preserves the previous behavior.

        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents, bias)
        return self.keywords(topk)
99 +
100 +
class KeysentenceSummarizer:
    """TextRank-based key-sentence extractor.

    Builds a sentence-similarity graph and ranks sentences with PageRank.

    Arguments
    ---------
    sents : list of str
        Sentence list. If not None, the model is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """

    def __init__(self, sents=None, tokenize=None, min_count=2,
                 min_sim=0.3, similarity=None, vocab_to_idx=None,
                 df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        # Eagerly train when sentences are provided at construction time.
        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """Construct the sentence graph and compute PageRank scores.

        Stores the flattened rank vector (one score per sentence) in `self.R`.

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        None
        """
        graph = sent_graph(sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        ranks = pagerank(graph, self.df, self.max_iter, bias)
        self.R = ranks.reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """Train TextRank on `sents` and return the top-ranked sentences.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)

        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)

        Raises
        ------
        ValueError
            If `bias` is neither None nor an ndarray of shape (n_sents,).

        Usage
        -----
        >>> from textrank import KeysentenceSummarizer
        >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
        >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        # Validate the bias term before any training work is done.
        if bias is not None:
            if not isinstance(bias, np.ndarray):
                raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))

        self.train_textrank(sents, bias)
        # Ascending argsort, keep the topk tail, flip for descending rank.
        order = self.R.argsort()[-topk:][::-1]
        return [(idx, self.R[idx], sents[idx]) for idx in order]
...\ No newline at end of file ...\ No newline at end of file