[Add] 클래스화한 Summarizer 추가

GyuhoLee
Commit 1686df14f4a18b37580825fec87df90597107d79 1686df14 1 parent ae99d65c
Showing 1 changed file with 190 additions and 0 deletions
src/textrank/summarizer.py
--- a/src/textrank/summarizer.py 0 → 100644
View file @1686df1
+++ b/src/textrank/summarizer.py 0 → 100644
View file @1686df1
+ import numpy as np
+ from .rank import pagerank
+ from .sentence import sent_graph
+ from .word import word_graph
+ 
+ 
class KeywordSummarizer:
    """
    Rank words with PageRank over a word graph and return the top keywords.

    Arguments
    ---------
    sents : list of str or None
        Sentence list. If it is not None, TextRank is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words that will be used to construct the word graph
    window : int
        Word cooccurrence window size. Default is -1.
        '-1' means there is cooccurrence between two words if the words occur in a sentence
    min_cooccurrence : int
        Minimum cooccurrence frequency of two words
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        window=-1, min_cooccurrence=2, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.window = window
        self.min_cooccurrence = min_cooccurrence
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the word graph from `sents` and store the PageRank scores.

        Sets `self.idx_to_vocab` and `self.R` (flattened rank vector).

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
        Returns
        -------
        None
        """
        g, self.idx_to_vocab = word_graph(
            sents, self.tokenize, self.min_count, self.window,
            self.min_cooccurrence, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n words = {}'.format(self.R.shape[0]))

    def keywords(self, topk=30):
        """
        Return the `topk` highest-ranked words, best first.

        Arguments
        ---------
        topk : int
            Number of keywords selected from TextRank.
            Values <= 0 select nothing; values larger than the vocabulary
            return every word.
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        Raises
        ------
        RuntimeError
            If TextRank has not been trained yet.
        """
        if not hasattr(self, 'R'):
            raise RuntimeError('Train textrank first or use summarize function')
        if topk <= 0:
            # `[-0:]` is `[0:]`, i.e. *everything*, and a negative topk would
            # slice from the wrong end; both must yield an empty selection.
            return []
        idxs = self.R.argsort()[-topk:]
        return [(self.idx_to_vocab[idx], self.R[idx]) for idx in reversed(idxs)]

    def summarize(self, sents, topk=30, bias=None):
        """
        Train TextRank on `sents` and return the top keywords.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of keywords selected from TextRank
        bias : None or numpy.ndarray
            PageRank bias term, forwarded to `train_textrank`
        Returns
        -------
        keywords : list of tuple
            Each tuple stands for (word, rank)
        """
        self.train_textrank(sents, bias)
        return self.keywords(topk)
+ 
+ 
class KeysentenceSummarizer:
    """
    Rank sentences with PageRank over a sentence graph and return the top ones.

    Arguments
    ---------
    sents : list of str or None
        Sentence list. If it is not None, TextRank is trained immediately.
    tokenize : callable
        Tokenize function: tokenize(str) = list of str
    min_count : int
        Minimum frequency of words that will be used to construct the sentence graph
    min_sim : float
        Minimum similarity between sentences in sentence graph
    similarity : str
        available similarity = ['cosine', 'textrank']
    vocab_to_idx : dict or None
        Vocabulary to index mapper
    df : float
        PageRank damping factor
    max_iter : int
        Number of PageRank iterations
    verbose : Boolean
        If True, it shows training progress
    """
    def __init__(self, sents=None, tokenize=None, min_count=2,
        min_sim=0.3, similarity=None, vocab_to_idx=None,
        df=0.85, max_iter=30, verbose=False):

        self.tokenize = tokenize
        self.min_count = min_count
        self.min_sim = min_sim
        self.similarity = similarity
        self.vocab_to_idx = vocab_to_idx
        self.df = df
        self.max_iter = max_iter
        self.verbose = verbose

        if sents is not None:
            self.train_textrank(sents)

    def train_textrank(self, sents, bias=None):
        """
        Build the sentence graph from `sents` and store the PageRank scores.

        Sets `self.R` (flattened rank vector).

        Arguments
        ---------
        sents : list of str
            Sentence list
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        None
        """
        g = sent_graph(
            sents, self.tokenize, self.min_count,
            self.min_sim, self.similarity, self.vocab_to_idx, self.verbose)
        self.R = pagerank(g, self.df, self.max_iter, bias).reshape(-1)
        if self.verbose:
            print('trained TextRank. n sentences = {}'.format(self.R.shape[0]))

    def summarize(self, sents, topk=30, bias=None):
        """
        Train TextRank on `sents` and return the top key-sentences, best first.

        Arguments
        ---------
        sents : list of str
            Sentence list
        topk : int
            Number of key-sentences to be selected.
            Values <= 0 select nothing; values larger than the number of
            ranked sentences return all of them.
        bias : None or numpy.ndarray
            PageRank bias term
            Shape must be (n_sents,)
        Returns
        -------
        keysents : list of tuple
            Each tuple stands for (sentence index, rank, sentence)
        Raises
        ------
        ValueError
            If bias is neither None nor a numpy.ndarray of shape (n_sents,).
        Usage
        -----
            >>> from textrank import KeysentenceSummarizer
            >>> summarizer = KeysentenceSummarizer(tokenize = tokenizer, min_sim = 0.5)
            >>> keysents = summarizer.summarize(texts, topk=30)
        """
        n_sents = len(sents)
        if isinstance(bias, np.ndarray):
            if bias.shape != (n_sents,):
                raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
        elif bias is not None:
            raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))

        # Train even when nothing will be selected so that self.R is refreshed,
        # exactly as it was before the topk guard existed.
        self.train_textrank(sents, bias)
        if topk <= 0:
            # `[-0:]` is `[0:]`, i.e. *everything*, and a negative topk would
            # slice from the wrong end; both must yield an empty selection.
            return []
        idxs = self.R.argsort()[-topk:]
        return [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]
\ No newline at end of file