[Update] 자막 요약 알고리즘 구현 완료

GyuhoLee
Commit 2d59cf8142fbd045f0980157edd36e942a530608 2d59cf81 1 parent 418496ff
Showing 9 changed files with 61 additions and 15 deletions
src/.idea/workspace.xml
src/subtitle.py
src/textrank/__pycache__/__init__.cpython-38.pyc
src/textrank/__pycache__/rank.cpython-38.pyc
src/textrank/__pycache__/sentence.cpython-38.pyc
src/textrank/__pycache__/summarizer.cpython-38.pyc
src/textrank/__pycache__/utils.cpython-38.pyc
src/textrank/__pycache__/word.cpython-38.pyc
src/textrank/summarizer.py
--- a/src/.idea/workspace.xml
View file @2d59cf8
+++ b/src/.idea/workspace.xml
View file @2d59cf8
@@ -19,7 +19,11 @@
     <select />
   </component>
   <component name="ChangeListManager">
-     <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" />
+     <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="">
+       <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/textrank/summarizer.py" beforeDir="false" afterPath="$PROJECT_DIR$/textrank/summarizer.py" afterDir="false" />
+     </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -103,21 +107,25 @@
       <screen x="0" y="0" width="1920" height="1040" />
     </state>
     <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" />
-     <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1604306110978">
-       <screen x="1920" y="0" width="1920" height="1040" />
+     <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1607070330004">
+       <screen x="0" y="0" width="1920" height="1040" />
     </state>
+     <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
     <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
-     <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1604306110978">
-       <screen x="1920" y="0" width="1920" height="1040" />
+     <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1607070330004">
+       <screen x="0" y="0" width="1920" height="1040" />
     </state>
+     <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
     <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
-     <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1604306110978">
-       <screen x="1920" y="0" width="1920" height="1040" />
+     <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1607070330004">
+       <screen x="0" y="0" width="1920" height="1040" />
     </state>
+     <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
     <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
-     <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1604306110978">
-       <screen x="1920" y="0" width="1920" height="1040" />
+     <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1607070330004">
+       <screen x="0" y="0" width="1920" height="1040" />
     </state>
+     <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" />
     <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" />
     <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485">
       <screen x="1920" y="0" width="1920" height="1040" />
--- a/src/subtitle.py
View file @2d59cf8
+++ b/src/subtitle.py
View file @2d59cf8
 from pytube import YouTube
+ from PyKomoran import *
 from xml.etree import ElementTree
+ from textrank import KeywordSummarizer
+ from textrank import KeysentenceSummarizer
+ import numpy as np
+ from operator import itemgetter
+ 
+ def komoran_tokenize(sent):
+     """Return the whitespace-separated tokens of `sent` that carry a kept tag.
+ 
+     A token is kept when it contains any of the substrings '/NN', '/XR',
+     '/VA' or '/VV' (presumably Komoran POS tags for nouns, roots, adjectives
+     and verbs -- confirm against the Komoran tag set). Token order is
+     preserved; an empty or all-whitespace string yields an empty list.
+     """
+     kept_tags = ('/NN', '/XR', '/VA', '/VV')
+     return [token for token in sent.split() if any(tag in token for tag in kept_tags)]
 
 #youtube url의 자막 -> xml으로 가져오기
+ topk_size = 30
 video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318'
 yt = YouTube(video_url)
 title = yt.title
@@ -9,10 +20,38 @@ description = yt.description
 caption = yt.captions.get_by_language_code('ko')
 caption_xml = caption.xml_captions
 
- #xml -> string list로 파싱(문장별)
+ #xml -> string list로 파싱
 root = ElementTree.fromstring(caption_xml)
- sentences = []
- print(root.tag, root.attrib)
+ texts = []
+ texts.append(title)
 for child in root.findall("text"):
-     sentences.append(child.text.replace('\n', ' '))
- print(sentences)
\ No newline at end of file
+     text = child.text.replace('\n', ' ')
+     texts.append(text)
+ # Up to here topk_size is the summary ratio in percent; convert it into an
+ # absolute number of sentences. `texts` is a plain list, so its length comes
+ # from len() (lists have no .size() method). The count is clamped to at least
+ # 1 because the summarizer slices with [-topk:], and topk == 0 would select
+ # every sentence instead of none.
+ topk_size = max(1, len(texts) * topk_size // 100)
+ 
+ # Split every text into morphemes with Komoran and attach the POS tags
+ komoran = Komoran('STABLE')
+ sents = [komoran.get_plain_text(text) for text in texts]
+ 
+ # Keyword extraction runs on the tagged text, filtered by komoran_tokenize
+ keyword_extractor = KeywordSummarizer(
+     tokenize=komoran_tokenize,
+     window=-1,
+     verbose=False,
+ )
+ keywords = keyword_extractor.summarize(sents, topk=30)
+ 
+ # Key-sentence extraction runs on the raw (untagged) texts
+ summarizer = KeysentenceSummarizer(
+     tokenize=lambda x: x.split(),
+     min_sim=0.5,
+     verbose=False,
+ )
+ # texts[0] is the video title, so give it a larger bias than the captions
+ bias = np.ones(len(texts))
+ bias[0] = 5
+ keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
+ # Each entry is (index, rank, sentence); sort by index to restore the
+ # original caption order before printing
+ keysents.sort(key=itemgetter(0))
+ for _, _, sent in keysents:
+     # The caption XML leaves apostrophes as the HTML entity &#39;
+     sent = sent.replace('&#39;', "'")
+     print(sent)
--- a/src/textrank/__pycache__/__init__.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/__init__.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/__pycache__/rank.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/rank.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/__pycache__/sentence.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/sentence.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/__pycache__/summarizer.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/summarizer.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/__pycache__/utils.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/utils.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/__pycache__/word.cpython-38.pyc 0 → 100644
View file @2d59cf8
+++ b/src/textrank/__pycache__/word.cpython-38.pyc 0 → 100644
View file @2d59cf8
--- a/src/textrank/summarizer.py
View file @2d59cf8
+++ b/src/textrank/summarizer.py
View file @2d59cf8
@@ -183,7 +183,6 @@ class KeysentenceSummarizer:
                 raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape))
         elif bias is not None:
             raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias)))
- 
         self.train_textrank(sents, bias)
         idxs = self.R.argsort()[-topk:]
         keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)]