Showing
9 changed files
with
61 additions
and
15 deletions
| ... | @@ -19,7 +19,11 @@ | ... | @@ -19,7 +19,11 @@ |
| 19 | <select /> | 19 | <select /> |
| 20 | </component> | 20 | </component> |
| 21 | <component name="ChangeListManager"> | 21 | <component name="ChangeListManager"> |
| 22 | - <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" /> | 22 | + <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment=""> |
| 23 | + <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> | ||
| 24 | + <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" /> | ||
| 25 | + <change beforePath="$PROJECT_DIR$/textrank/summarizer.py" beforeDir="false" afterPath="$PROJECT_DIR$/textrank/summarizer.py" afterDir="false" /> | ||
| 26 | + </list> | ||
| 23 | <option name="SHOW_DIALOG" value="false" /> | 27 | <option name="SHOW_DIALOG" value="false" /> |
| 24 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> | 28 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> |
| 25 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> | 29 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> |
| ... | @@ -103,21 +107,25 @@ | ... | @@ -103,21 +107,25 @@ |
| 103 | <screen x="0" y="0" width="1920" height="1040" /> | 107 | <screen x="0" y="0" width="1920" height="1040" /> |
| 104 | </state> | 108 | </state> |
| 105 | <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" /> | 109 | <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" /> |
| 106 | - <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1604306110978"> | 110 | + <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1607070330004"> |
| 107 | - <screen x="1920" y="0" width="1920" height="1040" /> | 111 | + <screen x="0" y="0" width="1920" height="1040" /> |
| 108 | </state> | 112 | </state> |
| 113 | + <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
| 109 | <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 114 | <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
| 110 | - <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1604306110978"> | 115 | + <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1607070330004"> |
| 111 | - <screen x="1920" y="0" width="1920" height="1040" /> | 116 | + <screen x="0" y="0" width="1920" height="1040" /> |
| 112 | </state> | 117 | </state> |
| 118 | + <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
| 113 | <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 119 | <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
| 114 | - <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1604306110978"> | 120 | + <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1607070330004"> |
| 115 | - <screen x="1920" y="0" width="1920" height="1040" /> | 121 | + <screen x="0" y="0" width="1920" height="1040" /> |
| 116 | </state> | 122 | </state> |
| 123 | + <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
| 117 | <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 124 | <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
| 118 | - <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1604306110978"> | 125 | + <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1607070330004"> |
| 119 | - <screen x="1920" y="0" width="1920" height="1040" /> | 126 | + <screen x="0" y="0" width="1920" height="1040" /> |
| 120 | </state> | 127 | </state> |
| 128 | + <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
| 121 | <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 129 | <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
| 122 | <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485"> | 130 | <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485"> |
| 123 | <screen x="1920" y="0" width="1920" height="1040" /> | 131 | <screen x="1920" y="0" width="1920" height="1040" /> | ... | ... |
| 1 | from pytube import YouTube | 1 | from pytube import YouTube |
| 2 | +from PyKomoran import * | ||
| 2 | from xml.etree import ElementTree | 3 | from xml.etree import ElementTree |
| 4 | +from textrank import KeywordSummarizer | ||
| 5 | +from textrank import KeysentenceSummarizer | ||
| 6 | +import numpy as np | ||
| 7 | +from operator import itemgetter | ||
| 8 | + | ||
| 9 | +def komoran_tokenize(sent): | ||
| 10 | + words = sent.split() | ||
| 11 | + words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)] | ||
| 12 | + return words | ||
| 3 | 13 | ||
| 4 | #youtube url의 자막 -> xml으로 가져오기 | 14 | #youtube url의 자막 -> xml으로 가져오기 |
| 15 | +topk_size = 30 | ||
| 5 | video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' | 16 | video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' |
| 6 | yt = YouTube(video_url) | 17 | yt = YouTube(video_url) |
| 7 | title = yt.title | 18 | title = yt.title |
| ... | @@ -9,10 +20,38 @@ description = yt.description | ... | @@ -9,10 +20,38 @@ description = yt.description |
| 9 | caption = yt.captions.get_by_language_code('ko') | 20 | caption = yt.captions.get_by_language_code('ko') |
| 10 | caption_xml = caption.xml_captions | 21 | caption_xml = caption.xml_captions |
| 11 | 22 | ||
| 12 | -#xml -> string list로 파싱(문장별) | 23 | +#xml -> string list로 파싱 |
| 13 | root = ElementTree.fromstring(caption_xml) | 24 | root = ElementTree.fromstring(caption_xml) |
| 14 | -sentences = [] | 25 | +texts = [] |
| 15 | -print(root.tag, root.attrib) | 26 | +texts.append(title) |
| 16 | for child in root.findall("text"): | 27 | for child in root.findall("text"): |
| 17 | - sentences.append(child.text.replace('\n', ' ')) | ||
| 18 | -print(sentences) | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
| 28 | + text = child.text.replace('\n', ' ') | ||
| 29 | + texts.append(text) | ||
| 30 | +topk_size = texts.size() * 100 // topk_size | ||
| 31 | + | ||
| 32 | +#Komoran을 통해 형태소 단위로 분리 후 태깅 | ||
| 33 | +komoran = Komoran('STABLE') | ||
| 34 | +sents = [] | ||
| 35 | +for text in texts: | ||
| 36 | + tokened_text = komoran.get_plain_text(text) | ||
| 37 | + sents.append(tokened_text) | ||
| 38 | + | ||
| 39 | +keyword_extractor = KeywordSummarizer( | ||
| 40 | + tokenize = komoran_tokenize, | ||
| 41 | + window = -1, | ||
| 42 | + verbose = False | ||
| 43 | +) | ||
| 44 | +keywords = keyword_extractor.summarize(sents, topk=30) | ||
| 45 | + | ||
| 46 | +summarizer = KeysentenceSummarizer( | ||
| 47 | + tokenize = lambda x:x.split(), | ||
| 48 | + min_sim = 0.5, | ||
| 49 | + verbose = False | ||
| 50 | +) | ||
| 51 | +bias = np.ones(len(texts)) | ||
| 52 | +bias[0] = 5 | ||
| 53 | +keysents = summarizer.summarize(texts, topk=topk_size, bias=bias) | ||
| 54 | +keysents.sort(key=itemgetter(0)) | ||
| 55 | +for _, _, sent in keysents: | ||
| 56 | + sent = sent.replace('&#39;', "'") | ||
| 57 | + print(sent) | ... | ... |
No preview for this file type
src/textrank/__pycache__/rank.cpython-38.pyc
0 → 100644
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
src/textrank/__pycache__/word.cpython-38.pyc
0 → 100644
No preview for this file type
| ... | @@ -183,7 +183,6 @@ class KeysentenceSummarizer: | ... | @@ -183,7 +183,6 @@ class KeysentenceSummarizer: |
| 183 | raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape)) | 183 | raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape)) |
| 184 | elif bias is not None: | 184 | elif bias is not None: |
| 185 | raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias))) | 185 | raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias))) |
| 186 | - | ||
| 187 | self.train_textrank(sents, bias) | 186 | self.train_textrank(sents, bias) |
| 188 | idxs = self.R.argsort()[-topk:] | 187 | idxs = self.R.argsort()[-topk:] |
| 189 | keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)] | 188 | keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)] | ... | ... |
-
Please register or login to post a comment