Showing
9 changed files
with
61 additions
and
15 deletions
... | @@ -19,7 +19,11 @@ | ... | @@ -19,7 +19,11 @@ |
19 | <select /> | 19 | <select /> |
20 | </component> | 20 | </component> |
21 | <component name="ChangeListManager"> | 21 | <component name="ChangeListManager"> |
22 | - <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" /> | 22 | + <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment=""> |
23 | + <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> | ||
24 | + <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" /> | ||
25 | + <change beforePath="$PROJECT_DIR$/textrank/summarizer.py" beforeDir="false" afterPath="$PROJECT_DIR$/textrank/summarizer.py" afterDir="false" /> | ||
26 | + </list> | ||
23 | <option name="SHOW_DIALOG" value="false" /> | 27 | <option name="SHOW_DIALOG" value="false" /> |
24 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> | 28 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> |
25 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> | 29 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> |
... | @@ -103,21 +107,25 @@ | ... | @@ -103,21 +107,25 @@ |
103 | <screen x="0" y="0" width="1920" height="1040" /> | 107 | <screen x="0" y="0" width="1920" height="1040" /> |
104 | </state> | 108 | </state> |
105 | <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" /> | 109 | <state x="740" y="275" key="FileChooserDialogImpl/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1604836455343" /> |
106 | - <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1604306110978"> | 110 | + <state width="1899" height="282" key="GridCell.Tab.0.bottom" timestamp="1607070330004"> |
107 | - <screen x="1920" y="0" width="1920" height="1040" /> | 111 | + <screen x="0" y="0" width="1920" height="1040" /> |
108 | </state> | 112 | </state> |
113 | + <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
109 | <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 114 | <state width="1899" height="282" key="GridCell.Tab.0.bottom/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
110 | - <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1604306110978"> | 115 | + <state width="1899" height="282" key="GridCell.Tab.0.center" timestamp="1607070330004"> |
111 | - <screen x="1920" y="0" width="1920" height="1040" /> | 116 | + <screen x="0" y="0" width="1920" height="1040" /> |
112 | </state> | 117 | </state> |
118 | + <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
113 | <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 119 | <state width="1899" height="282" key="GridCell.Tab.0.center/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
114 | - <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1604306110978"> | 120 | + <state width="1899" height="282" key="GridCell.Tab.0.left" timestamp="1607070330004"> |
115 | - <screen x="1920" y="0" width="1920" height="1040" /> | 121 | + <screen x="0" y="0" width="1920" height="1040" /> |
116 | </state> | 122 | </state> |
123 | + <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
117 | <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 124 | <state width="1899" height="282" key="GridCell.Tab.0.left/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
118 | - <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1604306110978"> | 125 | + <state width="1899" height="282" key="GridCell.Tab.0.right" timestamp="1607070330004"> |
119 | - <screen x="1920" y="0" width="1920" height="1040" /> | 126 | + <screen x="0" y="0" width="1920" height="1040" /> |
120 | </state> | 127 | </state> |
128 | + <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@0.0.1920.1040" timestamp="1607070330004" /> | ||
121 | <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> | 129 | <state width="1899" height="282" key="GridCell.Tab.0.right/0.0.1920.1040/1920.0.1920.1040@1920.0.1920.1040" timestamp="1604306110978" /> |
122 | <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485"> | 130 | <state x="2381" y="164" key="SettingsEditor" timestamp="1604303734485"> |
123 | <screen x="1920" y="0" width="1920" height="1040" /> | 131 | <screen x="1920" y="0" width="1920" height="1040" /> | ... | ... |
1 | from pytube import YouTube | 1 | from pytube import YouTube |
2 | +from PyKomoran import * | ||
2 | from xml.etree import ElementTree | 3 | from xml.etree import ElementTree |
4 | +from textrank import KeywordSummarizer | ||
5 | +from textrank import KeysentenceSummarizer | ||
6 | +import numpy as np | ||
7 | +from operator import itemgetter | ||
8 | + | ||
9 | +def komoran_tokenize(sent): | ||
10 | + words = sent.split() | ||
11 | + words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)] | ||
12 | + return words | ||
3 | 13 | ||
4 | #youtube url의 자막 -> xml으로 가져오기 | 14 | #youtube url의 자막 -> xml으로 가져오기 |
15 | +topk_size = 30 | ||
5 | video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' | 16 | video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' |
6 | yt = YouTube(video_url) | 17 | yt = YouTube(video_url) |
7 | title = yt.title | 18 | title = yt.title |
... | @@ -9,10 +20,38 @@ description = yt.description | ... | @@ -9,10 +20,38 @@ description = yt.description |
9 | caption = yt.captions.get_by_language_code('ko') | 20 | caption = yt.captions.get_by_language_code('ko') |
10 | caption_xml = caption.xml_captions | 21 | caption_xml = caption.xml_captions |
11 | 22 | ||
12 | -#xml -> string list로 파싱(문장별) | 23 | +#xml -> string list로 파싱 |
13 | root = ElementTree.fromstring(caption_xml) | 24 | root = ElementTree.fromstring(caption_xml) |
14 | -sentences = [] | 25 | +texts = [] |
15 | -print(root.tag, root.attrib) | 26 | +texts.append(title) |
16 | for child in root.findall("text"): | 27 | for child in root.findall("text"): |
17 | - sentences.append(child.text.replace('\n', ' ')) | ||
18 | -print(sentences) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
28 | + text = child.text.replace('\n', ' ') | ||
29 | + texts.append(text) | ||
30 | +topk_size = texts.size() * 100 // topk_size | ||
31 | + | ||
32 | +#Komoran을 통해 형태소 단위로 분리 후 태깅 | ||
33 | +komoran = Komoran('STABLE') | ||
34 | +sents = [] | ||
35 | +for text in texts: | ||
36 | + tokened_text = komoran.get_plain_text(text) | ||
37 | + sents.append(tokened_text) | ||
38 | + | ||
39 | +keyword_extractor = KeywordSummarizer( | ||
40 | + tokenize = komoran_tokenize, | ||
41 | + window = -1, | ||
42 | + verbose = False | ||
43 | +) | ||
44 | +keywords = keyword_extractor.summarize(sents, topk=30) | ||
45 | + | ||
46 | +summarizer = KeysentenceSummarizer( | ||
47 | + tokenize = lambda x:x.split(), | ||
48 | + min_sim = 0.5, | ||
49 | + verbose = False | ||
50 | +) | ||
51 | +bias = np.ones(len(texts)) | ||
52 | +bias[0] = 5 | ||
53 | +keysents = summarizer.summarize(texts, topk=topk_size, bias=bias) | ||
54 | +keysents.sort(key=itemgetter(0)) | ||
55 | +for _, _, sent in keysents: | ||
56 | + sent = sent.replace('&#39;', "'") | ||
57 | + print(sent) | ... | ... |
No preview for this file type
src/textrank/__pycache__/rank.cpython-38.pyc
0 → 100644
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
src/textrank/__pycache__/word.cpython-38.pyc
0 → 100644
No preview for this file type
... | @@ -183,7 +183,6 @@ class KeysentenceSummarizer: | ... | @@ -183,7 +183,6 @@ class KeysentenceSummarizer: |
183 | raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape)) | 183 | raise ValueError('The shape of bias must be (n_sents,) but {}'.format(bias.shape)) |
184 | elif bias is not None: | 184 | elif bias is not None: |
185 | raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias))) | 185 | raise ValueError('The type of bias must be None or numpy.ndarray but the type is {}'.format(type(bias))) |
186 | - | ||
187 | self.train_textrank(sents, bias) | 186 | self.train_textrank(sents, bias) |
188 | idxs = self.R.argsort()[-topk:] | 187 | idxs = self.R.argsort()[-topk:] |
189 | keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)] | 188 | keysents = [(idx, self.R[idx], sents[idx]) for idx in reversed(idxs)] | ... | ... |
-
Please register or login to post a comment