Showing 2 changed files with 41 additions and 30 deletions.
... | @@ -19,7 +19,9 @@ | ... | @@ -19,7 +19,9 @@ |
19 | <select /> | 19 | <select /> |
20 | </component> | 20 | </component> |
21 | <component name="ChangeListManager"> | 21 | <component name="ChangeListManager"> |
22 | - <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" /> | 22 | + <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment=""> |
23 | + <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" /> | ||
24 | + </list> | ||
23 | <option name="SHOW_DIALOG" value="false" /> | 25 | <option name="SHOW_DIALOG" value="false" /> |
24 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> | 26 | <option name="HIGHLIGHT_CONFLICTS" value="true" /> |
25 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> | 27 | <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> | ... | ... |
... | @@ -11,47 +11,56 @@ def komoran_tokenize(sent): | ... | @@ -11,47 +11,56 @@ def komoran_tokenize(sent): |
11 | words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)] | 11 | words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)] |
12 | return words | 12 | return words |
13 | 13 | ||
def subtitle(video_url, topk_size=30):
    """Return a TextRank summary of a YouTube video's Korean captions.

    Fetches the video's Korean caption track, runs TextRank key-sentence
    extraction biased toward the title, and returns the result as one
    string: the title line first (followed by a newline), then the
    selected caption sentences in their original order, space-separated.

    Parameters
    ----------
    video_url : str
        Full YouTube watch URL, e.g. ``'https://www.youtube.com/watch?v=...'``.
    topk_size : int, optional
        Summary-size control; the number of key sentences kept is
        ``len(texts) * 100 // topk_size`` (default 30).

    Returns
    -------
    str
        Title plus key sentences, with a trailing separator preserved
        for compatibility with the original output.
    """
    # Fetch metadata and the Korean caption track (as timed-text XML).
    # The original overwrote both parameters with hard-coded values,
    # making the arguments dead -- fixed to actually use them.
    yt = YouTube(video_url)
    title = yt.title
    description = yt.description  # NOTE(review): fetched but unused; kept for parity.
    caption = yt.captions.get_by_language_code('ko')
    caption_xml = caption.xml_captions

    # Parse the XML into a flat sentence list; the title goes first so it
    # can be bias-weighted in the summarizer below.
    root = ElementTree.fromstring(caption_xml)
    texts = [title]
    for child in root.findall("text"):
        texts.append(child.text.replace('\n', ' '))
    # Original called texts.size(), which raises AttributeError on a list.
    # NOTE(review): this formula yields MORE sentences than captions for
    # topk_size < 100 -- confirm whether len(texts) * topk_size // 100
    # was intended.
    topk_size = len(texts) * 100 // topk_size

    # Morpheme-tokenize each sentence with Komoran (POS-tagged tokens).
    komoran = Komoran('STABLE')
    sents = [komoran.get_plain_text(text) for text in texts]

    # NOTE(review): the extracted keywords are never used downstream;
    # kept (not returned) for parity with the original behavior.
    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenize,
        window=-1,
        verbose=False,
    )
    keywords = keyword_extractor.summarize(sents, topk=30)

    # Key-sentence extraction over the raw texts, with the title (index 0)
    # weighted 5x so it is effectively always selected.
    summarizer = KeysentenceSummarizer(
        tokenize=lambda x: x.split(),
        min_sim=0.5,
        verbose=False,
    )
    bias = np.ones(len(texts))
    bias[0] = 5
    keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
    keysents.sort(key=itemgetter(0))  # restore original caption order

    # Assemble the result: newline after the first sentence (the title),
    # a space after every other one (trailing separator preserved, as in
    # the original). Caption XML escapes apostrophes as &#39; -- the
    # source had this literal garbled by HTML rendering.
    parts = []
    for index, (_, _, sent) in enumerate(keysents):
        parts.append(sent.replace('&#39;', "'"))
        parts.append('\n' if index == 0 else ' ')
    return ''.join(parts)
-
Please register or log in to post a comment.