GyuhoLee

[Update] subtitle 메소드화

...@@ -19,7 +19,9 @@ ...@@ -19,7 +19,9 @@
19 <select /> 19 <select />
20 </component> 20 </component>
21 <component name="ChangeListManager"> 21 <component name="ChangeListManager">
22 - <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" /> 22 + <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="">
23 + <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" />
24 + </list>
23 <option name="SHOW_DIALOG" value="false" /> 25 <option name="SHOW_DIALOG" value="false" />
24 <option name="HIGHLIGHT_CONFLICTS" value="true" /> 26 <option name="HIGHLIGHT_CONFLICTS" value="true" />
25 <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> 27 <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
......
...@@ -11,47 +11,56 @@ def komoran_tokenize(sent): ...@@ -11,47 +11,56 @@ def komoran_tokenize(sent):
11 words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)] 11 words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)]
12 return words 12 return words
13 13
14 -#youtube url의 자막 -> xml으로 가져오기 14 +def subtitle(video_url, topk_size):
15 -topk_size = 30 15 + #youtube url의 자막 -> xml으로 가져오기
16 -video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318' 16 + topk_size = 30
17 -yt = YouTube(video_url) 17 + video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318'
18 -title = yt.title 18 + yt = YouTube(video_url)
19 -description = yt.description 19 + title = yt.title
20 -caption = yt.captions.get_by_language_code('ko') 20 + description = yt.description
21 -caption_xml = caption.xml_captions 21 + caption = yt.captions.get_by_language_code('ko')
22 + caption_xml = caption.xml_captions
22 23
23 -#xml -> string list로 파싱 24 + #xml -> string list로 파싱
24 -root = ElementTree.fromstring(caption_xml) 25 + root = ElementTree.fromstring(caption_xml)
25 -texts = [] 26 + texts = []
26 -texts.append(title) 27 + texts.append(title)
27 -for child in root.findall("text"): 28 + for child in root.findall("text"):
28 - text = child.text.replace('\n', ' ') 29 + text = child.text.replace('\n', ' ')
29 - texts.append(text) 30 + texts.append(text)
30 -topk_size = texts.size() * 100 // topk_size 31 + topk_size = texts.size() * 100 // topk_size
31 32
32 -#Komoran을 통해 형태소 단위로 분리 후 태깅 33 + #Komoran을 통해 형태소 단위로 분리 후 태깅
33 -komoran = Komoran('STABLE') 34 + komoran = Komoran('STABLE')
34 -sents = [] 35 + sents = []
35 -for text in texts: 36 + for text in texts:
36 - tokened_text = komoran.get_plain_text(text) 37 + tokened_text = komoran.get_plain_text(text)
37 - sents.append(tokened_text) 38 + sents.append(tokened_text)
38 39
39 -keyword_extractor = KeywordSummarizer( 40 + keyword_extractor = KeywordSummarizer(
40 - tokenize = komoran_tokenize, 41 + tokenize = komoran_tokenize,
41 - window = -1, 42 + window = -1,
42 - verbose = False 43 + verbose = False
43 -) 44 + )
44 -keywords = keyword_extractor.summarize(sents, topk=30) 45 + keywords = keyword_extractor.summarize(sents, topk=30)
45 46
46 -summarizer = KeysentenceSummarizer( 47 + summarizer = KeysentenceSummarizer(
47 - tokenize = lambda x:x.split(), 48 + tokenize = lambda x:x.split(),
48 - min_sim = 0.5, 49 + min_sim = 0.5,
49 - verbose = False 50 + verbose = False
50 -) 51 + )
51 -bias = np.ones(len(texts)) 52 + bias = np.ones(len(texts))
52 -bias[0] = 5 53 + bias[0] = 5
53 -keysents = summarizer.summarize(texts, topk=topk_size, bias=bias) 54 + keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
54 -keysents.sort(key=itemgetter(0)) 55 + keysents.sort(key=itemgetter(0))
55 -for _, _, sent in keysents: 56 + first = True
56 - sent = sent.replace('&#39;', "'") 57 + ret = ''
57 - print(sent) 58 + for _, _, sent in keysents:
59 + sent = sent.replace('&#39;', "'")
60 + ret = ret + sent
61 + if first:
62 + ret += '\n'
63 + first = False
64 + else:
65 + ret += ' '
66 + return ret;
......