GyuhoLee

[Update] Refactor subtitle into a method

.idea/workspace.xml
@@ -19,7 +19,9 @@
     <select />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" />
+    <list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="">
+      <change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
......
subtitle.py
@@ -11,47 +11,56 @@ def komoran_tokenize(sent):
     words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)]
     return words
-# Fetch the captions of the YouTube URL as XML
-topk_size = 30
-video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318'
-yt = YouTube(video_url)
-title = yt.title
-description = yt.description
-caption = yt.captions.get_by_language_code('ko')
-caption_xml = caption.xml_captions
+def subtitle(video_url, topk_size):
+    # Fetch the captions of the YouTube URL as XML
+    # (example arguments: topk_size = 30,
+    #  video_url = 'https://www.youtube.com/watch?v=ecUWKU_v318')
+    yt = YouTube(video_url)
+    title = yt.title
+    description = yt.description
+    caption = yt.captions.get_by_language_code('ko')
+    caption_xml = caption.xml_captions
-# Parse the XML into a list of strings
-root = ElementTree.fromstring(caption_xml)
-texts = []
-texts.append(title)
-for child in root.findall("text"):
-    text = child.text.replace('\n', ' ')
-    texts.append(text)
-topk_size = texts.size() * 100 // topk_size
+    # Parse the XML into a list of strings (title first)
+    root = ElementTree.fromstring(caption_xml)
+    texts = []
+    texts.append(title)
+    for child in root.findall("text"):
+        text = child.text.replace('\n', ' ')
+        texts.append(text)
+    topk_size = len(texts) * 100 // topk_size  # number of key sentences to extract
-# Tokenize into morphemes and POS-tag with Komoran
-komoran = Komoran('STABLE')
-sents = []
-for text in texts:
-    tokened_text = komoran.get_plain_text(text)
-    sents.append(tokened_text)
+    # Tokenize into morphemes and POS-tag with Komoran
+    komoran = Komoran('STABLE')
+    sents = []
+    for text in texts:
+        tokened_text = komoran.get_plain_text(text)
+        sents.append(tokened_text)
-keyword_extractor = KeywordSummarizer(
-    tokenize = komoran_tokenize,
-    window = -1,
-    verbose = False
-)
-keywords = keyword_extractor.summarize(sents, topk=30)
+    keyword_extractor = KeywordSummarizer(
+        tokenize = komoran_tokenize,
+        window = -1,
+        verbose = False
+    )
+    keywords = keyword_extractor.summarize(sents, topk=30)
-summarizer = KeysentenceSummarizer(
-    tokenize = lambda x:x.split(),
-    min_sim = 0.5,
-    verbose = False
-)
-bias = np.ones(len(texts))
-bias[0] = 5
-keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
-keysents.sort(key=itemgetter(0))
-for _, _, sent in keysents:
-    sent = sent.replace('&#39;', "'")
-    print(sent)
+    summarizer = KeysentenceSummarizer(
+        tokenize = lambda x: x.split(),
+        min_sim = 0.5,
+        verbose = False
+    )
+    # Bias the title (texts[0]) so it is strongly favored as a key sentence
+    bias = np.ones(len(texts))
+    bias[0] = 5
+    keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
+    keysents.sort(key=itemgetter(0))
+    # Join the key sentences: newline after the first (the title), spaces between the rest
+    first = True
+    ret = ''
+    for _, _, sent in keysents:
+        sent = sent.replace('&#39;', "'")
+        ret = ret + sent
+        if first:
+            ret += '\n'
+            first = False
+        else:
+            ret += ' '
+    return ret
......
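For reference, a minimal usage sketch of the refactored function (not part of the commit): it assumes subtitle.py is importable from the working directory, that pytube, PyKomoran, textrank, and numpy are installed, and it reuses the example URL from the diff.

from subtitle import subtitle

summary = subtitle('https://www.youtube.com/watch?v=ecUWKU_v318', topk_size=30)
print(summary)  # first line is the title, followed by the selected key sentences

Since the top-level script code now lives inside subtitle(), importing the module no longer downloads captions as a side effect; each caller picks the video URL and summary size per call.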