GyuhoLee

[Update] subtitle 메소드화

......@@ -19,7 +19,9 @@
<select />
</component>
<component name="ChangeListManager">
<list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="" />
<list default="true" id="b9decb0c-dc9e-4239-bdad-09ea8dd5179d" name="Default Changelist" comment="">
<change beforePath="$PROJECT_DIR$/subtitle.py" beforeDir="false" afterPath="$PROJECT_DIR$/subtitle.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
......
......@@ -11,47 +11,56 @@ def komoran_tokenize(sent):
words = [w for w in words if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)]
return words
def subtitle(video_url, topk_size):
    """Summarize the Korean subtitles of a YouTube video.

    The video title and every non-empty caption line are ranked with
    ``KeysentenceSummarizer``; the selected sentences are returned in their
    original order.

    Args:
        video_url: URL of the YouTube video to summarize.
        topk_size: Share of the sentences to keep, as a percentage
            (e.g. ``30`` keeps roughly 30% of the lines).
            NOTE(review): the original expression was
            ``len(texts) * 100 // topk_size``, which asks for more sentences
            than exist for any value below 100; it is treated as a
            percentage here -- confirm against the callers.

    Returns:
        A string holding the first selected sentence followed by a newline,
        then each remaining sentence followed by a single space. Empty when
        no sentence was selected.

    Raises:
        ValueError: If the video has no caption track with language
            code ``'ko'``.
    """
    import html  # function-scope import: the file's import block is not visible here

    # Fetch the video's Korean caption track as XML.
    yt = YouTube(video_url)
    title = yt.title
    caption = yt.captions.get_by_language_code('ko')
    if caption is None:
        raise ValueError(f"no Korean ('ko') captions available for {video_url}")
    caption_xml = caption.xml_captions

    # Parse the XML into a list of strings; the title goes first so that it
    # lines up with the boosted bias entry below.
    root = ElementTree.fromstring(caption_xml)
    texts = [title]
    for child in root.findall("text"):
        # Empty <text/> elements have no text node; skip them.
        if child.text is None:
            continue
        texts.append(child.text.replace('\n', ' '))

    # Convert the percentage into a sentence count, clamped to [1, len(texts)].
    n_select = min(len(texts), max(1, len(texts) * topk_size // 100))

    summarizer = KeysentenceSummarizer(
        tokenize=lambda x: x.split(),
        min_sim=0.5,
        verbose=False,
    )
    # Weight the title (index 0) five times as heavily as a caption line.
    bias = np.ones(len(texts))
    bias[0] = 5
    keysents = summarizer.summarize(texts, topk=n_select, bias=bias)
    # Restore original order: the first tuple field is the sentence index.
    keysents.sort(key=itemgetter(0))

    sentences = []
    for _, _, sent in keysents:
        # Caption text still carries HTML entities such as &#39; after XML parsing.
        sent = html.unescape(sent)
        print(sent)
        sentences.append(sent)

    if not sentences:
        return ''
    # Same layout as before: newline after the first sentence, a space after
    # every later one (including the last).
    return sentences[0] + '\n' + ''.join(s + ' ' for s in sentences[1:])
......