subtitle.py
1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from pytube import YouTube
from PyKomoran import *
from xml.etree import ElementTree
from textrank import KeywordSummarizer
from textrank import KeysentenceSummarizer
import numpy as np
from operator import itemgetter
def komoran_tokenize(sent):
    """Filter a Komoran POS-tagged sentence down to content-word tokens.

    Keeps only tokens tagged as nouns (NN*), roots (XR), adjectives (VA),
    or verbs (VV); everything else (particles, endings, ...) is dropped.
    """
    content_tags = ('/NN', '/XR', '/VA', '/VV')
    return [token for token in sent.split()
            if any(tag in token for tag in content_tags)]
def subTitle(video_url, topk_size):
    """Summarize a YouTube video's Korean captions into key sentences.

    Parameters
    ----------
    video_url : str
        URL of the YouTube video (must have a 'ko' caption track).
    topk_size : int
        Percentage (0-100) of caption sentences to keep as key sentences.

    Returns
    -------
    list[str]
        Two elements: [highest-ranked sentence, remaining key sentences
        joined with spaces].
    """
    # Fetch the video's Korean caption track as raw XML.
    yt = YouTube(video_url)
    title = yt.title
    description = yt.description  # NOTE(review): fetched but never used below
    caption = yt.captions.get_by_language_code('ko')
    caption_xml = caption.xml_captions

    # Parse the XML into a flat list of plain-text lines; the title is
    # prepended so it can be biased as the most important "sentence".
    root = ElementTree.fromstring(caption_xml)
    texts = [title]
    for child in root.findall("text"):
        texts.append(child.text.replace('\n', ' '))

    # Interpret topk_size as a percentage of the total sentence count.
    topk_size = len(texts) * topk_size // 100

    # POS-tag every sentence with Komoran so TextRank can compare tokens.
    komoran = Komoran('STABLE')
    sents = [komoran.get_plain_text(text) for text in texts]

    # NOTE(review): the extracted keywords are never used afterwards; the
    # call is kept to preserve the original behavior exactly.
    keyword_extractor = KeywordSummarizer(
        tokenize=komoran_tokenize,
        window=-1,
        verbose=False,
    )
    keywords = keyword_extractor.summarize(sents, topk=30)

    # Rank sentences with TextRank, biasing the title (index 0) heavily.
    summarizer = KeysentenceSummarizer(
        tokenize=lambda x: x.split(),
        min_sim=0.5,
        verbose=False,
    )
    bias = np.ones(len(texts))
    bias[0] = 5
    keysents = summarizer.summarize(texts, topk=topk_size, bias=bias)
    keysents.sort(key=itemgetter(0))  # restore original caption order

    first = True
    ret1 = ''
    ret2 = ''
    for _, _, sent in keysents:
        # BUGFIX: the original line read `sent.replace(''', "'")`, which is a
        # syntax error (unterminated string). It is presumably a mangled HTML
        # entity — YouTube captions often carry `&#39;` literally — so we
        # normalize that entity to a plain apostrophe. TODO confirm against
        # real caption data.
        sent = sent.replace('&#39;', "'")
        sent = sent.replace('"', "'")
        if first:
            ret1 = sent
            first = False
        else:
            ret2 = ret2 + sent + ' '
    return [ret1, ret2]