정의동

Add: source lib 추가

# OS Generated files
*.DS_Store
*.DS_Store?
._*
\ No newline at end of file
**/__pycache__
......
from nltk.tokenize import word_tokenize
import nltk
import re
from bs4 import BeautifulSoup
import requests
def get_HTML_from_url(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Only the URL is passed to ``requests.get``: no timeout is set and the
    status code is not checked, so any exception raised by ``requests``
    propagates to the caller.
    """
    response = requests.get(url)
    return response.text
def get_text_from_HTML(html):
    """Return the text of an HTML document, one text fragment per line.

    ``script``, ``style``, ``header``, ``footer`` and ``form`` elements are
    removed (together with everything nested inside them) before the text is
    extracted.  Each remaining text fragment is stripped of surrounding
    whitespace and the fragments are joined with ``'\\n'``.
    """
    # Name the parser explicitly.  Without it BeautifulSoup picks whichever
    # parser happens to be installed, so the same markup can yield different
    # trees on different machines, and a GuessedAtParserWarning is emitted.
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.find_all(['script', 'style', 'header', 'footer', 'form']):
        element.extract()
    return soup.get_text('\n', strip=True)
# def get_HTML_from_regexp_url(url_pattern):
def is_string(target):
    """Return True when ``target`` is a ``str`` (or an instance of a subclass).

    ``isinstance`` is used rather than an exact ``type(...) == str``
    comparison so that ``str`` subclasses are accepted as strings too.
    """
    return isinstance(target, str)
def cut_corpus(corpus):
    """Split ``corpus`` into the pieces that are terminated by a period.

    Returns an empty list when ``corpus`` is not a string.  Whatever follows
    the last ``'.'`` (an empty string when the corpus ends with a period) is
    discarded, and the pieces are returned as-is, without stripping
    whitespace.
    """
    if is_string(corpus):
        pieces = corpus.split('.')
        # str.split always yields at least one element, so pop() is safe.
        pieces.pop()  # drop the text after the final period
        return pieces
    return []
def postag_sentence(sentence):
    """Tokenize ``sentence`` and return the POS tagging of its tokens.

    Returns an empty list when ``sentence`` is not a string; otherwise the
    result of ``nltk.pos_tag`` applied to the ``word_tokenize`` output.
    """
    if is_string(sentence):
        return nltk.pos_tag(word_tokenize(sentence))
    return []
# Returns the index of the verb.
# If there is no verb, returns -1.
def find_verb_idx(tags):
    """Return the index of the first verb in ``tags``, or -1 if there is none.

    ``tags`` is a sequence of ``(word, pos_tag)`` pairs, the shape returned
    by ``nltk.pos_tag``.  An entry counts as a verb when its POS tag (the
    second element) starts with ``'V'``.
    """
    for idx, entry in enumerate(tags):
        # Inspect the POS tag, not the word; startswith() also copes with an
        # empty tag string without raising.
        if entry[1].startswith('V'):
            return idx
    return -1
def make_be_verb(subj):
    """Return the present-tense form of "be" that agrees with ``subj``.

    ``'I'`` maps to ``'am'``, ``'You'`` / ``'you'`` map to ``'are'``, and
    every other subject maps to ``'is'``.  The comparison is exact, so a
    lowercase ``'i'`` yields ``'is'``.
    """
    if subj == 'I':
        return 'am'
    return 'are' if subj in ['You', 'you'] else 'is'
def cut_quot(sentence):
    """Return ``sentence`` with every quote character removed.

    The characters removed are the single quote ``'``, the double quote
    ``"`` and the backtick.  All other characters, including backslashes,
    are left untouched.
    """
    # The pattern is written without a backslash before the backtick: that
    # form is an invalid string escape and triggers a SyntaxWarning on
    # recent Python versions.  The set of matched characters is unchanged.
    return re.sub("['\"`]", '', sentence)
# Edge cases
# 1. The brace is never closed
#    NOTE(review): not handled in this function; presumably the caller's
#    responsibility - confirm.
# 2. target_str is missing
def make_brace_triple(target_str, brace_tags):
    """Build a ``[subject, predicate, object]`` triple for a braced remark.

    ``target_str`` is used as the subject.  ``brace_tags`` is a sequence of
    ``(word, pos_tag)`` pairs for the text inside the braces.

    * If ``brace_tags`` contains a verb, that verb's word is the predicate
      and the words after it form the object (words before the verb are not
      used).
    * Otherwise the predicate is the form of "be" matching the subject and
      the object is all of the words.

    Returns an empty list when ``target_str`` is empty or ``None``.
    """
    if not target_str:
        return []

    words = [word for word, _ in brace_tags]
    verb_idx = find_verb_idx(brace_tags)
    if verb_idx == -1:
        return [target_str, make_be_verb(target_str), ' '.join(words)]
    # The predicate is the verb's word (a string, like the "be" branch), and
    # the object starts after the verb so the verb is not repeated in it.
    return [target_str, words[verb_idx], ' '.join(words[verb_idx + 1:])]