Showing
2 changed files
with
81 additions
and
4 deletions
src/lib/util.py
0 → 100644
1 | +from nltk.tokenize import word_tokenize | ||
2 | +import nltk | ||
3 | +import re | ||
4 | +from bs4 import BeautifulSoup | ||
5 | +import requests | ||
6 | + | ||
7 | + | ||
def get_HTML_from_url(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``). The HTTP
        status code is not checked, so error pages are returned as-is.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # Without an explicit timeout, requests blocks forever on a silent host.
    return requests.get(url, timeout=timeout).text
10 | + | ||
11 | + | ||
def get_text_from_HTML(html):
    """Return the text of an HTML document with boilerplate elements removed.

    ``script``, ``style``, ``header``, ``footer`` and ``form`` elements are
    detached from the tree before the text is collected.

    Args:
        html: HTML markup to parse.

    Returns:
        The remaining text; each text fragment is stripped and the fragments
        are joined with newlines.
    """
    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed, which makes the output environment
    # dependent and emits a warning.
    soup = BeautifulSoup(html, 'html.parser')

    for element in soup.find_all(['script', 'style', 'header', 'footer', 'form']):
        element.extract()

    return soup.get_text('\n', strip=True)
20 | + | ||
21 | + | ||
22 | +# def get_HTML_from_regexp_url(url_pattern): | ||
23 | + | ||
24 | + | ||
def is_string(target):
    """Return True when *target* is a ``str`` instance (subclasses included)."""
    # isinstance also accepts str subclasses, which an exact type comparison
    # would wrongly reject.
    return isinstance(target, str)
27 | + | ||
28 | + | ||
def cut_corpus(corpus):
    """Split *corpus* into fragments on the ``.`` character.

    Args:
        corpus: Text to split. Any non-string value yields an empty list.

    Returns:
        The list of fragments between periods, without the periods. The
        fragments are not stripped. An empty or whitespace-only tail (what
        remains after a closing period) is dropped; a non-empty tail is kept
        so that text lacking a final period does not lose its last sentence.
    """
    if not is_string(corpus):
        return []

    pieces = corpus.split('.')
    # str.split always returns at least one element, so pieces[-1] is safe.
    if not pieces[-1].strip():
        pieces.pop()
    return pieces
33 | + | ||
34 | + | ||
def postag_sentence(sentence):
    """Tokenize *sentence* and tag each token with its part of speech.

    Args:
        sentence: Text to tag. Any non-string value yields an empty list.

    Returns:
        The result of ``nltk.pos_tag`` applied to the tokens produced by
        ``word_tokenize``.
    """
    if is_string(sentence):
        tokens = word_tokenize(sentence)
        return nltk.pos_tag(tokens)
    return []
40 | + | ||
41 | + | ||
def find_verb_idx(tags):
    """Return the index of the first verb in *tags*.

    Args:
        tags: Sequence of ``(word, pos_tag)`` pairs.

    Returns:
        The position of the first pair whose POS tag starts with ``'V'``,
        or ``-1`` when no such pair exists.
    """
    for idx, entry in enumerate(tags):
        # The POS tag is the second item of the pair; a tag is treated as a
        # verb when its first letter is 'V'.
        if entry[1].startswith('V'):
            return idx
    return -1
50 | + | ||
51 | + | ||
def make_be_verb(subj):
    """Return the present-tense form of "be" that agrees with *subj*.

    Args:
        subj: Subject of the sentence.

    Returns:
        ``'am'`` for ``'I'``, ``'are'`` for "you", "we" and "they" (any
        letter case), and ``'is'`` for everything else.
    """
    # Only the capitalised pronoun maps to "am"; a lowercase "i" is left
    # alone because it may not be the pronoun.
    if subj == 'I':
        return 'am'
    # Plural pronouns take "are" just like "you" does.
    if isinstance(subj, str) and subj.lower() in ('you', 'we', 'they'):
        return 'are'
    return 'is'
59 | + | ||
60 | + | ||
def cut_quot(sentence):
    """Return *sentence* with every ``'``, ``"`` and backtick removed."""
    # The pattern uses only valid string escapes; a backslash before a
    # backtick is an invalid escape sequence in a normal string literal.
    return re.sub("['\"`]", '', sentence)
63 | + | ||
64 | + | ||
# Exceptional cases noted by the original author:
# 1. the brace is not closed
# 2. target_str is missing
def make_brace_triple(target_str, brace_tags):
    """Build a ``[subject, predicate, object]`` triple.

    Args:
        target_str: Subject of the triple.
        brace_tags: ``(word, pos_tag)`` pairs describing the subject
            (presumably the words found inside a bracket; confirm with the
            caller).

    Returns:
        ``[]`` when *target_str* is empty or ``None``. Otherwise a list
        ``[subj, pred, obj]``: when *brace_tags* contains a verb, ``pred`` is
        that verb and ``obj`` is the words after it joined by spaces; when it
        does not, ``pred`` is the form of "be" matching the subject and
        ``obj`` is all the words joined by spaces.
    """
    if not target_str:
        return []

    subj = target_str
    idx = find_verb_idx(brace_tags)
    if idx != -1:
        # Take the word itself, not the whole (word, tag) pair, so that pred
        # is a string in both branches.
        pred = brace_tags[idx][0]
        # The object starts after the verb; the verb is already in pred.
        obj = ' '.join(word for word, _ in brace_tags[idx + 1:])
    else:
        pred = make_be_verb(subj)
        obj = ' '.join(word for word, _ in brace_tags)
    return [subj, pred, obj]
-
Please register or login to post a comment