Toggle navigation
Toggle navigation
This project
Loading...
Sign in
2021-1-capstone-design1
/
LYG_Project1
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
정의동
2021-05-11 17:35:04 +0900
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
2b953ef0c70c5b9aac15a2dbb2bb3f1cdb07604e
2b953ef0
1 parent
c1eefa6c
Add: source lib 추가
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
81 additions
and
4 deletions
.gitignore
src/lib/util.py
.gitignore
View file @
2b953ef
# OS Generated files
*.DS_Store
*.DS_Store?
._*
\ No newline at end of file
**/__pycache__
...
...
src/lib/util.py
0 → 100644
View file @
2b953ef
from
nltk.tokenize
import
word_tokenize
import
nltk
import
re
from
bs4
import
BeautifulSoup
import
requests
def get_HTML_from_url(url, timeout=10):
    """Fetch *url* with an HTTP GET request and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would block the caller forever.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(url, timeout=timeout).text
def get_text_from_HTML(html):
    """Return the text content of an HTML document.

    ``script``, ``style``, ``header``, ``footer`` and ``form`` elements are
    removed from the tree first; the remaining text fragments are stripped
    of surrounding whitespace and joined with newlines.

    Args:
        html: HTML markup to parse.

    Returns:
        The extracted text as a single string.
    """
    # Name the parser explicitly: when it is omitted Beautiful Soup picks
    # whichever parser happens to be installed and emits a warning, so the
    # same markup can parse differently from one machine to the next.
    soup = BeautifulSoup(html, 'html.parser')

    for unwanted in soup.find_all(['script', 'style', 'header', 'footer', 'form']):
        unwanted.extract()

    return soup.get_text('\n', strip=True)
# def get_HTML_from_regexp_url(url_pattern):
def is_string(target):
    """Return True when *target* is a ``str``, including ``str`` subclasses."""
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str, which behave like strings everywhere else.
    return isinstance(target, str)
def cut_corpus(corpus):
    """Split *corpus* on '.' and return every piece except the last one.

    The piece after the final '.' is always dropped, so text that does not
    end with a period loses its trailing fragment. Input that ``is_string``
    rejects yields an empty list.
    """
    if is_string(corpus):
        pieces = corpus.split('.')
        return pieces[:-1]
    return []
def postag_sentence(sentence):
    """Tokenize *sentence* and return the result of ``nltk.pos_tag`` on the tokens.

    Input that ``is_string`` rejects yields an empty list.
    """
    if is_string(sentence):
        return nltk.pos_tag(word_tokenize(sentence))
    return []
# Returns the index of the verb.
# Returns -1 if no verb exists.
def find_verb_idx(tags):
    """Return the position of the first verb in *tags*, or -1 if there is none.

    Args:
        tags: Sequence of ``(word, pos_tag)`` pairs. An entry counts as a
            verb when its POS tag starts with 'V' (presumably the Penn
            Treebank tags produced by ``nltk.pos_tag`` -- verify against
            the caller).

    Returns:
        Zero-based index of the first verb entry, or -1 when no entry
        qualifies (including when *tags* is empty).
    """
    # enumerate supplies the running position; the check looks at the POS
    # tag (second element of the pair), not at the characters of the word.
    for idx, (_word, pos) in enumerate(tags):
        if pos.startswith('V'):
            return idx
    return -1
def make_be_verb(subj):
    """Pick the present-tense form of "be" for the subject *subj*.

    'I' gives 'am', 'You' or 'you' gives 'are', and anything else gives 'is'.
    The comparison is exact, so other capitalisations fall through to 'is'.
    """
    if subj == 'I':
        return 'am'
    return 'are' if subj in ['You', 'you'] else 'is'
def cut_quot(sentence):
    """Return *sentence* with every single quote, double quote and backtick removed."""
    # Raw string so the character class needs no backslashes: an escaped
    # backtick inside a normal string literal is an invalid escape sequence
    # and makes the interpreter emit a warning.
    return re.sub(r"""['"`]""", '', sentence)
# Exceptions
# 1. The brace is not closed
# 2. target_str is missing
def make_brace_triple(target_str, brace_tags):
    """Build a ``[subject, predicate, object]`` triple from tagged words.

    Args:
        target_str: Text used as the subject of the triple.
        brace_tags: Sequence of ``(word, pos_tag)`` pairs describing the
            text to attach to the subject.

    Returns:
        An empty list when *target_str* is the empty string. Otherwise a
        three-item list of strings ``[subj, pred, obj]``:

        * if ``find_verb_idx`` locates a verb, ``pred`` is that verb's word
          and ``obj`` is the words after it joined with spaces;
        * if not, ``pred`` comes from ``make_be_verb(subj)`` and ``obj`` is
          all the words joined with spaces.
    """
    if target_str == '':
        return []

    subj = target_str
    verb_idx = find_verb_idx(brace_tags)

    if verb_idx == -1:
        # No verb available: link the subject to the words with a form of "be".
        pred = make_be_verb(subj)
        obj_words = brace_tags
    else:
        # Take only the word of the (word, tag) pair so pred is a string in
        # both branches, and start the object after the verb so the verb is
        # not repeated inside it.
        pred = brace_tags[verb_idx][0]
        obj_words = brace_tags[verb_idx + 1:]

    obj = ' '.join(word for word, _ in obj_words)
    return [subj, pred, obj]
Please
register
or
login
to post a comment