# find_url.py
#
# Helper for extracting the first HTTP(S) URL from a string.

import re


'''
test cases:

lines1 = "['mailto:psalms1273@merryyear.org', 'http://www.merryyear.org/abbs/?act=bbs&amp;subAct=view&amp;bid=Notice&amp;page=1&amp;order_index=no&amp;order_type=desc&amp;seq=1571']"
lines = "['https://www.shinhanhope.com/web/main.jsp']"
ll ="['https://www.childfund.or.kr/news/noticeView.do?bmTemplate=/inc/jsp/board/template/default&amp;bdId=20019410&amp;bmIds=10000023,10000097']"
l="http://bokjiro.go.kr/gowf/wel/welsvc/svcsearch/WelGvmtSvcSearchView.do?servId=WII00000124"
t = "https://welfare.gangdong.go.kr/site/contents/bokji/html00/html00/index3.html"
'''

# Compiled once at import time instead of on every call.
#   scheme        http:// or https:// (case-insensitive)
#   credentials   optional "user:password@"
#   host          letters, digits, "_", "-" and "."
#   port          optional ":<digits>"
#   path/query    optional "/" followed by anything up to whitespace or one of
#                 < > " (characters that cannot appear unescaped in a URL)
_URL_PATTERN = re.compile(
    r'https?://(?:\w*:\w*@)?[-\w.]+(?::\d+)?(?:/[^\s<>"]*)?',
    re.IGNORECASE,
)

# Characters that usually belong to the surrounding text (for example the
# repr of a Python list such as "['http://a', 'http://b']") rather than to
# the URL itself when they appear at the very end of a match.
_TRAILING_DELIMITERS = "'],"


def find_url_in_str(url):
    """Return the first HTTP(S) URL found in ``url``, or ``None``.

    Args:
        url: Text to search. Despite the name it may be any string that
            merely contains a URL, e.g. the repr of a list of links.

    Returns:
        The first substring that looks like an ``http://`` or ``https://``
        URL, with trailing ``'``, ``]`` and ``,`` characters removed, or
        ``None`` when no URL is present.

    Note:
        Path and query are taken up to the next whitespace, so characters
        such as ``-``, ``%`` and ``~`` are kept, a trailing ``/`` is kept,
        and HTML entities such as ``&amp;`` are returned as they appear.
    """
    match = _URL_PATTERN.search(url)
    if match is None:
        return None
    # Quotes, brackets and commas are tolerated inside a URL but dropped at
    # its end, where they are almost always delimiters of the enclosing text.
    return match.group().rstrip(_TRAILING_DELIMITERS)

if __name__ == "__main__":
    # Demonstration only when executed as a script, so importing the module
    # has no side effects. find_url_in_str() requires the text to search;
    # calling it without an argument raises TypeError.
    print(find_url_in_str("['https://www.shinhanhope.com/web/main.jsp']"))