Merge branch 'feature/crawling' into 'master'
Feature/crawling: teachable machine 모델 만드는 데 쓰이는 이미지 크롤링. See merge request !6.
Showing
4 changed files
with
115 additions
and
1 deletion
python/google.py
0 → 100644
| 1 | +from selenium import webdriver | ||
| 2 | +from selenium.webdriver.common.keys import Keys | ||
| 3 | +import time | ||
| 4 | +import urllib.request | ||
| 5 | +import os | ||
| 6 | +from multiprocessing import Pool #멀티쓰레딩 | ||
| 7 | + | ||
# Install the ChromeDriver build that matches the local Chrome version and
# point this at the binary. Set the CHROMEDRIVER_PATH environment variable to
# use a different location without editing this file; the value below is only
# the fallback.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/kimtaeyoung/Desktop/project/opensource_termproject/2020-02-OSS-TermProject/python/chromedriver",
)
# Maximum number of images to download per keyword.
crawling_num = 500

# Headless Selenium: options applied to every Chrome instance that is started.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
# Alternatively: options.add_argument("--disable-gpu")
| 21 | + | ||
def createFolder(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Args:
        directory: Path of the folder to create.

    Failures are reported on stdout instead of being raised, so callers can
    keep going on a best-effort basis.
    """
    try:
        # exist_ok=True avoids the check-then-create race that occurs when
        # several worker processes try to create folders at the same time.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)
| 29 | + | ||
def get_keywords(keywords_file='python/keywords.txt'):
    """Read the search keywords, then rewrite the file sorted and de-duplicated.

    Put one keyword per line in ``keywords_file``. Blank or whitespace-only
    lines are ignored, and surrounding whitespace is stripped so that
    ``"cat"`` and ``"cat "`` count as the same keyword.

    Args:
        keywords_file: Path of the UTF-8 text file holding the keywords.

    Returns:
        The sorted list of unique keywords.
    """
    # utf-8-sig drops a leading byte-order mark if the file has one.
    with open(keywords_file, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f)
        keywords = sorted({line for line in stripped if line})

    print('{} keywords found: {}'.format(len(keywords), keywords))

    # Re-save the cleaned list so the file stays sorted and free of duplicates.
    with open(keywords_file, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(keyword) for keyword in keywords)

    return keywords
| 48 | + | ||
def _scroll_to_end_of_results(driver, pause_seconds=1):
    """Scroll the results page until no more content can be loaded."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll to the bottom and give the page time to load more results.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page stopped growing: try clicking the ".mye4qd" element.
            # If that fails (missing or not clickable), assume the end of the
            # results has been reached.
            try:
                driver.find_element_by_css_selector(".mye4qd").click()
            except Exception:
                break
        last_height = new_height


def crawling(search_name):
    """Download Google image-search results for one keyword.

    Images are saved as ``python/model/<search_name>/<n>.jpg`` and numbered
    from 1. At most ``crawling_num`` images are downloaded.

    Args:
        search_name: Search keyword, typically one entry of ``get_keywords()``.

    NOTE(review): ``chrome_options=`` and ``find_element(s)_by_*`` are
    Selenium 3-era APIs, while requirements.txt does not pin a version.
    Confirm the installed Selenium still provides them.
    """
    # The headless options are passed in when the browser is started.
    driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
    try:
        driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
        elem = driver.find_element_by_name("q")
        elem.send_keys(search_name)
        elem.send_keys(Keys.RETURN)

        _scroll_to_end_of_results(driver)

        images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")

        # Create the output folder for this keyword if it does not exist.
        folder = "python/model/" + search_name
        createFolder(folder)

        # urlretrieve uses the globally installed opener, so installing it
        # once before the loop is enough.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        downloaded = 0
        for image in images:
            # Stop once the configured number of images has been saved.
            if downloaded >= crawling_num:
                break
            try:
                image.click()
                # Wait for the image to load; 2-3 seconds is a stable value.
                time.sleep(2)
                img_url = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
                target = os.path.join(folder, str(downloaded + 1) + ".jpg")
                urllib.request.urlretrieve(img_url, target)
            except Exception as exc:
                # Best effort: one broken thumbnail or failed download must
                # not abort the whole keyword. KeyboardInterrupt and
                # SystemExit still propagate.
                print('Skipping an image for "{}": {}'.format(search_name, exc))
                continue
            downloaded += 1
    finally:
        # quit() ends the whole browser session even when an error escaped
        # above; close() would only close the current window.
        driver.quit()
| 101 | + | ||
if __name__ == '__main__':

    # Create the root model folder before any worker starts.
    createFolder("python/model/")

    # Read the keywords first so a missing or unreadable keywords file fails
    # before any worker process is spawned.
    keywords = get_keywords()

    # Crawl with 3 worker processes. The context manager shuts the pool down
    # once map() has returned, so no worker is left behind.
    with Pool(processes=3) as pool:
        pool.map(crawling, keywords)
| ... | \ No newline at end of file | ... | \ No newline at end of file |
python/keywords.txt
0 → 100644
python/requirements.txt
0 → 100644
| 1 | +selenium | ||
| ... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment