김태영

Add Google image crawler

 - 구글에서 이미지를 크롤링 해온다.
1 -/node_modules/
...\ No newline at end of file ...\ No newline at end of file
1 +/node_modules/
2 +/python/model
3 +/python/chromedriver
...\ No newline at end of file ...\ No newline at end of file
......
1 +from selenium import webdriver
2 +from selenium.webdriver.common.keys import Keys
3 +import time
4 +import urllib.request
5 +import os
6 +
# Install the ChromeDriver build that matches your Chrome version and point
# this setting at the binary.
# The CHROMEDRIVER_PATH environment variable overrides the value; the fallback
# is the original, machine-specific location.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/kimtaeyoung/Desktop/project/opensource_termproject/2020-02-OSS-TermProject/python/chromedriver",
)

# Maximum number of images to download for each keyword.
crawling_num = 500
14 +
15 +# 폴더 생성해주는 함수
16 +
def createFolder(directory):
    """Create ``directory`` and any missing parents if it does not exist yet.

    Args:
        directory: Path of the folder to create.

    Returns:
        None. A failure is reported on stdout instead of being raised, so the
        caller keeps running either way.
    """
    try:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test, and still raises (an OSError) when the path
        # is occupied by something that is not a directory.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error: Creating directory. ' + directory)
23 +
24 +# 크롤링 할 단어들을 받아온다.
25 +# 크롤링할 단어는 keywords.txt에 입력하면 된다.
26 +
def get_keywords(keywords_file='python/keywords.txt'):
    """Load the search keywords and rewrite the file in normalised form.

    Each line of ``keywords_file`` is one keyword. Surrounding whitespace is
    stripped, blank lines are dropped, duplicates are removed and the result
    is sorted. The cleaned list is then written back to the same file, one
    keyword per line.

    Args:
        keywords_file: Path of the UTF-8 text file holding the keywords. A
            leading byte-order mark is tolerated when reading.

    Returns:
        The sorted list of unique, non-empty keywords.

    Raises:
        OSError: If the file cannot be read or written.
    """
    with open(keywords_file, 'r', encoding='utf-8-sig') as f:
        # Strip before de-duplicating so "person" and "person " collapse into
        # one keyword and whitespace-only lines do not become searches.
        stripped = (line.strip() for line in f)
        keywords = sorted({line for line in stripped if line})

    print('{} keywords found: {}'.format(len(keywords), keywords))

    # Re-save the sorted keywords so the file stays tidy between runs.
    with open(keywords_file, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(keyword) for keyword in keywords)

    return keywords
43 +
44 +#모델 폴더를 만들어준다.
45 +
# Seconds to wait after each scroll so the page can load more results.
SCROLL_PAUSE_TIME = 1


def _scroll_to_end(driver, pause_seconds=SCROLL_PAUSE_TIME):
    """Scroll the results page until its height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page did not grow. Try the ".mye4qd" element (presumably the
            # "show more results" button -- verify against current Google
            # markup); when it cannot be clicked there is nothing left to load.
            try:
                driver.find_element_by_css_selector(".mye4qd").click()
            except Exception:
                break
        last_height = new_height


def _download_images(driver, keyword, limit):
    """Save up to ``limit`` images for ``keyword`` under python/model/<keyword>/."""
    target_dir = "python/model/" + keyword
    createFolder(target_dir)

    saved = 0
    for thumbnail in driver.find_elements_by_css_selector(".rg_i.Q4LuWd"):
        if saved >= limit:
            break
        try:
            thumbnail.click()
            # Give the preview pane time to load after the click.
            time.sleep(2)
            img_url = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
            urllib.request.urlretrieve(
                img_url,
                os.path.join(target_dir, "{}.jpg".format(saved + 1)),
            )
        except Exception as err:
            # Best effort: one bad thumbnail or download must not stop the
            # crawl. Catching Exception (not a bare except) keeps Ctrl-C working.
            print('Skipping image for {!r}: {}'.format(keyword, err))
            continue
        saved += 1


# Make sure the root model folder exists.
createFolder("python/model/")

# Install the opener once; urlretrieve() uses the globally installed opener,
# so there is no need to rebuild it for every image.
_opener = urllib.request.build_opener()
_opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(_opener)

for keyword in get_keywords():
    driver = webdriver.Chrome(chromedriver_path)
    try:
        driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
        search_box = driver.find_element_by_name("q")
        search_box.send_keys(keyword)
        search_box.send_keys(Keys.RETURN)

        _scroll_to_end(driver)
        _download_images(driver, keyword, crawling_num)
    finally:
        # quit() ends the whole browser session and the chromedriver process,
        # even when the crawl for this keyword fails part-way; close() would
        # only close the current window.
        driver.quit()
...\ No newline at end of file ...\ No newline at end of file
1 +inside background
2 +person
3 +with mask
1 +selenium==3.141.0
...\ No newline at end of file ...\ No newline at end of file