Showing
4 changed files
with
103 additions
and
1 deletions
python/google.py
0 → 100644
1 | +from selenium import webdriver | ||
2 | +from selenium.webdriver.common.keys import Keys | ||
3 | +import time | ||
4 | +import urllib.request | ||
5 | +import os | ||
6 | + | ||
# Install the ChromeDriver build that matches the local Chrome version and
# point this setting at the executable.
# The CHROMEDRIVER_PATH environment variable overrides the default below, so
# the script can run on another machine without editing this file.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/kimtaeyoung/Desktop/project/opensource_termproject/2020-02-OSS-TermProject/python/chromedriver",
)

# Maximum number of images to download for each keyword.
crawling_num = 500
14 | + | ||
def createFolder(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Failures are reported on stdout rather than raised, so the caller keeps
    running even when the directory could not be created.

    Args:
        directory: Path of the directory to create.
    """
    try:
        # exist_ok=True replaces a separate os.path.exists() check, which is
        # racy: the directory could appear between the check and makedirs().
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' + directory)
23 | + | ||
def get_keywords(keywords_file='python/keywords.txt'):
    """Load the search keywords and re-save them sorted and de-duplicated.

    The keyword file holds one search term per line. Surrounding whitespace
    is stripped and blank lines are ignored, so stray spaces cannot produce
    empty or duplicate keywords.

    Args:
        keywords_file: Path of the keyword file to read and rewrite.

    Returns:
        Sorted list of the unique keywords found in the file.
    """
    # utf-8-sig drops a leading BOM if the file was saved with one.
    with open(keywords_file, 'r', encoding='utf-8-sig') as f:
        stripped = (line.strip() for line in f.read().split('\n'))
        keywords = sorted({line for line in stripped if line})

    print('{} keywords found: {}'.format(len(keywords), keywords))

    # Re-save the cleaned list so the file stays sorted and free of duplicates.
    with open(keywords_file, 'w', encoding='utf-8') as f:
        f.writelines('{}\n'.format(keyword) for keyword in keywords)

    return keywords
43 | + | ||
# Create the root folder; each keyword gets its own sub-folder beneath it.
createFolder("python/model/")

# Seconds to wait after each scroll before re-reading the page height.
SCROLL_PAUSE_TIME = 1

# urlretrieve() uses the globally installed opener, so install the
# browser-like User-Agent once instead of once per image.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)

# NOTE: the find_element(s)_by_* helpers below are the Selenium 3 API, which
# matches the selenium==3.141.0 pin in requirements.txt.
for i in get_keywords():

    driver = webdriver.Chrome(chromedriver_path)
    try:
        # Open Google image search and submit the keyword.
        driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
        elem = driver.find_element_by_name("q")
        elem.send_keys(i)
        elem.send_keys(Keys.RETURN)

        # Keep scrolling to the bottom until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height is unchanged: try clicking the ".mye4qd" element
                # (presumably the "show more results" button); when the click
                # fails, assume there is nothing more to load and stop.
                try:
                    driver.find_element_by_css_selector(".mye4qd").click()
                except Exception:
                    # Exception (not a bare except) so Ctrl+C still aborts.
                    break
            last_height = new_height

        images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
        count = 1

        # Make sure the per-keyword output folder exists.
        createFolder("python/model/" + i)

        for image in images:
            # Stop once the per-keyword quota has been reached. Checking
            # before the download also makes crawling_num = 0 fetch nothing.
            if count > crawling_num:
                break
            try:
                # Click the thumbnail and give the preview pane time to load
                # before reading the image URL from it.
                image.click()
                time.sleep(2)
                imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
                urllib.request.urlretrieve(imgUrl, "python/model/" + i + "/" + str(count) + ".jpg")
                count = count + 1
            except Exception as exc:
                # Best effort: report the failed image and move on to the
                # next thumbnail instead of aborting the whole crawl.
                print('Skipping image for {}: {}'.format(i, exc))
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window), and the finally block guarantees it runs even
        # when the crawl for this keyword fails.
        driver.quit()
... | \ No newline at end of file | ... | \ No newline at end of file |
python/keywords.txt
0 → 100644
python/requirements.txt
0 → 100644
1 | +selenium==3.141.0 | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment