google_image_crwaling.py
1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import sys
import urllib.parse
import urllib.request
from io import StringIO

import requests
from lxml.html import parse
from PIL import Image
# Google image-search scraper: prompts for a keyword, fetches the result page,
# and saves the <img> elements at positions 1..10 of the page as
# "<keyword><n>.jpg" in the current working directory.

# Ask the user for the image search keyword.
keyword = input("검색할 이미지를 입력하세요 : ")

# Percent-encode the keyword so spaces, '&', '#' and non-ASCII text cannot
# corrupt or truncate the query string.
url = (
    'https://www.google.co.kr/search?q=' + urllib.parse.quote_plus(keyword)
    + '&source=lnms&tbm=isch&sa=X&ved=0ahUKEwic-taB9IXVAhWDHpQKHXOjC14Q_AUIBigB&biw=1842&bih=990'
)

# Fetch the HTML source of the result page. The timeout keeps the script from
# hanging indefinitely on a stalled connection.
text = requests.get(url, timeout=10).text

# Parse the HTML text into a document tree.
text_source = StringIO(text)
parsed = parse(text_source)

# Root node of the parsed document.
doc = parsed.getroot()

# Image locations live in the src attribute of every <img> element.
imgs = doc.findall('.//img')
img_list = []  # image URLs that were found, in page order

# Same selection as the original counter logic (0 < cnt < 11): the <img> at
# index 0 is skipped and the following ten are processed, numbered 1..10.
# NOTE(review): index 0 is presumably page chrome (e.g. a logo) - confirm.
# Slicing also stops the loop once the ten candidates have been handled.
for cnt, a in enumerate(imgs[1:11], start=1):
    image_url = a.get('src')
    filename = keyword + str(cnt) + '.jpg'

    # An <img> without a src cannot be downloaded; skip it instead of letting
    # urlopen() fail on None and abort the remaining downloads.
    if not image_url:
        print(filename + ' skipped: no src attribute')
        continue
    img_list.append(image_url)

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(image_url, timeout=10) as ud:
            binary = ud.read()
    except (OSError, ValueError) as err:
        # URLError/HTTPError and socket timeouts are OSError subclasses;
        # ValueError is raised for URLs urlopen cannot interpret (e.g. a
        # relative path). One bad image should not stop the others.
        print(filename + ' download failed: ' + str(err))
        continue

    # 'wb' is portable (os.O_BINARY only exists on Windows), truncates any
    # previous file of the same name, and the with-block guarantees the file
    # is closed even if the write raises.
    with open(filename, 'wb') as fd:
        fd.write(binary)
    print(filename + ' download complete')