update multiprocessing for crawling

- 셀레니움은 직접 크롬드라이버를 통해 HTML을 읽기 때문에 느리고, 크롬창이 켜진다. - 이를 보완하기 위해 headless 옵션을 통해 크롬창이 보이지 않게 해주었다. - 멀티프로세싱을 통해 한 번에 3개의 크롬드라이버를 띄워 단어 3개를 동시에 크롤링할 수 있게 했다.

update multiprocessing for crawling
- 셀레니움은 직접 크롬드라이버를 통해 HTML을 읽기 때문에 느리고, 크롬창이 켜진다. - 이를 보완하기 위해 headless 옵션을 통해 크롬창이 보이지 않게 해주었다. - 멀티프로세싱을 통해 한 번에 3개의 크롬드라이버를 띄워 단어 3개를 동시에 크롤링할 수 있게 했다.
김태영
Commit adc78f9ced2d5182d4ef4fd3d1a7c24e2e35ed6d adc78f9c 1 parent f4d382cb
Showing 2 changed files with 28 additions and 16 deletions
python/google.py
python/requirements.txt
--- a/python/google.py
View file @adc78f9
+++ b/python/google.py
View file @adc78f9
@@ -3,17 +3,23 @@ from selenium.webdriver.common.keys import Keys
 import time
 import urllib.request
 import os
+from multiprocessing import Pool #멀티프로세싱
 # 구글 드라이버를 크롬 버전에 맞게 설치하고 경로를 입력해준다.
 # 이곳에 크롬 드라이버 경로를 입력해주세요.
-
 chromedriver_path = "/Users/kimtaeyoung/Desktop/project/opensource_termproject/2020-02-OSS-TermProject/python/chromedriver"
-
 # 몇개의 파일을 크롤링할지
 crawling_num = 500
-# 폴더 생성해주는 함수
+# headless 셀레니움
+# 크롬 드라이버에 적용할 옵션들
+options = webdriver.ChromeOptions()
+options.add_argument('headless')
+options.add_argument('window-size=1920x1080')
+options.add_argument("disable-gpu")
+# 혹은 options.add_argument("--disable-gpu")
+# 폴더를 확인하고 없다면 만들어준다.
 def createFolder(directory):
     try:
         if not os.path.exists(directory):
@@ -23,7 +29,6 @@ def createFolder(directory):
 # 크롤링 할 단어들을 받아온다. 
 # 크롤링할 단어는 keywords.txt에 입력하면 된다. 
-
 def get_keywords(keywords_file='python/keywords.txt'):
         # read search keywords from file
         with open(keywords_file, 'r', encoding='utf-8-sig') as f:
@@ -41,16 +46,13 @@ def get_keywords(keywords_file='python/keywords.txt'):
         return keywords
-#모델 폴더를 만들어준다. 
+# 크롤링
-
+# get_keywords의 결과를 파라미터로 넣어주면 된다. 
-createFolder("python/model/")
+def crawling(search_name):
-
+    driver = webdriver.Chrome(chromedriver_path, chrome_options=options) #headless를 위한 옵션을 추가
-for i in get_keywords() :
-   
-    driver = webdriver.Chrome(chromedriver_path)
     driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
     elem = driver.find_element_by_name("q")
-    elem.send_keys(i)
+    elem.send_keys(search_name)
     elem.send_keys(Keys.RETURN)
     SCROLL_PAUSE_TIME = 1
@@ -74,17 +76,18 @@ for i in get_keywords() :
     count = 1
     #폴더가 없다면 폴더를 만들어준다. 
-    createFolder("python/model/"+i)
+    createFolder("python/model/"+search_name)
     for image in images:
         try:
             image.click()
+            # 이미지가 로딩되는 속도. 안정적으로는 2-3초가 적당.
             time.sleep(2)
             imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
             opener=urllib.request.build_opener()
             opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
             urllib.request.install_opener(opener)
-            urllib.request.urlretrieve(imgUrl, "python/model/"+i+"/"+str(count) + ".jpg")
+            urllib.request.urlretrieve(imgUrl, "python/model/"+search_name+"/"+str(count) + ".jpg")
             count = count + 1
             # 크롤링할 개수 설정. 
@@ -93,4 +96,13 @@ for i in get_keywords() :
         except:
             pass
-    driver.close()
\ No newline at end of file
+    # 끝나고 크롬 드라이버를 종료해준다
+    driver.close()
+
+if __name__ == '__main__':
+
+    #모델 폴더를 만들어준다. 
+    createFolder("python/model/")
+    
+    pool = Pool(processes=3) # 3개의 프로세스를 사용합니다.
+    pool.map(crawling, get_keywords()) 
\ No newline at end of file
--- a/python/requirements.txt
View file @adc78f9
+++ b/python/requirements.txt
View file @adc78f9
-selenium==3.141.0
\ No newline at end of file
+selenium
\ No newline at end of file