crawling

최인훈
Commit 92e2debc5ac2e42fd76220761113fe8b104197c1 92e2debc 1 parent b492d7d3
Showing 12 changed files with 389 additions and 34 deletions
111.py
E_coffee.json
KhuPang
crawling.py
homeplus.py
json/coffee.json
json/snack.json
json/water.json
lottemart.py
main.py
practice.py
pymysql.py
--- a/111.py 0 → 100644
View file @92e2deb
+++ b/111.py 0 → 100644
View file @92e2deb
import json

# Scratch script: load the crawled water products and echo them so the JSON
# can be inspected by eye.
with open('water.json', 'r', encoding='utf-8') as read_file:
    a = json.load(read_file)
    # The original printed `aa`, an undefined name, which raised NameError.
    print(a)
\ No newline at end of file
--- a/E_coffee.json 0 → 100644
View file @92e2deb
+++ b/E_coffee.json 0 → 100644
View file @92e2deb
+ {}
\ No newline at end of file
--- a/KhuPang @ 12d49c19
+++ b/KhuPang @ 12d49c19
+ Subproject commit 12d49c195cc74e4ed59ccc44029a6fc4cafd5f6c
--- a/crawling.py
View file @92e2deb
+++ b/crawling.py
View file @92e2deb
--- a/homeplus.py 0 → 100644
View file @92e2deb
+++ b/homeplus.py 0 → 100644
View file @92e2deb
+ from selenium import webdriver
+ from bs4 import BeautifulSoup
+ import requests
+ import urllib.request
+ import time
+ 
+ 
def _strip_quantity(text, marker_pos):
    """Cut ``text`` just after the last non-digit that precedes ``marker_pos``.

    Walks left from the character before the unit marker, skipping ASCII
    digits, and stops before index 0 (the first character is never examined).
    If no non-digit is found the text is returned unchanged, so the caller
    never works with an undefined cut position.
    """
    for idx in range(marker_pos - 1, 0, -1):
        if not '0' <= text[idx] <= '9':
            return text[:idx + 1]
    return text


def name_preprocessing(a):
    """Normalise a scraped product name so the same product compares equal.

    Steps, in order:
      1. drop one leading space and every underscore;
      2. if a ``'G'`` occurs past index 3, cut the name before the digits
         that precede it (the weight);
      3. if an ``'L'`` occurs, cut the name before the digits that precede
         the first ``'M'`` (if any) or else the first ``'L'`` (the volume);
      4. if the result ends in ``'.'`` drop that dot and the character before
         it (the integer part left over from a decimal quantity like 2.0L);
      5. if the name starts with ``'('`` drop the leading parenthesised tag.

    Args:
        a: Raw product name text.

    Returns:
        The cleaned name. Trailing whitespace left by a cut is kept, as
        before, so previously stored names keep matching.
    """
    if a.startswith(' '):
        a = a[1:]
    # str.replace already removes every occurrence; no counting loop needed.
    a = a.replace('_', '')

    gram_pos = a.find('G')
    if gram_pos > 3:
        a = _strip_quantity(a, gram_pos)

    if 'L' in a:
        milli_pos = a.find('M')
        marker_pos = milli_pos if milli_pos != -1 else a.find('L')
        a = _strip_quantity(a, marker_pos)

    if a.endswith('.'):
        a = a[:-2]

    if a.startswith('('):
        # find() returns -1 when there is no ')', which leaves the name as is.
        a = a[a.find(')') + 1:]
    return a
+ 
+ 
def unit_preprocessing(a, b):
    """Parse a unit-price label and return the price per 100 units.

    The label is expected to look like ``'<amount><unit>당 <price>원'``
    (for example ``'100ML당 29원'``): the number before the unit marker is
    the reference amount, and the number between ``'당'`` and ``'원'`` is the
    price for that amount.

    Args:
        a: Unit-price text; one leading space is ignored.
        b: Unit marker to look for, e.g. ``'ML'`` or ``'G'``.

    Returns:
        ``price * 100 // amount`` as an int.

    Raises:
        ValueError: if a marker is missing or a number cannot be parsed.
        ZeroDivisionError: if the reference amount is 0.
    """
    text = a[1:] if a.startswith(' ') else a

    unit_pos = text.find(b)
    per_pos = text.find('당')
    won_pos = text.find('원')
    if unit_pos == -1 or per_pos == -1 or won_pos == -1:
        raise ValueError('unrecognised unit price text: %r' % (a,))

    amount = int(text[:unit_pos])

    # Skip '당' plus the single character after it.
    # NOTE(review): assumes exactly one separator character follows '당'.
    price_text = text[per_pos + 2:won_pos]
    # Keep only the integer part of a decimal price, then drop separators.
    price_text = price_text.split('.', 1)[0].replace(',', '')
    price = int(price_text)

    # Integer arithmetic: the float form int(price / amount * 100) loses a
    # unit on values such as 29 / 100 * 100 == 28.999999999999996.
    return price * 100 // amount
+ 
+ 
def _crawl_homeplus(H, url, page_num, unit_marker, skip_markers, label):
    """Scrape one Homeplus category page and merge its products into ``H``.

    Args:
        H: List of product dicts (keys ``mart``, ``name``, ``price``,
            ``unit``, ``img``); updated in place.
        url: Category listing URL to open.
        page_num: Number of result pages wanted; the "more" link is clicked
            ``page_num - 1`` times.
        unit_marker: Unit string handed to ``unit_preprocessing``.
        skip_markers: Products whose unit text contains any of these strings
            are ignored.
        label: Heading printed before the final dump of ``H``.
    """
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    # NOTE(review): the positional driver path and find_element_by_xpath are
    # the legacy Selenium API; confirm the installed Selenium still has them.
    driver = webdriver.Chrome(r'C:\chromedriver')
    try:
        driver.get(url)
        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            # Fixed wait before the next click, as in the original code.
            time.sleep(10)
        html = driver.page_source
    finally:
        # Always shut the browser down, even if a click fails.
        driver.quit()

    page = BeautifulSoup(html, 'html.parser')
    for group in page.find_all('ul', {'class': 'fix-ty2'}):
        for item in group.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue
            price_box = info.find('span', {'class': 'price'})
            if price_box is None:
                continue
            unit_tag = price_box.find('span', {'class': 'unit-area'})
            buy_tag = price_box.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            unit_text = unit_tag.text
            if any(marker in unit_text for marker in skip_markers):
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_text, unit_marker)

            # Look for this product among the Homeplus entries already held;
            # the last match wins, as in the original index scan.
            existing = None
            for product in H:
                if product['name'] == name and product['mart'] == 'homeplus':
                    existing = product

            if existing is not None:
                # Same product seen again: keep the cheaper unit price.
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            image = thumb.find('img')
            if image is None or not image.get('src'):
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                'img': 'http:' + image['src'],
            })

    print(label)
    print(H)
    print(len(H))


def home_water(H):
    """Crawl the Homeplus category with cid=60386 (3 pages) into ``H``.

    Unit prices are parsed with the 'ML' marker.
    """
    _crawl_homeplus(
        H,
        'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60386&WT.ac=GNBctg_2dep_ctg',
        page_num=3,
        unit_marker='ML',
        skip_markers=(),
        label='홈플러스 물',
    )


def home_snack(H):
    """Crawl the Homeplus category with cid=60343 (17 pages) into ``H``.

    Items whose unit text contains 'ML' or '개' are skipped; the rest are
    parsed with the 'G' marker.
    """
    _crawl_homeplus(
        H,
        'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60343&WT.ac=GNBctg_2dep_ctg',
        page_num=17,
        unit_marker='G',
        skip_markers=('ML', '개'),
        label='홈플러스 과자',
    )


def home_coffee(H):
    """Crawl the Homeplus category with cid=60406 (2 pages) into ``H``.

    Items whose unit text contains 'G' are skipped; the rest are parsed
    with the 'ML' marker.
    """
    _crawl_homeplus(
        H,
        'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60406&WT.ac=GNBctg_3dep_ctg',
        page_num=2,
        unit_marker='ML',
        skip_markers=('G',),
        label='홈플러스 커피',
    )
+ 
+ 
# Accumulators for manual test runs of the crawlers below. The crawlers call
# H.append(...) and index H by position, so these must be lists, not dicts.
H_water = []
H_snack = []
H_coffee = []
#home_water(H_water)
#home_snack(H_snack)
#home_coffee(H_coffee)
\ No newline at end of file
--- a/json/coffee.json 0 → 100644
View file @92e2deb
+++ b/json/coffee.json 0 → 100644
View file @92e2deb
--- a/json/snack.json 0 → 100644
View file @92e2deb
+++ b/json/snack.json 0 → 100644
View file @92e2deb
--- a/json/water.json 0 → 100644
View file @92e2deb
+++ b/json/water.json 0 → 100644
View file @92e2deb
--- a/lottemart.py 0 → 100644
View file @92e2deb
+++ b/lottemart.py 0 → 100644
View file @92e2deb
--- a/main.py 0 → 100644
View file @92e2deb
+++ b/main.py 0 → 100644
View file @92e2deb
+ import requests
+ import time
+ from bs4 import BeautifulSoup
+ import json
+ from selenium import webdriver
+ 
+ from crawling import emart_water, emart_snack, emart_coffee
+ from lottemart import lotte_water, lotte_snack, lotte_coffee
+ from homeplus import home_coffee, home_snack, home_water
+ 
+ 
water = []
snack = []
coffee = []

temp_dict = {}

# Resume from the previous crawl result when there is one. On a first run the
# file does not exist yet, so start from an empty product list instead of
# crashing.
try:
    with open('coffee.json', 'r', encoding='utf-8') as read_file:
        temp_dict = json.load(read_file)
except FileNotFoundError:
    temp_dict = {}

# An empty JSON object (no 'product' key) also means "nothing crawled yet".
coffee = temp_dict.get('product', [])

#emart_water(water)
#emart_snack(snack)
#emart_coffee(coffee)

#lotte_water(water)
#lotte_snack(snack)
lotte_coffee(coffee)

#home_water(water)
#home_snack(snack)
#home_coffee(coffee)

#water_dict = {}
#water_dict['product'] = water
#snack_dict = {}
#snack_dict['product'] = snack
coffee_dict = {}
coffee_dict['product'] = coffee

# Write the merged list back under the same 'product' key it was read from.
with open('coffee.json', 'w', encoding='utf-8') as make_file:
    json.dump(coffee_dict, make_file, ensure_ascii=False, indent='\t')


'''
with open('water.json', 'w', encoding='utf-8') as make_file:
    json.dump(water_dict, make_file, ensure_ascii=False, indent='\t')

with open('snack.json', 'w', encoding='utf-8') as make_file:
    json.dump(snack_dict, make_file, ensure_ascii=False, indent='\t')


with open('coffee.json', 'w', encoding='utf-8') as make_file:
    json.dump(coffee_dict, make_file, ensure_ascii=False, indent='\t')
'''
\ No newline at end of file
--- a/practice.py
View file @92e2deb
+++ b/practice.py
View file @92e2deb
- a = '[노브랜드] 미네랄 워터 생수 (2L x 6개입)'
- while True:
-     b = a.find('[')
-     c = a.find(']')
-     if b == -1:
-         break
-     elif b == 0:
-         a = a[c + 1:-1]
-     else:
-         break
- while True:
-     d = a.find('(')
-     e = a.find(')')
-     if d == -1:
-         break
-     elif d == 0:
-         a = a[e + 1:-1]
-     else:
-         break
- 
- result = ''
- b = list(a)
- for i in b:
-     if (ord(i) < 48) or (ord(i) > 57):
-         if i == ' ':
-             continue
-         result += i
-     else:
-         break
- while True:
-     if (result[-1] == '(') or (result[-1] == '[') or (result[-1] == ' '):
-         result = result[0:-1]
-     else:
-         break
+ import requests
+ import time
+ from bs4 import BeautifulSoup
+ import json
+ from selenium import webdriver
+ 
+ 
def qweq(a):
    """Append two sample emart products to ``a`` and print its new length.

    Both entries share the same mart and name and differ only in price
    (12, then 22). ``a`` is modified in place; nothing is returned.
    """
    for sample_price in (12, 22):
        a.append({'mart': 'emart', 'name': '삼다수', 'price': sample_price})
    print(len(a))
+ 
# Demo: qweq() mutates the list it is given, so the two appended product
# dicts show up in `a` after the call.
a = ['asas'] * 2
qweq(a)
print(a)
+ 
+ 
+ 
 
--- a/pymysql.py 0 → 100644
View file @92e2deb
+++ b/pymysql.py 0 → 100644
View file @92e2deb
+ import pymysql
+ import mysql.connector as mysql
+ 
+ 
+ 
# Connectivity smoke test: connect to the local MySQL server and list its
# databases.
# NOTE(review): this file is itself named pymysql.py, so when run as a script
# `import pymysql` likely resolves to this file instead of the installed
# package; consider renaming it.
db = pymysql.connect(
    host='localhost',
    user='root',
    passwd='111111'  # NOTE(review): hard-coded root password; move to config/env.
)

try:
    print(db)

    cursor = db.cursor()
    try:
        cursor.execute("SHOW DATABASES")
        databases = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()

    print(databases)

    for database in databases:
        print(database)
finally:
    # The original never closed the connection.
    db.close()
\ No newline at end of file