최인훈

crawling

import json
# Load the previously crawled water products and print them as a sanity check.
with open('water.json', 'r', encoding='utf-8') as read_file:
    a = json.load(read_file)
# The original printed the undefined name `aa`, which raised NameError.
print(a)
\ No newline at end of file
{}
\ No newline at end of file
KhuPang @ 12d49c19
Subproject commit 12d49c195cc74e4ed59ccc44029a6fc4cafd5f6c
This diff is collapsed. Click to expand it.
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import urllib.request
import time
def _cut_before_quantity(text, marker_pos):
    """Truncate ``text`` just after the last non-digit before ``marker_pos``.

    The scan runs backwards from the character preceding the unit marker and
    deliberately never inspects index 0.  When only ASCII digits are found,
    ``text`` is returned unchanged instead of failing on an unset index.
    """
    for i in range(marker_pos - 1, 0, -1):
        if not ('0' <= text[i] <= '9'):
            return text[:i + 1]
    return text


def name_preprocessing(a):
    """Normalise a raw product title so equal products compare equal.

    Steps, in order:
      1. drop one leading space and every underscore;
      2. if ``'G'`` occurs past index 3, cut the digits that precede it
         (and everything after them);
      3. if ``'L'`` occurs, do the same relative to the first ``'M'`` when
         there is one, otherwise relative to the first ``'L'``;
      4. if the result ends with ``'.'``, drop that dot and the character
         before it (the integer part of a decimal quantity such as "2.0L");
      5. if the result starts with ``'('``, drop everything up to and
         including the first ``')'``.

    Args:
        a: raw title text.

    Returns:
        The cleaned title.  Trailing spaces left by the cuts are kept.
    """
    if a.startswith(' '):
        a = a[1:]
    a = a.replace('_', '')

    gram_pos = a.find('G')
    if gram_pos > 3:
        a = _cut_before_quantity(a, gram_pos)

    if 'L' in a:
        milli_pos = a.find('M')
        marker_pos = milli_pos if milli_pos != -1 else a.find('L')
        a = _cut_before_quantity(a, marker_pos)

    if a.endswith('.'):
        a = a[:-2]
    if a.startswith('('):
        # find() returns -1 when ')' is missing, which leaves the text as is.
        a = a[a.find(')') + 1:]
    return a
def unit_preprocessing(a, b):
    """Convert a unit-price label into the price per 100 units.

    The label is expected to look like ``"100ML당 25원"``: a quantity, the
    unit marker ``b``, the character ``당``, a price and the character ``원``.

    Args:
        a: unit-price label text.
        b: unit marker that terminates the quantity (e.g. ``'ML'`` or ``'G'``).

    Returns:
        ``price * 100 // quantity`` as an int.  Any fractional part of the
        price is discarded before the calculation.

    Raises:
        ValueError: if the quantity or the price is not a number.
        ZeroDivisionError: if the quantity is zero.
    """
    text = a.strip()
    quantity = int(text[:text.find(b)].replace(',', ''))

    # Take whatever sits between '당' and '원'; stripping tolerates labels
    # with or without a space after '당'.
    price_text = text[text.find('당') + 1:text.find('원')].strip()
    price_text = price_text.split('.', 1)[0].replace(',', '')

    # Integer arithmetic: int(29 / 100 * 100) evaluates to 28 because of
    # float rounding, which under-reported the unit price.
    return int(price_text) * 100 // quantity
def home_water(H):
    """Crawl the Homeplus water category and merge the products into ``H``.

    Opens the category page in Chrome, clicks the ``a.more`` link
    ``page_num - 1`` times (waiting 10 s after each click), then parses every
    ``<li>`` inside the ``ul.fix-ty2`` lists.  A product whose cleaned name is
    already in ``H`` for mart ``'homeplus'`` is not added again; its price and
    unit price are replaced only when the new unit price is lower.

    Args:
        H: list of product dicts with keys ``mart``, ``name``, ``price``,
            ``unit`` and ``img``.  Mutated in place.
    """
    page_num = 3
    # Raw string: '\c' is not a valid escape sequence.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60386&WT.ac=GNBctg_2dep_ctg')
        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)
        html = driver.page_source
    finally:
        # Always release the browser, even when navigation fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue
            price_tag = info.find('span', {'class': 'price'})
            if price_tag is None:
                continue
            unit_tag = price_tag.find('span', {'class': 'unit-area'})
            buy_tag = price_tag.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_tag.text, 'ML')

            # Keep the last matching entry, as the original index scan did.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry
            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or not img.get('src'):
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The scheme is prepended unconditionally, as before.
                'img': 'http:' + img['src'],
            })
    print('홈플러스 물')
    print(H)
    print(len(H))
def home_snack(H):
    """Crawl the Homeplus snack category and merge the products into ``H``.

    Opens the category page in Chrome, clicks the ``a.more`` link
    ``page_num - 1`` times (waiting 10 s after each click), then parses every
    ``<li>`` inside the ``ul.fix-ty2`` lists.  Items whose unit label contains
    ``'ML'`` or ``'개'`` are skipped.  A product whose cleaned name is already
    in ``H`` for mart ``'homeplus'`` is not added again; its price and unit
    price are replaced only when the new unit price is lower.

    Args:
        H: list of product dicts with keys ``mart``, ``name``, ``price``,
            ``unit`` and ``img``.  Mutated in place.
    """
    page_num = 17
    # Raw string: '\c' is not a valid escape sequence.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60343&WT.ac=GNBctg_2dep_ctg')
        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)
        html = driver.page_source
    finally:
        # Always release the browser, even when navigation fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue
            price_tag = info.find('span', {'class': 'price'})
            if price_tag is None:
                continue
            unit_tag = price_tag.find('span', {'class': 'unit-area'})
            buy_tag = price_tag.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            unit_text = unit_tag.text
            # Only labels that pass both checks are parsed with the 'G' marker.
            if 'ML' in unit_text or '개' in unit_text:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_text, 'G')

            # Keep the last matching entry, as the original index scan did.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry
            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or not img.get('src'):
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The scheme is prepended unconditionally, as before.
                'img': 'http:' + img['src'],
            })
    print('홈플러스 과자')
    print(H)
    print(len(H))
def home_coffee(H):
    """Crawl the Homeplus coffee category and merge the products into ``H``.

    Opens the category page in Chrome, clicks the ``a.more`` link
    ``page_num - 1`` times (waiting 10 s after each click), then parses every
    ``<li>`` inside the ``ul.fix-ty2`` lists.  Items whose unit label contains
    ``'G'`` are skipped.  A product whose cleaned name is already in ``H`` for
    mart ``'homeplus'`` is not added again; its price and unit price are
    replaced only when the new unit price is lower.

    Args:
        H: list of product dicts with keys ``mart``, ``name``, ``price``,
            ``unit`` and ``img``.  Mutated in place.
    """
    page_num = 2
    # Raw string: '\c' is not a valid escape sequence.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60406&WT.ac=GNBctg_3dep_ctg')
        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)
        html = driver.page_source
    finally:
        # Always release the browser, even when navigation fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue
            price_tag = info.find('span', {'class': 'price'})
            if price_tag is None:
                continue
            unit_tag = price_tag.find('span', {'class': 'unit-area'})
            buy_tag = price_tag.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            unit_text = unit_tag.text
            # Only labels without 'G' are parsed with the 'ML' marker.
            if 'G' in unit_text:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_text, 'ML')

            # Keep the last matching entry, as the original index scan did.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry
            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or not img.get('src'):
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The scheme is prepended unconditionally, as before.
                'img': 'http:' + img['src'],
            })
    print('홈플러스 커피')
    print(H)
    print(len(H))
# Per-category accumulators for running the crawlers by hand.
# They must be lists: every home_* crawler calls H.append(), which a dict
# does not provide.
H_water = []
H_snack = []
H_coffee = []
# Uncomment to crawl a category when running this module directly.
#home_water(H_water)
#home_snack(H_snack)
#home_coffee(H_coffee)
\ No newline at end of file
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
import requests
import time
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from crawling import emart_water, emart_snack, emart_coffee
from lottemart import lotte_water, lotte_snack, lotte_coffee
from homeplus import home_coffee, home_snack, home_water
# Accumulators for the categories that are not refreshed in this run.
water = []
snack = []

# Start from the coffee products saved by the previous run.
with open('coffee.json', 'r', encoding='utf-8') as read_file:
    temp_dict = json.load(read_file)
coffee = temp_dict['product']

# Crawlers are switched on and off by (un)commenting the calls below.
# NOTE(review): the emart_* and lotte_* crawlers presumably update the list
# they receive in place, like the home_* ones -- confirm in their modules.
#emart_water(water)
#emart_snack(snack)
#emart_coffee(coffee)
#lotte_water(water)
#lotte_snack(snack)
lotte_coffee(coffee)
#home_water(water)
#home_snack(snack)
#home_coffee(coffee)

#water_dict = {'product': water}
#snack_dict = {'product': snack}
coffee_dict = {'product': coffee}

# Write the coffee list back to the file it was loaded from.
with open('coffee.json', 'w', encoding='utf-8') as make_file:
    json.dump(coffee_dict, make_file, ensure_ascii=False, indent='\t')

# Disabled: export of all three categories.
# with open('water.json', 'w', encoding='utf-8') as make_file:
#     json.dump(water_dict, make_file, ensure_ascii=False, indent='\t')
# with open('snack.json', 'w', encoding='utf-8') as make_file:
#     json.dump(snack_dict, make_file, ensure_ascii=False, indent='\t')
# with open('coffee.json', 'w', encoding='utf-8') as make_file:
#     json.dump(coffee_dict, make_file, ensure_ascii=False, indent='\t')
\ No newline at end of file
# Scratch experiment: reduce a raw product title to a bare product name.
a = '[노브랜드] 미네랄 워터 생수 (2L x 6개입)'

# Drop leading "[...]" tags.  The original sliced with [c + 1:-1], which
# also threw away the last character of the title.
while a.startswith('['):
    c = a.find(']')
    if c == -1:
        break
    a = a[c + 1:]

# Drop leading "(...)" groups in the same way.
while a.startswith('('):
    e = a.find(')')
    if e == -1:
        break
    a = a[e + 1:]

# Copy characters up to the first ASCII digit, skipping spaces.
result = ''
for i in a:
    if '0' <= i <= '9':
        break
    if i != ' ':
        result += i

# Trim an opening bracket or space left dangling at the end.  Checking
# `result` first avoids an IndexError when nothing was copied.
while result and result[-1] in ('(', '[', ' '):
    result = result[:-1]
import requests
import time
from bs4 import BeautifulSoup
import json
from selenium import webdriver
def qweq(a):
    """Append two hard-coded sample product dicts to ``a`` and print its length.

    Both samples have mart ``'emart'`` and name ``'삼다수'``; their prices are
    12 and 22.  ``a`` is mutated in place and nothing is returned.
    """
    samples = (
        {'mart': 'emart', 'name': '삼다수', 'price': 12},
        {'mart': 'emart', 'name': '삼다수', 'price': 22},
    )
    for record in samples:
        a.append(record)
    print(len(a))
# Call qweq on a throwaway list and print the list afterwards to show the
# in-place mutation.
a = ['asas'] * 2
qweq(a)
print(a)
......
import pymysql
import mysql.connector as mysql
# Connect to the local MySQL server and list its databases.
# NOTE(review): the root password is hard-coded; move it to configuration
# before this leaves a development machine.
db = pymysql.connect(
    host='localhost',
    user='root',
    passwd='111111',
)
try:
    print(db)
    # The cursor is closed as soon as the rows have been fetched.
    with db.cursor() as cursor:
        cursor.execute("SHOW DATABASES")
        databases = cursor.fetchall()
    print(databases)
    for database in databases:
        print(database)
finally:
    # The original never closed the connection.
    db.close()
\ No newline at end of file