# homeplus.py 8.34 KB
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import urllib.request
import time


def _cut_at_quantity(text, unit_pos):
    """Cut *text* right after the last non-digit character before *unit_pos*.

    Scans backwards from ``unit_pos - 1`` down to index 1 (index 0 is never
    examined) for the first character that is not an ASCII digit and returns
    everything up to and including it. When no such character exists the
    text is returned unchanged instead of failing.
    """
    for idx in range(unit_pos - 1, 0, -1):
        if not '0' <= text[idx] <= '9':
            return text[:idx + 1]
    return text


def name_preprocessing(a):
    """Normalize a raw product name by stripping quantity/unit decorations.

    Steps, in order:
      1. Drop a single leading space and remove every underscore.
      2. If the first 'G' sits past index 3, cut the name at the start of
         the digit run that precedes it.
      3. If the name contains 'L', do the same relative to the first 'M'
         when one exists, otherwise relative to the first 'L'.
      4. If the result ends with '.', drop the last two characters
         (e.g. the "2." left over after cutting "2.0L").
      5. If the name starts with '(', drop everything through the first ')'.

    Args:
        a: Raw product name text. May be empty.

    Returns:
        The cleaned name. Trailing whitespace is not stripped.
    """
    if a.startswith(' '):
        a = a[1:]
    # A single replace removes all underscores; no need to loop per count.
    a = a.replace('_', '')

    gram_pos = a.find('G')
    if gram_pos > 3:
        a = _cut_at_quantity(a, gram_pos)

    if 'L' in a:
        milli_pos = a.find('M')
        unit_pos = milli_pos if milli_pos != -1 else a.find('L')
        a = _cut_at_quantity(a, unit_pos)

    # endswith/startswith are safe on an empty string, unlike a[-1] / a[0].
    if a.endswith('.'):
        a = a[:-2]

    if a.startswith('('):
        # find() returns -1 when ')' is missing, which leaves the name as is.
        a = a[a.find(')') + 1:]
    return a


def unit_preprocessing(a, b):
    """Convert a unit-price label into an integer price per 100 units.

    The slicing below implies a label shaped like
    ``"<quantity><unit>당 <price>원"`` (for example ``"100ML당 29원"``);
    NOTE(review): confirm against the live page markup.

    Args:
        a: Unit-price label text; one leading space is ignored.
        b: Unit marker that follows the quantity, e.g. ``'ML'`` or ``'G'``.

    Returns:
        ``price * 100 // quantity`` as an int. Any fractional part of the
        price (after '.') is discarded before the calculation.

    Raises:
        ValueError: If the unit marker, '당' or '원' is missing, or the
            quantity/price text is not an integer.
        ZeroDivisionError: If the quantity is zero.
    """
    if a.startswith(' '):
        a = a[1:]

    unit_pos = a.find(b)
    per_pos = a.find('당')
    won_pos = a.find('원')
    if unit_pos == -1 or per_pos == -1 or won_pos == -1:
        # Without this check find()'s -1 would silently slice the wrong text.
        raise ValueError('unrecognized unit price label: %r' % (a,))

    quantity = int(a[:unit_pos])

    # Skip '당' and the single character after it, stop before '원'.
    price_text = a[per_pos + 2:won_pos]
    price_text = price_text.partition('.')[0]
    price = int(price_text.replace(',', ''))

    # Integer arithmetic: the float form int(price / quantity * 100) turns
    # 29/100*100 into 28.999... and truncates to 28. Floor division matches
    # truncation for the non-negative values a price label carries.
    return price * 100 // quantity


def home_water(H):
    """Scrape the Homeplus water category page and merge its items into *H*.

    Opens the category listing (cid=60386) in Chrome, clicks the "more" link
    ``page_num - 1`` times with a 10 second pause after each click, then
    parses the resulting HTML. For every product that has a price block,
    a unit-price label and a name:

      * if *H* has no entry with the same name and ``mart == 'homeplus'``,
        a new dict with keys ``mart``, ``name``, ``price``, ``unit`` and
        ``img`` is appended (items without a thumbnail image are skipped);
      * otherwise the existing entry's ``price`` and ``unit`` are replaced
        when the newly scraped unit price is lower.

    Args:
        H: List of product dicts, mutated in place.

    Returns:
        None. The final list and its length are printed.
    """
    page_num = 3
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60386&WT.ac=GNBctg_2dep_ctg')

        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)

        html = driver.page_source
    finally:
        # Always close the browser, even when navigation or a click fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue

            price_box = info.find('span', {'class': 'price'})
            if price_box is None:
                continue
            unit_tag = price_box.find('span', {'class': 'unit-area'})
            buy_tag = price_box.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_tag.text, 'ML')

            # No early break: if several entries match, the last one wins.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry

            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or img.get('src') is None:
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The src is prefixed as if it were protocol-relative.
                'img': 'http:' + img['src'],
            })

    print('홈플러스 물')
    print(H)
    print(len(H))


def home_snack(H):
    """Scrape the Homeplus snack category page and merge its items into *H*.

    Opens the category listing (cid=60343) in Chrome, clicks the "more" link
    ``page_num - 1`` times with a 10 second pause after each click, then
    parses the resulting HTML. Products whose unit-price label contains
    'ML' or '개' are skipped; the rest are priced per 'G'. For every
    remaining product that has a price block, a unit-price label and a name:

      * if *H* has no entry with the same name and ``mart == 'homeplus'``,
        a new dict with keys ``mart``, ``name``, ``price``, ``unit`` and
        ``img`` is appended (items without a thumbnail image are skipped);
      * otherwise the existing entry's ``price`` and ``unit`` are replaced
        when the newly scraped unit price is lower.

    Args:
        H: List of product dicts, mutated in place.

    Returns:
        None. The final list and its length are printed.
    """
    page_num = 17
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60343&WT.ac=GNBctg_2dep_ctg')

        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)

        html = driver.page_source
    finally:
        # Always close the browser, even when navigation or a click fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue

            price_box = info.find('span', {'class': 'price'})
            if price_box is None:
                continue
            unit_tag = price_box.find('span', {'class': 'unit-area'})
            buy_tag = price_box.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            unit_text = unit_tag.text
            # Only labels priced by weight are handled here.
            if 'ML' in unit_text or '개' in unit_text:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_text, 'G')

            # No early break: if several entries match, the last one wins.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry

            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or img.get('src') is None:
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The src is prefixed as if it were protocol-relative.
                'img': 'http:' + img['src'],
            })

    print('홈플러스 과자')
    print(H)
    print(len(H))



def home_coffee(H):
    """Scrape the Homeplus coffee category page and merge its items into *H*.

    Opens the category listing (cid=60406) in Chrome, clicks the "more" link
    ``page_num - 1`` times with a 10 second pause after each click, then
    parses the resulting HTML. Products whose unit-price label contains 'G'
    are skipped; the rest are priced per 'ML'. For every remaining product
    that has a price block, a unit-price label and a name:

      * if *H* has no entry with the same name and ``mart == 'homeplus'``,
        a new dict with keys ``mart``, ``name``, ``price``, ``unit`` and
        ``img`` is appended (items without a thumbnail image are skipped);
      * otherwise the existing entry's ``price`` and ``unit`` are replaced
        when the newly scraped unit price is lower.

    Args:
        H: List of product dicts, mutated in place.

    Returns:
        None. The final list and its length are printed.
    """
    page_num = 2
    # Raw string: '\c' is not a valid escape sequence in a normal literal.
    path = r'C:\chromedriver'
    driver = webdriver.Chrome(path)
    try:
        driver.get(
            'http://www.homeplus.co.kr/app.exhibition.category.Category.ghs?comm=category.list&cid=60406&WT.ac=GNBctg_3dep_ctg')

        for _ in range(page_num - 1):
            driver.find_element_by_xpath("//a[@class='more']").click()
            time.sleep(10)

        html = driver.page_source
    finally:
        # Always close the browser, even when navigation or a click fails.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    for product_list in soup.find_all('ul', {'class': 'fix-ty2'}):
        for item in product_list.find_all('li'):
            info = item.find('div', {'class': 'exh-comtxt w-ty4'})
            if info is None:
                continue

            price_box = info.find('span', {'class': 'price'})
            if price_box is None:
                continue
            unit_tag = price_box.find('span', {'class': 'unit-area'})
            buy_tag = price_box.find('strong', {'class': 'buy'})
            name_tag = info.find('a', {'class': 'name'})
            if unit_tag is None or buy_tag is None or name_tag is None:
                continue

            unit_text = unit_tag.text
            # Only labels priced by volume are handled here.
            if 'G' in unit_text:
                continue

            name = name_preprocessing(name_tag.text)
            price = buy_tag.text.replace(',', '')
            unit = unit_preprocessing(unit_text, 'ML')

            # No early break: if several entries match, the last one wins.
            existing = None
            for entry in H:
                if entry['name'] == name and entry['mart'] == 'homeplus':
                    existing = entry

            if existing is not None:
                if existing['unit'] > unit:
                    existing['price'] = price
                    existing['unit'] = unit
                continue

            thumb = item.find('a', {'class': 'thumb'})
            if thumb is None:
                continue
            img = thumb.find('img')
            if img is None or img.get('src') is None:
                continue
            H.append({
                'mart': 'homeplus',
                'name': name,
                'price': price,
                'unit': unit,
                # The src is prefixed as if it were protocol-relative.
                'img': 'http:' + img['src'],
            })

    print('홈플러스 커피')
    print(H)
    print(len(H))


# Accumulators passed to the scrapers. They must be lists: each scraper
# appends product dicts and iterates the entries by position, which a dict
# does not support.
H_water = []
H_snack = []
H_coffee = []
#home_water(H_water)
#home_snack(H_snack)
#home_coffee(H_coffee)