양선아

data crawling 1

# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import requests
import os

url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20141119000000000012_1/'

number = 0

def makecsvfile(day):

    # file name setting
    output_file = str(day) + 'data.csv'

    # create an empty dataframe with the API's column names
    df = pd.DataFrame(columns=['row_num', 'aucng_de', 'pblmng_whsal_mrkt_nm', 'pblmng_whsal_mrkt_cd',
                               'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm', 'spcies_cd',
                               'grad', 'grad_cd', 'delngbundle_qy', 'stndrd', 'stndrd_cd', 'delng_qy',
                               'mumm_amt', 'avrg_amt', 'mxmm_amt', 'auc_co'])

    # counter setup
    i = 0       # row index for the given day
    number = 0  # current 1000-row page

    while True:

        # build the request URL (rows number*1000+1 .. (number+1)*1000) and fetch the XML
        myurl = url + str(number * 1000 + 1) + '/' + str((number + 1) * 1000) + '?AUCNG_DE=' + str(day)
        data = urlopen(myurl).read()
        soup = BeautifulSoup(data, 'html.parser')

        # data error check: stop on any result code other than INFO-000
        result_code = soup.find('result')
        result_code = result_code.code.string
        if result_code != 'INFO-000':
            print(result_code)
            break

        # data number check: stop once the start row passes the total count
        start_num = int(soup.find('startrow').string)
        total_num = int(soup.find('totalcnt').string)
        print(start_num)
        if total_num < start_num:
            print('find all')
            break

        # if the result is fine, append every <row> element to the dataframe
        items = soup.find_all('row')
        for item in items:
            df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string,
                         item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string,
                         item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string,
                         item.spcies_cd.string, item.grad.string, item.grad_cd.string,
                         item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string,
                         item.delng_qy.string, item.mumm_amt.string, item.avrg_amt.string,
                         item.mxmm_amt.string, item.auc_co.string]
            i += 1

        # next 1000 rows
        number += 1

    # print for a quick check of the result
    print(str(day), ' : ', str(i))
    # export to a csv file
    df.to_csv(os.path.join('./', output_file), encoding='euc-kr', index=False)

def checkdatanum(day):
    # request a single row just to read the day's total row count
    myurl = url + '1/1?AUCNG_DE=' + str(day)

    req = requests.get(myurl)
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    product_num = int(soup.find('totalcnt').string)
    print(day, ':', product_num)
    return product_num


days = [20200413, 20200414, 20200415, 20200416, 20200417, 20200418, 20200412]

for day in days:
    number += checkdatanum(day)

print('week : ', number)
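The loop above only adds up the daily row counts. As a minimal usage sketch (an assumption, not part of the original script), the two functions could be combined so that each day's rows are also written out to a per-day CSV:

total_rows = 0
for day in days:
    day_rows = checkdatanum(day)   # total row count reported by the API for this day
    total_rows += day_rows
    if day_rows > 0:
        makecsvfile(day)           # writes e.g. 20200413data.csv in the current directory

print('week : ', total_rows)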