양선아

Python file for data crawling

@@ -7,7 +7,7 @@ import requests
 import os
 import datetime

-url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20141119000000000012_1/'
+url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160624000000000348_1/'

 def makecsvfile(day):

@@ -15,7 +15,7 @@ def makecsvfile(day):
     output_file = str(day)+'data.csv'

     # create the dataframe
-    df = pd.DataFrame(columns=['row_num','aucng_de', 'pblmng_whsal_mrkt_nm','pblmng_whsal_mrkt_cd', 'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm','spcies_cd','grad','grad_cd','delngbundle_qy','stndrd','stndrd_cd','delng_qy','mumm_amt','avrg_amt','mxmm_amt','auc_co'])
+    df = pd.DataFrame(columns=['row_num','aucng_de', 'pblmng_whsal_mrkt_nm','pblmng_whsal_mrkt_cd', 'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm','spcies_cd','delngbundle_qy','stndrd','stndrd_cd','grad','grad_cd','sanji_cd','sanji_nm','mumm_amt','avrg_amt','mxmm_amt','delng_qy','cnts'])

     # basic number setting
     i = 0 # row count per date
@@ -46,7 +46,7 @@ def makecsvfile(day):
         # if result is fine
         items = soup.find_all('row')
         for item in items:
-            df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string, item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.grad.string, item.grad_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.delng_qy.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.auc_co.string]
+            df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string, item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.grad.string, item.grad_cd.string, item.sanji_cd.string, item.sanji_nm.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.delng_qy.string, item.cnts.string]
             i += 1

         # next 1000 rows
...
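Note: the column list and the df.loc[i] row have to be kept in the same order by hand every time the Grid changes. A minimal alternative sketch (not from the original; it assumes, as in this file, that every column name matches an XML tag name):

    # build each row from the column list itself so the two can never drift apart
    df.loc[i] = [item.find(c).string for c in df.columns]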
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import pandas as pd
+import requests
+import os
+import datetime
+
+url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160722000000000352_1/'
+
+# a function you can reuse for any target site by changing only the marked fields
+def makecsvfile(day):
+
+    # file name setting
+    output_file = str(day)+'data.csv'
+
+    # create the dataframe
+    ########################## data column change required ################################
+    df = pd.DataFrame(columns=['ROW_NUM','EXAMIN_DE','EXAMIN_SE_NM','EXAMIN_SE_CODE','EXAMIN_AREA_NAME','EXAMIN_AREA_CODE','EXAMIN_MRKT_NM','EXAMIN_MRKT_CODE','STD_MRKT_NM','STD_MRKT_CODE','EXAMIN_PRDLST_NM','EXAMIN_PRDLST_CODE','EXAMIN_SPCIES_NM','EXAMIN_SPCIES_CODE','STD_LCLAS_NM','STD_LCLAS_CO','STD_PRDLST_NM','STD_PRDLST_CODE','STD_SPCIES_NM','STD_SPCIES_CODE','EXAMIN_UNIT_NM','EXAMIN_UNIT','STD_UNIT_NM','STD_UNIT_CODE','EXAMIN_GRAD_NM','EXAMIN_GRAD_CODE','STD_GRAD_NM','STD_GRAD_CODE','TODAY_PRIC','BFRT_PRIC','IMP_TRADE','TRADE_AMT'])
+
+    # basic number setting
+    i = 0 # row count per date
+    number = 0
+
+    while(True):
+
+        # build url & fetch data
+        myurl = url + str(number*1000+1) + '/'+str((number+1)*1000) + '?AUCNG_DE='+str(day)
+        data = urlopen(myurl).read()
+        soup = BeautifulSoup(data, 'html.parser')
+
+        # data error check
+        result_code = soup.find('result')
+        result_code = result_code.code.string
+        if result_code != 'INFO-000':
+            print(result_code)
+            break
+
+        # data number check
+        start_num = int(str(soup.find('startrow'))[10:-11])
+        total_num = int(str(soup.find('totalcnt'))[10:-11])
+        print(str(soup.find('startrow'))[10:-11])
+        if total_num < start_num :
+            print('find all')
+            break
+
+        # if result is fine
+        items = soup.find_all('row')
+        for item in items:
+            ########################################## data column change ##########################################
+            # html.parser lowercases tag names, so the fields are read with lowercase attributes
+            df.loc[i] = [item.row_num.string,item.examin_de.string,item.examin_se_nm.string,item.examin_se_code.string,item.examin_area_name.string,item.examin_area_code.string,item.examin_mrkt_nm.string,item.examin_mrkt_code.string,item.std_mrkt_nm.string,item.std_mrkt_code.string,item.examin_prdlst_nm.string,item.examin_prdlst_code.string,item.examin_spcies_nm.string,item.examin_spcies_code.string,item.std_lclas_nm.string,item.std_lclas_co.string,item.std_prdlst_nm.string,item.std_prdlst_code.string,item.std_spcies_nm.string,item.std_spcies_code.string,item.examin_unit_nm.string,item.examin_unit.string,item.std_unit_nm.string,item.std_unit_code.string,item.examin_grad_nm.string,item.examin_grad_code.string,item.std_grad_nm.string,item.std_grad_code.string,item.today_pric.string,item.bfrt_pric.string,item.imp_trade.string,item.trade_amt.string]
+            i += 1
+
+        # next 1000 rows
+        number += 1
+
+    # print to check the result
+    print(str(day), ' : ', str(i))
+    # export to csv file
+    ############################################# change saved file directory ####################################
+    df.to_csv(os.path.join('./jointmarketdata', output_file), encoding='euc-kr', index=False)
+
+def checkdatanum(day):
+    myurl = url +'1/1?AUCNG_DE='+str(day)
+
+    req = requests.get(myurl)
+    html = req.text
+    soup = BeautifulSoup(html, 'html.parser')
+    product_num = soup.find('totalcnt')
+    product_num = int(str(product_num)[10:-11])
+    print(day,':',product_num)
+    return product_num
+
+i = 0
+##################################### change the start & end dates ############################################
+last_day = datetime.date(2020,4,30)
+first_day = datetime.date(2020,4,15) - datetime.timedelta(days=1)
+
+while(first_day < last_day):
+    first_day += datetime.timedelta(days=1)
+    makecsvfile(first_day.strftime('%Y%m%d'))
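Note: checkdatanum() above is defined but never called. A minimal sketch of how it could gate the crawl, skipping days with no rows (hypothetical usage, not part of the committed script):

    while(first_day < last_day):
        first_day += datetime.timedelta(days=1)
        day = first_day.strftime('%Y%m%d')
        if checkdatanum(day) > 0:  # only crawl dates that report at least one row
            makecsvfile(day)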
@@ -9,12 +9,14 @@ import datetime

 url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160624000000000349_1/'

+# a function you can reuse for any target site by changing only the marked fields
 def makecsvfile(day):

     # file name setting
     output_file = str(day)+'data.csv'

     # create the dataframe
+    ########################## data column change required ################################
     df = pd.DataFrame(columns=['row_num','aucng_de','cpr_nm','cpr_cd','cpr_type_nm','cpr_type_cd','prdlst_nm','prdlst_cd','spcies_nm','spcies_cd','delngbundle_qy','stndrd','stndrd_cd','grad','grad_cd','sanco','sannm','mumm_amt','avrg_amt','mxmm_amt','delng_qy','auc_co'])

     # basic number setting
@@ -46,6 +48,7 @@ def makecsvfile(day):
         # if result is fine
         items = soup.find_all('row')
         for item in items:
+            ########################################## data column change ##########################################
             df.loc[i] = [item.row_num.string, item.aucng_de.string, item.cpr_nm.string, item.cpr_cd.string, item.cpr_type_nm.string, item.cpr_type_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.grad.string, item.grad_cd.string, item.sanco.string, item.sannm.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.delng_qy.string, item.auc_co.string]
             i += 1

@@ -55,6 +58,7 @@ def makecsvfile(day):
     # print to check the result
     print(str(day), ' : ', str(i))
     # export to csv file
+    ############################################# change saved file directory ####################################
     df.to_csv(os.path.join('./jointmarketdata', output_file), encoding='euc-kr', index=False)

 def checkdatanum(day):
@@ -69,8 +73,9 @@ def checkdatanum(day):
     return product_num

 i = 0
-last_day = datetime.date(2020,5,5)
-first_day = datetime.date(2020,5,1) - datetime.timedelta(days=1)
+##################################### change the start & end dates ############################################
+last_day = datetime.date(2020,4,30)
+first_day = datetime.date(2020,4,15) - datetime.timedelta(days=1)

 while(first_day < last_day):
     first_day += datetime.timedelta(days=1)
...
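Note: the int(str(soup.find('totalcnt'))[10:-11]) pattern used throughout slices off the literal '<totalcnt>' and '</totalcnt>' wrappers by character count, so it breaks silently if a tag name changes. An equivalent, less brittle read (a sketch, assuming the tag always wraps a plain integer):

    start_num = int(soup.find('startrow').string)
    total_num = int(soup.find('totalcnt').string)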