양선아

Python file for data crawling

@@ -7,7 +7,7 @@ import requests
import os
import datetime
url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20141119000000000012_1/'
url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160624000000000348_1/'
def makecsvfile(day):
@@ -15,7 +15,7 @@ def makecsvfile(day):
output_file = str(day)+'data.csv'
# create dataframe
df = pd.DataFrame(columns=['row_num','aucng_de', 'pblmng_whsal_mrkt_nm','pblmng_whsal_mrkt_cd', 'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm','spcies_cd','grad','grad_cd','delngbundle_qy','stndrd','stndrd_cd','delng_qy','mumm_amt','avrg_amt','mxmm_amt','auc_co'])
df = pd.DataFrame(columns=['row_num','aucng_de', 'pblmng_whsal_mrkt_nm','pblmng_whsal_mrkt_cd', 'cpr_nm', 'cpr_cd', 'prdlst_nm', 'prdlst_cd', 'spcies_nm','spcies_cd','delngbundle_qy','stndrd','stndrd_cd','grad','grad_cd','sanji_cd','sanji_nm','mumm_amt','avrg_amt','mxmm_amt','delng_qy','cnts'])
# basic counter setup
i = 0 # row index for this day
@@ -46,7 +46,7 @@ def makecsvfile(day):
# if result is fine
items = soup.find_all('row')
for item in items:
df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string, item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.grad.string, item.grad_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.delng_qy.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.auc_co.string]
df.loc[i] = [item.row_num.string, item.aucng_de.string, item.pblmng_whsal_mrkt_nm.string, item.pblmng_whsal_mrkt_cd.string, item.cpr_nm.string, item.cpr_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.grad.string, item.grad_cd.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.delng_qy.string, item.cnts.string]
i += 1
# next 1000 rows
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import requests
import os
import datetime
url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160722000000000352_1/'
# function that works for any target site once the fields below are adjusted
def makecsvfile(day):
# file name setting
output_file = str(day)+'data.csv'
# create dataframe
########################## data column change required ################################
df = pd.DataFrame(columns=['ROW_NUM','EXAMIN_DE','EXAMIN_SE_NM','EXAMIN_SE_CODE','EXAMIN_AREA_NAME','EXAMIN_AREA_CODE','EXAMIN_MRKT_NM','EXAMIN_MRKT_CODE','STD_MRKT_NM','STD_MRKT_CODE','EXAMIN_PRDLST_NM','EXAMIN_PRDLST_CODE','EXAMIN_SPCIES_NM','EXAMIN_SPCIES_CODE','STD_LCLAS_NM','STD_LCLAS_CO','STD_PRDLST_NM','STD_PRDLST_CODE','STD_SPCIES_NM','STD_SPCIES_CODE','EXAMIN_UNIT_NM','EXAMIN_UNIT','STD_UNIT_NM','STD_UNIT_CODE','EXAMIN_GRAD_NM','EXAMIN_GRAD_CODE','STD_GRAD_NM','STD_GRAD_CODE','TODAY_PRIC','BFRT_PRIC','IMP_TRADE','TRADE_AMT'])
# basic counter setup
i = 0 # row index for this day
number = 0
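# number indexes the 1000-row pages requested from the API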
while(True):
# build the url & fetch the data
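# each request asks for rows number*1000+1 through (number+1)*1000 for the given auction date (AUCNG_DE)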
myurl = url + str(number*1000+1) + '/'+str((number+1)*1000) + '?AUCNG_DE='+str(day)
data = urlopen(myurl).read()
soup = BeautifulSoup(data, 'html.parser')
# data error check
result_code = soup.find('result')
result_code = result_code.code.string
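# 'INFO-000' is the API's success code; stop paging on any other code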
if result_code != 'INFO-000':
print(result_code)
break
# data number check
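# str(tag) renders as '<startrow>N</startrow>' / '<totalcnt>N</totalcnt>', so slicing [10:-11] keeps only the number N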
start_num = int(str(soup.find('startrow'))[10:-11])
total_num = int(str(soup.find('totalcnt'))[10:-11])
print(str(soup.find('startrow'))[10:-11])
if total_num < start_num :
print('find all')
break
# if result is fine
items = soup.find_all('row')
for item in items:
########################################## data column change ##########################################
# html.parser stores tag names in lowercase, so the XML fields are read via lowercase attribute names
df.loc[i] = [item.row_num.string,item.examin_de.string,item.examin_se_nm.string,item.examin_se_code.string,item.examin_area_name.string,item.examin_area_code.string,item.examin_mrkt_nm.string,item.examin_mrkt_code.string,item.std_mrkt_nm.string,item.std_mrkt_code.string,item.examin_prdlst_nm.string,item.examin_prdlst_code.string,item.examin_spcies_nm.string,item.examin_spcies_code.string,item.std_lclas_nm.string,item.std_lclas_co.string,item.std_prdlst_nm.string,item.std_prdlst_code.string,item.std_spcies_nm.string,item.std_spcies_code.string,item.examin_unit_nm.string,item.examin_unit.string,item.std_unit_nm.string,item.std_unit_code.string,item.examin_grad_nm.string,item.examin_grad_code.string,item.std_grad_nm.string,item.std_grad_code.string,item.today_pric.string,item.bfrt_pric.string,item.imp_trade.string,item.trade_amt.string]
i += 1
# next 1000 rows
number += 1
# print for checking the result
print(str(day), ' : ', str(i))
# export to a csv file
############################################# change saved file directory ####################################
df.to_csv(os.path.join('./jointmarketdata', output_file), encoding='euc-kr', index=False)
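# note: the ./jointmarketdata directory must already exist; to_csv does not create it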
def checkdatanum(day):
myurl = url +'1/1?AUCNG_DE='+str(day)
req = requests.get(myurl)
html = req.text
soup = BeautifulSoup(html, 'html.parser')
product_num = soup.find('totalcnt')
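# same tag-stripping trick as above: drop the '<totalcnt>'/'</totalcnt>' wrappers to get the bare count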
product_num = int(str(product_num)[10:-11])
print(day,':',product_num)
return product_num
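# checkdatanum is not called in the loop below; an illustrative (hypothetical) use is a quick preview of one day's row count, e.g. checkdatanum('20200415')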
i = 0
##################################### change start & end dates ############################################
last_day = datetime.date(2020,4,30)
first_day = datetime.date(2020,4,15) - datetime.timedelta(days=1)
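# one day is subtracted because the loop below advances the date before crawling, so the intended start date itself is included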
while(first_day < last_day):
first_day += datetime.timedelta(days=1)
makecsvfile(first_day.strftime('%Y%m%d'))
@@ -9,12 +9,14 @@ import datetime
url = 'http://211.237.50.150:7080/openapi/5e5e94876b673efe7161d3240516d65476da16210a391a9d6f31224c54a1fdaf/xml/Grid_20160624000000000349_1/'
# function that works for any target site once the fields below are adjusted
def makecsvfile(day):
# file name setting
output_file = str(day)+'data.csv'
# create dataframe
########################## data column change required ################################
df = pd.DataFrame(columns=['row_num','aucng_de','cpr_nm','cpr_cd','cpr_type_nm','cpr_type_cd','prdlst_nm','prdlst_cd','spcies_nm','spcies_cd','delngbundle_qy','stndrd','stndrd_cd','grad','grad_cd','sanco','sannm','mumm_amt','avrg_amt','mxmm_amt','delng_qy','auc_co'])
# basic counter setup
@@ -46,6 +48,7 @@ def makecsvfile(day):
# if result is fine
items = soup.find_all('row')
for item in items:
########################################## data column change ##########################################
df.loc[i] = [item.row_num.string, item.aucng_de.string, item.cpr_nm.string, item.cpr_cd.string, item.cpr_type_nm.string, item.cpr_type_cd.string, item.prdlst_nm.string, item.prdlst_cd.string, item.spcies_nm.string, item.spcies_cd.string, item.delngbundle_qy.string, item.stndrd.string, item.stndrd_cd.string, item.grad.string, item.grad_cd.string, item.sanco.string, item.sannm.string, item.mumm_amt.string, item.avrg_amt.string, item.mxmm_amt.string, item.delng_qy.string, item.auc_co.string]
i += 1
@@ -55,6 +58,7 @@ def makecsvfile(day):
# print for checking the result
print(str(day), ' : ', str(i))
# export to a csv file
############################################# change saved file directory ####################################
df.to_csv(os.path.join('./jointmarketdata', output_file), encoding='euc-kr', index=False)
def checkdatanum(day):
@@ -69,8 +73,9 @@ def checkdatanum(day):
return product_num
i = 0
last_day = datetime.date(2020,5,5)
first_day = datetime.date(2020,5,1) - datetime.timedelta(days=1)
##################################### change start & end dates ############################################
last_day = datetime.date(2020,4,30)
first_day = datetime.date(2020,4,15) - datetime.timedelta(days=1)
while(first_day < last_day):
first_day += datetime.timedelta(days=1)