송용우
Committed by GitHub

Merge pull request #11 from FacerAin/feature/crawling

Implement python code of crawling problem related data (comparison error fixed) 
1 +jaksimsamil
2 +.vscode/
3 +*.csv
1 +# Jaksimsamil Crawler Documentation
2 +
3 +## Overview
4 +- https://acmicpc.net와 https://solved.ac에서 사용자와 문제정보를 크롤링합니다.
5 +- Python 3.8.3, Pip 20.2.1 환경에서 개발되었습니다.
6 +
7 +## Usage
8 +
9 +- Install
10 +```bash
11 +pip install -r requirements.txt
12 +```
13 +
14 +- Run
15 +```bash
16 +python main.py
17 +```
1 +import requests
2 +from bs4 import BeautifulSoup
3 +import pandas as pd
4 +from dotenv import load_dotenv
5 +import sys
6 +import pymongo
7 +import os
8 +from datetime import datetime
9 +import json
10 +import numpy as np
11 +
# Number of loop iterations between intermediate CSV checkpoints.
SAVE_EVERY = 10
# CSV file that receives the checkpoints and the final results.
SAVE_PATH = 'problems.csv'
def setup():
    """Load the server's .env file and create a MongoDB client.

    The connection string is read from the ``MONGO_URL`` environment
    variable and its last ``/``-separated segment is dropped before it is
    handed to ``pymongo.MongoClient`` (presumably that segment is the
    database name -- confirm against the server's .env).

    Returns:
        The ``pymongo.MongoClient`` built from the trimmed URL.

    Exits the process with status 1 when the .env file cannot be read or
    ``MONGO_URL`` is not defined.
    """
    try:
        load_dotenv(dotenv_path='../jaksimsamil-server/.env')
    except FileNotFoundError:
        print('.env is not found', file=sys.stderr)
        sys.exit(1)

    mongo_url = os.getenv('MONGO_URL')
    if not mongo_url:
        # load_dotenv() does not raise for a missing file, so an absent
        # .env is detected here as an unset variable instead of crashing
        # with AttributeError on None.split().
        print('MONGO_URL is not set (is ../jaksimsamil-server/.env present?)',
              file=sys.stderr)
        sys.exit(1)

    client = pymongo.MongoClient('/'.join(mongo_url.split('/')[:-1]))
    print('MongoDB Connected')
    return client
23 +
def save(df, path='problems.csv'):
    """Write ``df`` to ``path`` as CSV, printing progress to stdout.

    Args:
        df: DataFrame to persist; its index is written as the first column.
        path: Destination CSV file.
    """
    progress_message = 'Saving to {}...'.format(path)
    # No newline yet so that 'Done.' finishes the same output line.
    print(progress_message, end='')
    df.to_csv(path)
    print('Done.')
28 +
def load(path='problems.csv'):
    """Read a problems CSV, using its first column as the index.

    Args:
        path: CSV file to read.

    Returns:
        The DataFrame parsed from ``path``.
    """
    return pd.read_csv(path, index_col=0)
32 +
def get_khu_problem_list():
    """Collect the problems solved by every user on the school rank list.

    Walks the paginated rank list at acmicpc.net/school/ranklist/211,
    opens each listed user's profile and counts how many users have each
    problem number in the selected profile panel. A checkpoint CSV is
    written every SAVE_EVERY users and once more at the end.

    Returns:
        A DataFrame with the columns problemNum, problemTitle,
        solvedacLevel, submitNum, correctNum, category and count; only
        problemNum (as scraped text) and count are filled in here.

    Raises:
        requests.HTTPError: if a rank-list page answers with an error
            status other than 404.
    """
    columns = ['problemNum', 'problemTitle', 'solvedacLevel', 'submitNum',
               'correctNum', 'category', 'count']
    # problem number -> number of users; dicts keep first-seen order, so
    # the resulting rows appear in the order problems were encountered.
    solve_counts = {}

    def build_frame():
        # Columns without data are created empty (NaN) by pandas.
        return pd.DataFrame(
            {'problemNum': list(solve_counts),
             'count': list(solve_counts.values())},
            columns=columns,
        )

    page_num = 1
    user_idx = 0
    while True:
        res = requests.get(
            'https://www.acmicpc.net/school/ranklist/211/{}'.format(page_num))
        if res.status_code == 404:
            break
        # Any other error status would otherwise parse as an empty page
        # and keep the loop requesting further pages forever.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        userlinks = soup.select('#ranklist > tbody > tr > td:nth-child(2) > a')
        if not userlinks:
            # A page without users also marks the end of the list.
            break
        for userlink in userlinks:
            href = userlink['href']
            user_res = requests.get('https://acmicpc.net' + href)
            print('Collecting user data...:', href.split('/')[-1])
            user_soup = BeautifulSoup(user_res.text, 'html.parser')
            problem_spans = user_soup.select('body > div.wrapper > div.container.content > div.row > div:nth-child(2) > div:nth-child(3) > div.col-md-9 > div:nth-child(1) > div.panel-body > span.problem_number')
            for span in problem_spans:
                solve_counts[span.text] = solve_counts.get(span.text, 0) + 1
            if user_idx % SAVE_EVERY == 0:
                save(build_frame(), SAVE_PATH)
            user_idx += 1
        page_num += 1

    problems = build_frame()
    save(problems, SAVE_PATH)
    return problems
61 +
def get_problem_info(problems):
    """Fill in title, submission count and correct count for each problem.

    Fetches acmicpc.net/problem/<problemNum> for every row, reads the
    title from ``#problem_title`` and the third and fifth cells of the
    ``#problem-info`` table, and stores them in the problemTitle,
    submitNum and correctNum columns. A checkpoint CSV is written every
    SAVE_EVERY problems and once more at the end.

    Args:
        problems: DataFrame with a problemNum column; modified in place.

    Returns:
        The same ``problems`` DataFrame.
    """
    problem_numbers = problems['problemNum'].values
    for position, number in enumerate(problem_numbers):
        response = requests.get('https://acmicpc.net/problem/{}'.format(number))
        print('Collecting problem data...:', number)
        page = BeautifulSoup(response.text, 'html.parser')
        title = page.select('#problem_title')[0].text
        info_cells = page.select('#problem-info > tbody > tr > td')
        submit_count = info_cells[2].text
        correct_count = info_cells[4].text

        # One mask selects the row(s) for this problem in all three writes.
        row_mask = problems.problemNum == number
        problems.loc[row_mask, 'problemTitle'] = title
        problems.loc[row_mask, 'submitNum'] = submit_count
        problems.loc[row_mask, 'correctNum'] = correct_count

        if position % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)

    save(problems, SAVE_PATH)
    return problems
78 +
def get_solvedac_level(problems):
    """Fill in the solved.ac level for each problem.

    Queries the solved.ac v2 problem search with the problem number and
    copies the ``level`` of the first result whose ``id`` equals that
    number (compared as integers) into the solvedacLevel column. Rows
    without a matching result are left untouched. A checkpoint CSV is
    written every SAVE_EVERY problems and once more at the end.

    Args:
        problems: DataFrame with a problemNum column; modified in place.

    Returns:
        The same ``problems`` DataFrame.
    """
    for position, number in enumerate(problems['problemNum'].values):
        response = requests.get('https://api.solved.ac/v2/search/problems.json?query={}&page=1&sort=id&sort_direction=ascending'.format(number))
        print('Collecting solved.ac level data...:', number)
        payload = json.loads(response.text)
        candidates = payload['result']['problems']
        # The search may return several problems; keep only the exact id.
        matched = next(
            (item for item in candidates if int(item['id']) == int(number)),
            None,
        )
        if matched is not None:
            problems.loc[problems.problemNum == number, 'solvedacLevel'] = matched['level']
        if position % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)

    save(problems, SAVE_PATH)
    return problems
92 +
def _fetch_all_pages(url_template, result_key, *url_args):
    """Download every page of a paginated solved.ac v2 endpoint.

    ``url_template`` is filled with ``url_args`` followed by the page
    number. The page count is taken from ``result.total_page`` of the
    first page; the lists under ``result[result_key]`` are concatenated.
    """
    def fetch(page_num):
        res = requests.get(url_template.format(*url_args, page_num))
        return json.loads(res.text)['result']

    first_page = fetch(1)
    items = list(first_page[result_key])
    for page_num in range(2, first_page['total_page'] + 1):
        items.extend(fetch(page_num)[result_key])
    return items


def get_category(problems):
    """Attach the solved.ac tag names (Korean) to every problem.

    Downloads the full tag list, then for each tag downloads all solvable
    problems carrying it and appends the tag's ``full_name_ko`` to the
    JSON-encoded list in the category column of every matching row.

    Args:
        problems: DataFrame with problemNum and category columns. It is
            sorted by problemNum and re-indexed in place, and missing
            categories are initialised to the JSON text ``[]``.

    Returns:
        The same ``problems`` DataFrame, also saved to SAVE_PATH.
    """
    problems.sort_values(['problemNum'], inplace=True, ignore_index=True)
    problems['category'] = problems['category'].fillna(json.dumps([]))

    tags = _fetch_all_pages(
        'https://api.solved.ac/v2/tags/stats.json?page={}', 'tags')
    print('total tags:', len(tags))

    # problemNum may be text (fresh crawl) or integers (loaded from CSV);
    # normalise once so the comparison with API ids is purely numeric.
    raw_numbers = list(problems['problemNum'].values)
    numeric_numbers = [int(number) for number in raw_numbers]

    for tag in tags:
        tagged_problems = _fetch_all_pages(
            'https://api.solved.ac/v2/search/problems.json?query=solvable:true+tag:{}&page={}&sort=id&sort_direction=ascending',
            'problems',
            tag['tag_name'],
        )
        # Set membership instead of a lock-step walk over two sorted lists:
        # the tagged list contains problems that are absent from `problems`,
        # which would stall a pointer that only advances on a match.
        tagged_ids = {int(problem['id']) for problem in tagged_problems}

        # ignore_index=True above guarantees the labels 0..n-1, so the
        # enumerate position is a valid label for .at[].
        for row_idx, problem_id in enumerate(numeric_numbers):
            if problem_id not in tagged_ids:
                continue
            category = json.loads(problems.at[row_idx, 'category'])
            category.append(tag['full_name_ko'])
            problems.at[row_idx, 'category'] = json.dumps(category, ensure_ascii=False)
            print('Problem {} in category {}'.format(raw_numbers[row_idx], tag['full_name_ko']))

    save(problems, SAVE_PATH)
    return problems
129 +
130 +
def update_database(problems, client):
    """Upsert every problem row into the ``jaksimsamil.problem`` collection.

    Each row becomes one document keyed by problemNum; the JSON-encoded
    category text is decoded before the document is written.

    Args:
        problems: DataFrame whose category column holds JSON text.
        client: Object indexable by database name and then collection
            name (the script passes the client returned by ``setup``).
    """
    collection = client['jaksimsamil']['problem']
    records = problems.to_dict('records')
    print('len of records:', len(records))
    for record in records:
        record['category'] = json.loads(record['category'])
        query = {'problemNum': record['problemNum']}
        # upsert=True inserts the document when no match exists yet.
        collection.update_one(query, {'$set': record}, upsert=True)
139 +
140 +
if __name__ == "__main__":
    # Run the whole pipeline: crawl users, enrich each problem, then push
    # the result to MongoDB.
    startTime = datetime.now()
    client = setup()
    problems = get_khu_problem_list()
    problems = get_problem_info(problems)
    problems = get_solvedac_level(problems)
    problems = get_category(problems)
    update_database(problems, client)
    # Dividing a timedelta by 60 yields another timedelta, not minutes;
    # convert through total_seconds() so the printed number matches 'mins'.
    elapsed_minutes = (datetime.now() - startTime).total_seconds() / 60
    print('Time elapsed :', elapsed_minutes, 'mins')
150 +
1 +beautifulsoup4==4.9.1
2 +bs4==0.0.1
3 +certifi==2020.6.20
4 +chardet==3.0.4
5 +idna==2.10
6 +numpy==1.19.1
7 +pandas==1.1.0
8 +pymongo==3.11.0
9 +python-dateutil==2.8.1
10 +python-dotenv==0.14.0
11 +pytz==2020.1
12 +requests==2.24.0
13 +six==1.15.0
14 +soupsieve==2.0.1
15 +urllib3==1.25.10