JJuOn

Implement python code of crawling problem related data

1 +jaksimsamil
2 +.vscode
3 +*.csv
1 +# Jaksimsamil Crawler Documentation
2 +
3 +## Overview
4 +- https://acmicpc.net와 https://solved.ac에서 사용자와 문제정보를 크롤링합니다.
5 +- Python 3.8.3, Pip 20.2.1 환경에서 개발되었습니다.
6 +
7 +## Usage
8 +
9 +- Install
10 +```bash
11 +pip install -r requirements.txt
12 +```
13 +
14 +- Run
15 +```bash
16 +python main.py
17 +```
1 +import requests
2 +from bs4 import BeautifulSoup
3 +import pandas as pd
4 +from dotenv import load_dotenv
5 +import sys
6 +import pymongo
7 +import os
8 +from datetime import datetime
9 +import json
10 +import numpy as np
11 +
# The crawl loops write a checkpoint whenever their item index is a multiple
# of this value.
SAVE_EVERY = 10
# CSV file that receives both the periodic checkpoints and the final result.
SAVE_PATH = 'problems.csv'
def setup():
    """Load the server's .env file and build a MongoDB client.

    ``MONGO_URL`` is read from ``../jaksimsamil-server/.env`` (or from the
    already-populated process environment). The last ``/``-separated segment
    of that URL is dropped and the remainder is handed to
    ``pymongo.MongoClient``.

    Returns:
        pymongo.MongoClient: client created from the trimmed URL.

    Exits the process with status 1 (after printing to stderr) when the .env
    file cannot be read or ``MONGO_URL`` is not defined.
    """
    try:
        load_dotenv(dotenv_path='../jaksimsamil-server/.env')
    except FileNotFoundError:
        print('.env is not found', file=sys.stderr)
        sys.exit(1)

    mongo_url = os.getenv('MONGO_URL')
    if not mongo_url:
        # load_dotenv() may return quietly when the file is missing, so an
        # unset variable is the dependable signal that configuration failed.
        # Without this guard the .split() below would die with an
        # AttributeError on None.
        print('MONGO_URL is not set (.env is not found?)', file=sys.stderr)
        sys.exit(1)

    # Drop the trailing path segment of the URL and keep the rest.
    client = pymongo.MongoClient('/'.join(mongo_url.split('/')[:-1]))
    # NOTE(review): MongoClient is documented to connect lazily, so this
    # message does not prove the server is reachable -- confirm if a real
    # connectivity check is wanted here.
    print('MongoDB Connected')
    return client
23 +
def save(df, path='problems.csv'):
    """Write *df* to *path* as CSV and report progress on stdout.

    Args:
        df: object exposing ``to_csv(path)`` (a pandas DataFrame in this file).
        path: destination file; defaults to ``'problems.csv'``.
    """
    # flush=True so the "Saving..." half of the line is visible while the
    # write is in progress instead of appearing together with "Done.".
    print('Saving to {}...'.format(path), end='', flush=True)
    df.to_csv(path)
    print('Done.')
28 +
def _solved_counts_to_frame(solved_counts):
    """Build the problems DataFrame (all expected columns) from a count dict."""
    columns = ['problemNum', 'problemTitle', 'solvedacLevel', 'submitNum',
               'correctNum', 'category', 'count']
    # Object dtype keeps the not-yet-filled columns as NaN placeholders that
    # later steps can overwrite with text or numbers.
    frame = pd.DataFrame(index=range(len(solved_counts)), columns=columns,
                         dtype=object)
    frame['problemNum'] = list(solved_counts.keys())
    frame['count'] = list(solved_counts.values())
    return frame


def get_khu_problem_list(school_id=211, timeout=10):
    """Count, per problem, how many ranked users of a school list it as solved.

    Walks the paginated acmicpc.net school ranklist, opens every linked user
    page and tallies each ``span.problem_number`` found in the selected panel.
    A checkpoint is written to ``SAVE_PATH`` whenever the running user index
    is a multiple of ``SAVE_EVERY``, and once more at the end.

    Args:
        school_id: ranklist id placed in the URL (default 211, the value that
            used to be hard-coded).
        timeout: per-request timeout in seconds.

    Returns:
        pandas.DataFrame with columns problemNum, problemTitle,
        solvedacLevel, submitNum, correctNum, category and count; only
        problemNum (page text) and count are filled, rows are in first-seen
        order.

    Raises:
        requests.HTTPError: a ranklist page answered with an error status
            other than 404.
    """
    # Plain dict used as an ordered counter: O(1) per update instead of
    # scanning and re-allocating a DataFrame for every solved problem.
    solved_counts = {}
    page_num = 1
    user_idx = 0
    while True:
        res = requests.get(
            'https://www.acmicpc.net/school/ranklist/{}/{}'.format(school_id, page_num),
            timeout=timeout)
        if res.status_code == 404:
            # Past the last ranklist page.
            break
        # Any other error page would parse to "no users" and, without this
        # check, make the loop request page after page forever.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        userlinks = soup.select('#ranklist > tbody > tr > td:nth-child(2) > a')
        if not userlinks:
            # A 200 response without any user rows also means we are done.
            break
        for userlink in userlinks:
            href = userlink['href']
            res = requests.get('https://acmicpc.net' + href, timeout=timeout)
            print('Collecting user data...:', href.split('/')[-1])
            user_soup = BeautifulSoup(res.text, 'html.parser')
            solved_spans = user_soup.select('body > div.wrapper > div.container.content > div.row > div:nth-child(2) > div:nth-child(3) > div.col-md-9 > div:nth-child(1) > div.panel-body > span.problem_number')
            for span in solved_spans:
                solved_counts[span.text] = solved_counts.get(span.text, 0) + 1
            if user_idx % SAVE_EVERY == 0:
                save(_solved_counts_to_frame(solved_counts), SAVE_PATH)
            user_idx += 1
        page_num += 1
    problems = _solved_counts_to_frame(solved_counts)
    save(problems, SAVE_PATH)
    return problems
57 +
def get_problem_info(problems, timeout=10):
    """Fill problemTitle, submitNum and correctNum from each problem page.

    For every value in ``problems['problemNum']`` the acmicpc.net problem page
    is fetched; the title comes from ``#problem_title`` and the two counters
    from the 3rd and 5th ``<td>`` of the ``#problem-info`` table. Pages that
    do not have the expected elements are reported on stderr and skipped, so
    one bad page no longer aborts the whole crawl. A checkpoint is written to
    ``SAVE_PATH`` every ``SAVE_EVERY`` problems and once at the end.

    Args:
        problems: DataFrame with a ``problemNum`` column; modified in place.
        timeout: per-request timeout in seconds.

    Returns:
        The same DataFrame, for chaining.
    """
    for idx, problemNum in enumerate(problems['problemNum'].values):
        res = requests.get('https://acmicpc.net/problem/{}'.format(problemNum),
                           timeout=timeout)
        print('Collecting problem data...:', problemNum)
        page = BeautifulSoup(res.text, 'html.parser')
        title_nodes = page.select('#problem_title')
        info_cells = page.select('#problem-info > tbody > tr > td')
        if title_nodes and len(info_cells) > 4:
            row_mask = problems.problemNum == problemNum
            problems.loc[row_mask, 'problemTitle'] = title_nodes[0].text
            problems.loc[row_mask, 'submitNum'] = info_cells[2].text
            # NOTE(review): cell 4 is stored as correctNum -- confirm this is
            # the intended column of the info table.
            problems.loc[row_mask, 'correctNum'] = info_cells[4].text
        else:
            print('Skipping problem {}: unexpected page (HTTP {})'.format(
                problemNum, res.status_code), file=sys.stderr)
        if idx % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)
    save(problems, SAVE_PATH)
    return problems
74 +
def get_solvedac_level(problems, timeout=10):
    """Fill solvedacLevel for every problem from the solved.ac search API.

    Each problem number is used as the search query; the first returned entry
    whose ``id`` equals the problem number supplies the ``level``. Responses
    that are not valid JSON or lack ``result.problems`` are reported on
    stderr and skipped. A checkpoint is written to ``SAVE_PATH`` every
    ``SAVE_EVERY`` problems and once at the end.

    Args:
        problems: DataFrame with a ``problemNum`` column; modified in place.
        timeout: per-request timeout in seconds.

    Returns:
        The same DataFrame, for chaining.
    """
    for idx, problemNum in enumerate(problems['problemNum'].values):
        res = requests.get('https://api.solved.ac/v2/search/problems.json?query={}&page=1&sort=id&sort_direction=ascending'.format(problemNum),
                           timeout=timeout)
        print('Collecting solved.ac level data...:', problemNum)
        try:
            candidates = json.loads(res.text)['result']['problems']
        except (ValueError, KeyError, TypeError):
            print('Skipping problem {}: unexpected solved.ac response (HTTP {})'.format(
                problemNum, res.status_code), file=sys.stderr)
            candidates = []
        for problem in candidates:
            # The textual comparison lets a frame whose problemNum is still
            # text (not yet reloaded from CSV) match the integer ids.
            if problem['id'] == problemNum or str(problem['id']) == str(problemNum):
                problems.loc[problems.problemNum == problemNum, 'solvedacLevel'] = problem['level']
                break
        if idx % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)
    save(problems, SAVE_PATH)
    return problems
88 +
def _fetch_all_pages(url_prefix, url_suffix, items_key, timeout):
    """Fetch every page of a paginated solved.ac listing and join ``result[items_key]``."""
    def fetch(page):
        res = requests.get(url_prefix + str(page) + url_suffix, timeout=timeout)
        return json.loads(res.text)['result']

    first = fetch(1)
    items = list(first[items_key])
    for page in range(2, first['total_page'] + 1):
        items.extend(fetch(page)[items_key])
    return items


def _problem_key(value):
    """Normalise a problem number to int when possible so text and ints compare equal."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return value


def get_category(problems, timeout=10):
    """Attach solved.ac tag names (``full_name_ko``) to every problem.

    The frame is sorted by ``problemNum`` in place and empty categories are
    initialised to ``'[]'``. All tags are downloaded, then for each tag the
    full list of solvable problems carrying it is downloaded and the tag's
    Korean name is appended to the JSON list stored in ``category`` of every
    matching row. The result is written to ``SAVE_PATH`` once at the end.

    Args:
        problems: DataFrame with ``problemNum`` and ``category`` columns
            (``category`` holds JSON-encoded lists or NaN); modified in place.
        timeout: per-request timeout in seconds.

    Returns:
        The same DataFrame, for chaining.
    """
    problems.sort_values(['problemNum'], inplace=True, ignore_index=True)
    problems['category'] = problems['category'].fillna(json.dumps([]))

    tags = _fetch_all_pages('https://api.solved.ac/v2/tags/stats.json?page=',
                            '', 'tags', timeout)
    print('total tags:', len(tags))

    # One entry per distinct problem: key -> (number as stored, tag names).
    # Working on plain lists avoids a DataFrame scan per match; the column is
    # written back once after all tags are processed.
    row_keys = [_problem_key(num) for num in problems['problemNum'].values]
    categories = {}
    for key, num, raw in zip(row_keys, problems['problemNum'].values,
                             problems['category'].values):
        if key not in categories:
            categories[key] = (num, json.loads(raw))

    for tag in tags:
        tagged = _fetch_all_pages(
            'https://api.solved.ac/v2/search/problems.json?query=solvable:true+tag:{}&page='.format(tag['tag_name']),
            '&sort=id&sort_direction=ascending', 'problems', timeout)
        # Set membership instead of a lock-step walk over two sorted lists:
        # the walk stalled on the first tagged problem that is absent from
        # `problems`, so every later problem silently missed this tag.
        tagged_keys = {_problem_key(entry['id']) for entry in tagged}
        for key, (problemNum, names) in categories.items():
            if key in tagged_keys:
                names.append(tag['full_name_ko'])
                print('Problem {} in category {}'.format(problemNum, tag['full_name_ko']))

    problems['category'] = [json.dumps(categories[key][1], ensure_ascii=False)
                            for key in row_keys]
    save(problems, SAVE_PATH)
    return problems
125 +
126 +
def update_database(problems, client):
    """Upsert every problem row into ``client['jaksimsamil']['problem']``.

    Each row becomes one document keyed by ``problemNum``; the JSON text in
    ``category`` is decoded to a list first. A category that is not text
    (e.g. NaN because it was never filled) is stored as an empty list instead
    of raising ``TypeError`` inside ``json.loads``.

    Args:
        problems: DataFrame with at least ``problemNum`` and ``category``.
        client: object where ``client['jaksimsamil']['problem']`` exposes
            ``update_one(filter, update, upsert=...)``.
    """
    collection = client['jaksimsamil']['problem']
    records = problems.to_dict('records')
    print('len of records:', len(records))
    for record in records:
        raw_category = record['category']
        if isinstance(raw_category, (str, bytes, bytearray)):
            record['category'] = json.loads(raw_category)
        elif not isinstance(raw_category, list):
            record['category'] = []
        collection.update_one({'problemNum': record['problemNum']},
                              {'$set': record}, upsert=True)
135 +
136 +
if __name__ == "__main__":
    # Full pipeline: connect, crawl the solved-problem list, enrich it with
    # page data, solved.ac levels and categories, then upsert into MongoDB.
    startTime = datetime.now()
    client = setup()
    problems = get_khu_problem_list()
    problems = get_problem_info(problems)
    # Reload the checkpoint that was just written. The scraped numbers are
    # page text; presumably this CSV round trip is what turns problemNum into
    # integers, which the id comparison in get_solvedac_level relies on.
    problems = pd.read_csv(SAVE_PATH, index_col=0)
    problems = get_solvedac_level(problems)
    problems = get_category(problems)
    update_database(problems, client)
    # Dividing the timedelta itself by 60 printed another timedelta labelled
    # "mins"; convert to seconds first to get a real number of minutes.
    elapsedMinutes = (datetime.now() - startTime).total_seconds() / 60
    print('Time elapsed :', elapsedMinutes, 'mins')
147 +
1 +beautifulsoup4==4.9.1
2 +bs4==0.0.1
3 +certifi==2020.6.20
4 +chardet==3.0.4
5 +idna==2.10
6 +numpy==1.19.1
7 +pandas==1.1.0
8 +pymongo==3.11.0
9 +python-dateutil==2.8.1
10 +python-dotenv==0.14.0
11 +pytz==2020.1
12 +requests==2.24.0
13 +six==1.15.0
14 +soupsieve==2.0.1
15 +urllib3==1.25.10