Showing
4 changed files
with
185 additions
and
0 deletions
jaksimsamil-crawler/.gitignore
0 → 100644
jaksimsamil-crawler/README.md
0 → 100644
1 | +# Jaksimsamil Crawler Documentation | ||
2 | + | ||
3 | +## Overview | ||
4 | +- https://acmicpc.net와 https://solved.ac에서 사용자와 문제정보를 크롤링합니다. | ||
5 | +- Python 3.8.3, Pip 20.2.1 환경에서 개발되었습니다. | ||
6 | + | ||
7 | +## Usage | ||
8 | + | ||
9 | +- Install | ||
10 | +```bash | ||
11 | +pip install -r requirements.txt | ||
12 | +``` | ||
13 | + | ||
14 | +- Run | ||
15 | +```bash | ||
16 | +python main.py | ||
17 | +``` |
jaksimsamil-crawler/main.py
0 → 100644
1 | +import requests | ||
2 | +from bs4 import BeautifulSoup | ||
3 | +import pandas as pd | ||
4 | +from dotenv import load_dotenv | ||
5 | +import sys | ||
6 | +import pymongo | ||
7 | +import os | ||
8 | +from datetime import datetime | ||
9 | +import json | ||
10 | +import numpy as np | ||
11 | + | ||
# The crawl loops write the table to disk whenever their loop index is a
# multiple of this value, so an interrupted run keeps most of its progress.
SAVE_EVERY = 10
# CSV file that receives both the periodic checkpoints and the final result.
SAVE_PATH = 'problems.csv'
def setup():
    """Load the server's ``.env`` file and create the MongoDB client.

    The connection string is read from the ``MONGO_URL`` environment
    variable; its last ``/``-separated segment is dropped before the URL is
    handed to ``pymongo.MongoClient``.

    Returns:
        The ``pymongo.MongoClient`` built from the trimmed URL.

    Exits the process with status 1 (after writing a message to stderr)
    when ``MONGO_URL`` is not available.
    """
    # load_dotenv() does not raise when the file is missing, so the result is
    # validated through the environment variable instead of an except clause.
    load_dotenv(dotenv_path='../jaksimsamil-server/.env')
    mongo_url = os.getenv('MONGO_URL')
    if not mongo_url:
        print('.env is not found or MONGO_URL is not set', file=sys.stderr)
        sys.exit(1)

    # Drop the trailing path segment (presumably the database name -- the
    # database is selected explicitly when the data is written).
    client = pymongo.MongoClient('/'.join(mongo_url.split('/')[:-1]))
    # NOTE(review): no server round-trip happens in this function, so this
    # message only confirms that the client object was created.
    print('MongoDB Connected')
    return client
23 | + | ||
def save(df, path='problems.csv'):
    """Write ``df`` to ``path`` as CSV and report progress on stdout.

    Args:
        df: The pandas DataFrame to persist (its index is written too).
        path: Destination CSV file.
    """
    # Flush explicitly: the message has no trailing newline, so without it
    # the text would only show up together with 'Done.' after the write.
    print('Saving to {}...'.format(path), end='', flush=True)
    df.to_csv(path)
    print('Done.')
28 | + | ||
def load(path='problems.csv'):
    """Read the CSV at ``path`` into a DataFrame.

    The first CSV column is used as the DataFrame index.
    """
    return pd.read_csv(path, index_col=0)
32 | + | ||
def _build_problem_frame(solved_counts):
    """Turn a ``{problemNum: count}`` mapping into the crawler's problem table."""
    columns = ['problemNum', 'problemTitle', 'solvedacLevel', 'submitNum',
               'correctNum', 'category', 'count']
    # A frame created from index + columns only starts out with every cell
    # NaN, so the columns filled by the later crawl steps stay empty.
    frame = pd.DataFrame(index=range(len(solved_counts)), columns=columns)
    frame['problemNum'] = list(solved_counts)
    # Keep plain Python ints (object dtype) for the counts.
    frame['count'] = pd.Series(list(solved_counts.values()), dtype=object)
    return frame


def get_khu_problem_list():
    """Collect the problems listed on the profiles of school 211's users.

    Walks the acmicpc.net rank list of school 211 page by page (presumably
    KHU, going by the function name), opens every linked user profile, and
    counts how many users list each problem number.

    The table is checkpointed to ``SAVE_PATH`` every ``SAVE_EVERY`` users and
    once more at the end.

    Returns:
        A DataFrame with the columns ``problemNum``, ``problemTitle``,
        ``solvedacLevel``, ``submitNum``, ``correctNum``, ``category`` and
        ``count``; only ``problemNum`` (text taken from the page) and
        ``count`` are filled in here.
    """
    # dict keeps first-seen order and gives O(1) counting per problem,
    # replacing the removed DataFrame.append and the per-item list scan.
    solved_counts = {}
    page_num = 1
    user_idx = 0
    while True:
        res = requests.get('https://www.acmicpc.net/school/ranklist/211/{}'.format(page_num))
        if res.status_code == 404:
            break
        soup = BeautifulSoup(res.text, 'html.parser')
        userlinks = soup.select('#ranklist > tbody > tr > td:nth-child(2) > a')
        if not userlinks:
            # A page without user links (error page or past the last page)
            # would otherwise keep this loop running forever.
            break
        for userlink in userlinks:
            href = userlink['href']
            res = requests.get('https://acmicpc.net' + href)
            print('Collecting user data...:', href.split('/')[-1])
            user_soup = BeautifulSoup(res.text, 'html.parser')
            problem_spans = user_soup.select('body > div.wrapper > div.container.content > div.row > div:nth-child(2) > div:nth-child(3) > div.col-md-9 > div:nth-child(1) > div.panel-body > span.problem_number')
            for problem_span in problem_spans:
                number = problem_span.text
                solved_counts[number] = solved_counts.get(number, 0) + 1
            if user_idx % SAVE_EVERY == 0:
                save(_build_problem_frame(solved_counts), SAVE_PATH)
            user_idx += 1
        page_num += 1
    problems = _build_problem_frame(solved_counts)
    save(problems, SAVE_PATH)
    return problems
61 | + | ||
def get_problem_info(problems):
    """Fill in title, submission count and correct count for every problem.

    For each value in ``problems['problemNum']`` the acmicpc.net problem page
    is fetched; the text of ``#problem_title`` and of the 3rd and 5th cells
    of the ``#problem-info`` table are stored in ``problemTitle``,
    ``submitNum`` and ``correctNum``.

    Pages that do not contain those elements are reported on stderr and
    skipped instead of aborting the whole crawl.  The table is checkpointed
    to ``SAVE_PATH`` every ``SAVE_EVERY`` problems and once more at the end.

    Args:
        problems: DataFrame with a ``problemNum`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    for idx, problemNum in enumerate(problems['problemNum'].values):
        res = requests.get('https://acmicpc.net/problem/{}'.format(problemNum))
        print('Collecting problem data...:', problemNum)
        soup = BeautifulSoup(res.text, 'html.parser')
        title_nodes = soup.select('#problem_title')
        info_cells = soup.select('#problem-info > tbody > tr > td')
        if title_nodes and len(info_cells) > 4:
            # Compute the row selector once and reuse it for all three columns.
            row_mask = problems.problemNum == problemNum
            problems.loc[row_mask, 'problemTitle'] = title_nodes[0].text
            problems.loc[row_mask, 'submitNum'] = info_cells[2].text
            problems.loc[row_mask, 'correctNum'] = info_cells[4].text
        else:
            print('Skipping problem {}: unexpected page layout (HTTP {})'.format(
                problemNum, res.status_code), file=sys.stderr)
        if idx % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)
    save(problems, SAVE_PATH)
    return problems
78 | + | ||
def get_solvedac_level(problems):
    """Fill in the solved.ac level of every problem.

    For each value in ``problems['problemNum']`` the solved.ac search API is
    queried with the problem number; the ``level`` of the first returned
    entry whose ``id`` equals that number is stored in ``solvedacLevel``.

    Responses that are not JSON or lack ``result.problems`` are reported on
    stderr and skipped instead of aborting the whole crawl.  The table is
    checkpointed to ``SAVE_PATH`` every ``SAVE_EVERY`` problems and once more
    at the end.

    Args:
        problems: DataFrame with a ``problemNum`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    for idx, problemNum in enumerate(problems['problemNum'].values):
        res = requests.get('https://api.solved.ac/v2/search/problems.json?query={}&page=1&sort=id&sort_direction=ascending'.format(problemNum))
        print('Collecting solved.ac level data...:', problemNum)
        try:
            candidates = json.loads(res.text)['result']['problems']
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError (e.g. an HTML error page).
            print('Skipping problem {}: unexpected solved.ac response (HTTP {})'.format(
                problemNum, res.status_code), file=sys.stderr)
            candidates = []
        # The search is textual, so other problems may be returned as well;
        # only an exact id match is accepted.
        for candidate in candidates:
            if int(candidate['id']) == int(problemNum):
                problems.loc[problems.problemNum == problemNum, 'solvedacLevel'] = candidate['level']
                break
        if idx % SAVE_EVERY == 0:
            save(problems, SAVE_PATH)
    save(problems, SAVE_PATH)
    return problems
92 | + | ||
def _fetch_solvedac_pages(build_url, key):
    """Fetch every page of a paginated solved.ac endpoint and merge ``result[key]``.

    ``build_url`` maps a 1-based page number to the URL of that page; the
    page count is taken from ``result.total_page`` of the first page.
    """
    first_result = json.loads(requests.get(build_url(1)).text)['result']
    items = list(first_result[key])
    for page in range(2, first_result['total_page'] + 1):
        page_result = json.loads(requests.get(build_url(page)).text)['result']
        items.extend(page_result[key])
    return items


def get_category(problems):
    """Attach the solved.ac tag names (Korean) to every problem.

    All tags are downloaded from the solved.ac tag statistics endpoint; for
    each tag the full list of solvable problems carrying it is downloaded and
    the tag's ``full_name_ko`` is appended to the ``category`` cell of every
    row whose ``problemNum`` is in that list.  ``category`` holds a JSON-encoded
    list; missing cells are initialised to ``"[]"``.

    The frame is sorted by ``problemNum`` with a fresh 0..n-1 index, and is
    written to ``SAVE_PATH`` once at the end.

    Args:
        problems: DataFrame with ``problemNum`` and ``category`` columns;
            modified in place.

    Returns:
        The same DataFrame.
    """
    problems.sort_values(['problemNum'], inplace=True, ignore_index=True)
    problems['category'] = problems['category'].fillna(json.dumps([]))

    tags = _fetch_solvedac_pages(
        'https://api.solved.ac/v2/tags/stats.json?page={}'.format, 'tags')
    print('total tags:', len(tags))

    search_url = 'https://api.solved.ac/v2/search/problems.json?query=solvable:true+tag:{}&page={}&sort=id&sort_direction=ascending'
    for tag in tags:
        tag_name = tag['tag_name']
        tagged_problems = _fetch_solvedac_pages(
            lambda page, name=tag_name: search_url.format(name, page), 'problems')
        # Membership in a set does not depend on either list's ordering and
        # is not derailed by tagged problems that were never crawled (a
        # lock-step pointer walk would stall on the first such problem).
        tagged_ids = {int(problem['id']) for problem in tagged_problems}
        # After the sort above the index is 0..n-1, so the enumerate position
        # is also the row label.
        for row, problemNum in enumerate(problems['problemNum'].values):
            if int(problemNum) not in tagged_ids:
                continue
            category = json.loads(problems.at[row, 'category'])
            category.append(tag['full_name_ko'])
            problems.at[row, 'category'] = json.dumps(category, ensure_ascii=False)
            print('Problem {} in category {}'.format(problemNum, tag['full_name_ko']))
    save(problems, SAVE_PATH)
    return problems
129 | + | ||
130 | + | ||
def _parse_category(raw):
    """Return the category list stored in a ``category`` cell.

    Accepts the JSON text written by the crawler, an already-decoded
    list/tuple, or a missing value (anything else, e.g. NaN), which yields
    an empty list.
    """
    if isinstance(raw, str):
        return json.loads(raw)
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return []


def update_database(problems, client):
    """Upsert every row of ``problems`` into ``jaksimsamil.problem``.

    Each row becomes one document, matched on ``problemNum`` and written with
    ``$set`` and ``upsert=True``; the ``category`` cell is decoded from JSON
    text to a list first.

    Args:
        problems: DataFrame with at least ``problemNum`` and ``category``.
        client: Mapping-style MongoDB client (``client[db][collection]``).
    """
    collection = client['jaksimsamil']['problem']
    records = problems.to_dict('records')
    print('len of records:', len(records))
    for record in records:
        # A bare json.loads would raise TypeError on a missing (NaN) cell.
        record['category'] = _parse_category(record['category'])
        collection.update_one({'problemNum': record['problemNum']},
                              {'$set': record}, upsert=True)
139 | + | ||
140 | + | ||
if __name__ == "__main__":
    # Full pipeline: connect, crawl the problem list, enrich it step by step,
    # then push the result to MongoDB.
    startTime = datetime.now()
    client = setup()
    problems = get_khu_problem_list()
    problems = get_problem_info(problems)
    problems = get_solvedac_level(problems)
    problems = get_category(problems)
    update_database(problems, client)
    # Dividing the timedelta itself by 60 yields another timedelta, not a
    # number of minutes; convert to seconds first.
    elapsedMinutes = (datetime.now() - startTime).total_seconds() / 60
    print('Time elapsed :', elapsedMinutes, 'mins')
150 | + |
jaksimsamil-crawler/requirements.txt
0 → 100644
-
Please register or login to post a comment