main.py
import os

import crawler
import utils

TOKEN = 'YOUR_TOKEN_HERE'       # GitHub personal access token
DATASET_DIR = 'YOUR_PATH_HERE'  # root directory for downloaded files
REPO_PATH = 'repos.txt'         # cached list of repository names

# Drop leftover empty directories from interrupted runs.
utils.removeEmptyDirectories(DATASET_DIR)

c = crawler.GithubCrawler(TOKEN)

if not os.path.exists(REPO_PATH):
    # Query GitHub for Python repositories mentioning MNIST and cache the names.
    repos = c.search_repo('MNIST+language:python', 1000, 2000)
    with open(REPO_PATH, 'w') as f:
        for r in repos:
            f.write(r + '\n')
else:
    # Reuse the cached repository list from a previous run.
    with open(REPO_PATH) as f:
        repos = f.readlines()

S = 0  # start index; raise it to resume an interrupted run
L = len(repos)
print('Found repositories:', L)

for i in range(S, L):
    r = repos[i].strip()
    savename = r.replace('/', '_')
    print(f'Downloading [{i}]:', savename)
    # Skip repositories that have already been downloaded.
    if os.path.exists(os.path.join(DATASET_DIR, savename)):
        continue
    # List the repository's files and keep only Python sources and notebooks.
    files = c.search_files(r, True)
    files = [x for x in files if utils.isformat(x, ['py', 'ipynb'])]
    if files:
        utils.downloadFiles(DATASET_DIR, savename, files)
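
The script depends on two project-local modules, crawler and utils, whose sources are not shown here. The sketch below illustrates what the two simplest utils helpers might look like, inferred purely from their call sites above; the real implementations may differ, and GithubCrawler (assumed to wrap the GitHub search API, returning 'owner/name' strings from search_repo and per-repository file lists from search_files) is not reproduced.

import os

def isformat(path, extensions):
    """True if `path` ends in one of `extensions` (behavior inferred from the call site)."""
    return path.rsplit('.', 1)[-1].lower() in extensions

def removeEmptyDirectories(root):
    """Remove empty subdirectories under `root`, deepest first (a sketch, not the project's code)."""
    for dirpath, _, _ in os.walk(root, topdown=False):
        # Children are visited before parents, so a parent whose only
        # contents were just-removed empty directories is caught too.
        if dirpath != root and not os.listdir(dirpath):
            os.rmdir(dirpath)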