# main.py
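# Crawl GitHub for MNIST-related repositories written in Python and download
# their .py / .ipynb source files into DATASET_DIR.
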
import os

import crawler
import utils
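
# `crawler` and `utils` are local project modules. Judging from how they are
# used below, the assumed interface is roughly (hypothetical signatures):
#   crawler.GithubCrawler(token)           -- authenticated GitHub client
#   c.search_repo(query, ...)              -> list of 'user/repo' names
#   c.search_files(repo, flag)             -> list of downloadable file paths
#   utils.isformat(path, exts)             -> whether path ends in one of exts
#   utils.downloadFiles(dir, name, files)  -- save files under dir/name
#   utils.removeEmptyDirectories(dir)      -- prune empty subdirectories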

TOKEN = 'YOUR_TOKEN_HERE'       # GitHub API token for the crawler
DATASET_DIR = 'YOUR_PATH_HERE'  # root directory for downloaded repositories
REPO_PATH = 'repos.txt'         # cached list of repository names

# Prune empty subdirectories left over from a previous, interrupted run.
utils.removeEmptyDirectories(DATASET_DIR)

c = crawler.GithubCrawler(TOKEN)

if not os.path.exists(REPO_PATH):
    # No cached list yet: query GitHub and save the results for later runs.
    repos = c.search_repo('MNIST+language:python', 1000, 2000)
    with open(REPO_PATH, 'w') as f:
        for r in repos:
            f.write(r + '\n')
else:
    # Reuse the cached repository list.
    with open(REPO_PATH, 'r') as f:
        repos = f.readlines()

S = 0  # start index; raise it to resume an interrupted run
L = len(repos)
print('Found repositories:', L)

for i in range(S, L):
    r = repos[i].strip()
    savename = r.replace('/', '_')  # 'user/repo' -> 'user_repo'

    # Skip repositories that have already been downloaded.
    if os.path.exists(os.path.join(DATASET_DIR, savename)):
        continue

    print(f'Downloading [{i}]: {savename}')

    # List the repository's files and keep only Python sources and notebooks.
    files = c.search_files(r, True)
    files = [x for x in files if utils.isformat(x, ['py', 'ipynb'])]
    if files:
        utils.downloadFiles(DATASET_DIR, savename, files)
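
# For reference, a minimal sketch of the extension check that utils.isformat
# presumably performs (hypothetical; the real helper lives in utils.py):
#
#     def isformat(path, exts):
#         return path.rsplit('.', 1)[-1].lower() in exts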