Showing
1 changed file
with
92 additions
and
0 deletions
gitcloner.py
0 → 100644
1 | +# Copyright 2020-present Tae Hwan Jung | ||
2 | +# | ||
3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | ||
4 | +# you may not use this file except in compliance with the License. | ||
5 | +# You may obtain a copy of the License at | ||
6 | +# | ||
7 | +# http://www.apache.org/licenses/LICENSE-2.0 | ||
8 | +# | ||
9 | +# Unless required by applicable law or agreed to in writing, software | ||
10 | +# distributed under the License is distributed on an "AS IS" BASIS, | ||
11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
12 | +# See the License for the specific language governing permissions and | ||
13 | +# limitations under the License. | ||
14 | + | ||
15 | +import os | ||
16 | +import git | ||
17 | +import json | ||
18 | +import argparse | ||
19 | +from git import Repo | ||
20 | +from tqdm import tqdm | ||
21 | +from time import sleep | ||
22 | +from queue import Queue | ||
23 | +from threading import Thread | ||
24 | + | ||
class ClonePooler(object):
    """Clone GitHub repositories concurrently with a pool of worker threads.

    Repository names (``owner/name``) are pushed onto an internal queue with
    :meth:`set_queue`; daemon worker threads started by the constructor pull
    names off the queue and clone each one under ``repos_dir``.
    """

    def __init__(self, total_repos, num_worker_threads=None, repos_dir=None):
        """Create the queue and start the worker threads.

        Args:
            total_repos: Number of repositories that will be queued; only
                used for the progress output.
            num_worker_threads: Number of worker threads to start. When
                ``None``, falls back to the module-level
                ``args.num_worker_threads`` (the original behaviour).
            repos_dir: Directory the repositories are cloned into. When
                ``None``, falls back to the module-level ``args.repos_dir``.
        """
        # Local import: the file's top-level imports only bring in ``Thread``.
        from threading import Lock

        # Only touch the script-level ``args`` global when the caller did not
        # supply a value, so the class is usable when imported as a module.
        if num_worker_threads is None:
            num_worker_threads = args.num_worker_threads
        if repos_dir is None:
            repos_dir = args.repos_dir

        self.count = 0
        self.total_repos = total_repos
        self._queue = Queue()
        # Guards ``count`` and the progress print, which every worker updates.
        self._count_lock = Lock()
        self.num_worker_threads = num_worker_threads
        self.repos_dir = repos_dir

        for _ in range(self.num_worker_threads):
            _thread = Thread(target=self._worker)
            # Daemon threads do not keep the process alive once main returns.
            _thread.daemon = True
            _thread.start()

    def _worker(self):
        """Process queued repositories forever (runs in each worker thread)."""
        while True:
            repo = self._queue.get()
            try:
                self.do_job(repo)
            except Exception as e:
                # Thread boundary: report and keep the worker alive so the
                # remaining queue items are still processed.
                print(e)
            finally:
                # Always acknowledge the item; otherwise ``join_queue`` would
                # block forever after any unexpected failure.
                self._queue.task_done()

    def set_queue(self, repos):
        """Enqueue one repository name for cloning."""
        self._queue.put(repos)

    def join_queue(self):
        """Block until every queued repository has been processed."""
        self._queue.join()

    def do_job(self, repo):
        """Clone a single repository and print progress.

        Args:
            repo: Repository name in ``owner/name`` form; it is interpolated
                into both the GitHub URL and the destination path.

        Git errors are printed rather than raised, so one failing repository
        does not stop the rest.
        """
        try:
            # NOTE(review): the empty ``:@`` credentials presumably make git
            # fail instead of prompting for a login -- confirm.
            Repo.clone_from(
                f'https://:@github.com/{repo}.git',
                f'{self.repos_dir}/{repo}'
            )
        except git.exc.InvalidGitRepositoryError:
            print(f'{repo} is not found.')
        except git.exc.GitError as e:
            print(e)
        else:
            sleep(0.1)
            # Increment and report atomically so concurrent workers neither
            # lose updates nor print a stale counter.
            with self._count_lock:
                self.count += 1
                print(f"{self.count}/{self.total_repos} {(self.count/self.total_repos) * 100}")
63 | + | ||
def main(args):
    """Clone every distinct repository listed in a JSONL file.

    Args:
        args: Parsed command-line namespace; ``args.jsonl_file`` and
            ``args.repos_dir`` are read here.

    Each non-blank line of the JSONL file must be a JSON object with a
    ``repo`` key. Duplicate repository names are cloned only once.
    """
    os.makedirs(args.repos_dir, exist_ok=True)

    repos = set()
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) are not valid JSON.
                continue
            repos.add(json.loads(line)['repo'])

    pooler = ClonePooler(
        total_repos=len(repos)
    )
    for repo in repos:
        pooler.set_queue(repo)
    # Wait for the workers to drain the queue before exiting.
    pooler.join_queue()
80 | + | ||
if __name__ == '__main__':
    # Command-line entry point: declare the options in one table, register
    # them, then hand the parsed namespace to main(). ``args`` stays a
    # module-level name because other code in this file reads it.
    parser = argparse.ArgumentParser(description="")
    for flag, options in (
        ("--jsonl_file",
         dict(type=str, required=True, help="jsonl file path.")),
        ("--repos_dir",
         dict(type=str, required=True,
              help="directory that all repositories will be downloaded.")),
        ("--num_worker_threads",
         dict(type=int, default=16, help="number of threads in a worker")),
    ):
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    main(args)
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment