graykode

(add) git cloner

1 +# Copyright 2020-present Tae Hwan Jung
2 +#
3 +# Licensed under the Apache License, Version 2.0 (the "License");
4 +# you may not use this file except in compliance with the License.
5 +# You may obtain a copy of the License at
6 +#
7 +# http://www.apache.org/licenses/LICENSE-2.0
8 +#
9 +# Unless required by applicable law or agreed to in writing, software
10 +# distributed under the License is distributed on an "AS IS" BASIS,
11 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 +# See the License for the specific language governing permissions and
13 +# limitations under the License.
14 +
15 +import os
16 +import git
17 +import json
18 +import argparse
19 +from git import Repo
20 +from tqdm import tqdm
21 +from time import sleep
22 +from queue import Queue
23 +from threading import Thread
24 +
class ClonePooler(object):
    """Pool of daemon threads that clone GitHub repositories from a queue.

    The constructor starts the worker threads immediately. Each worker loops
    forever, taking one repository name (the ``{repo}`` part of
    ``https://github.com/{repo}.git``) from the queue and cloning it to
    ``{repos_dir}/{repo}``.
    """

    def __init__(self, total_repos, num_worker_threads=None, repos_dir=None):
        """Create the work queue and start the worker threads.

        Args:
            total_repos: Number of repositories expected; used only for the
                progress line printed after each successful clone.
            num_worker_threads: Number of worker threads to start. When
                omitted, the module-level ``args.num_worker_threads`` is used.
            repos_dir: Directory the repositories are cloned into. When
                omitted, the module-level ``args.repos_dir`` is used.
        """
        # Imported here because the file-level imports only bring in Thread.
        from threading import Lock

        self.count = 0
        self.total_repos = total_repos
        self._queue = Queue()
        # Guards self.count, which is incremented from several worker threads.
        self._count_lock = Lock()
        # Fall back to the script-level ``args`` so existing callers that pass
        # only ``total_repos`` keep working.
        self.num_worker_threads = (
            args.num_worker_threads
            if num_worker_threads is None else num_worker_threads
        )
        self.repos_dir = args.repos_dir if repos_dir is None else repos_dir

        # All state is set up before this point, because the workers start
        # consuming the queue as soon as they are started.
        for _ in range(self.num_worker_threads):
            _thread = Thread(target=self._worker)
            _thread.daemon = True
            _thread.start()

    def _worker(self):
        """Process queued repositories forever (runs in a daemon thread)."""
        while True:
            repo = self._queue.get()
            try:
                self.do_job(repo)
            except Exception as e:
                # Thread boundary: report the failure and keep the worker
                # alive so the remaining queue items still get processed.
                print(f'{repo} failed: {e!r}')
            finally:
                # Always acknowledge the item; otherwise join_queue() would
                # block forever after a failed job.
                self._queue.task_done()

    def set_queue(self, repos):
        """Enqueue one repository name for cloning."""
        self._queue.put(repos)

    def join_queue(self):
        """Block until every queued repository has been processed."""
        self._queue.join()

    def do_job(self, repo):
        """Clone a single repository and print the overall progress.

        Git errors are printed rather than raised, so one bad repository does
        not stop the run.

        Args:
            repo: Repository name as used in the GitHub URL path.
        """
        try:
            # NOTE(review): the empty "user:password" part of the URL is
            # presumably there so git fails instead of prompting for
            # credentials on private or missing repositories - confirm.
            Repo.clone_from(
                f'https://:@github.com/{repo}.git',
                f'{self.repos_dir}/{repo}'
            )
            sleep(0.1)
            # Increment under the lock and print the value this thread
            # produced, so concurrent workers cannot lose an update.
            with self._count_lock:
                self.count += 1
                finished = self.count
            print(f"{finished}/{self.total_repos} {(finished/self.total_repos) * 100}")
        except git.exc.InvalidGitRepositoryError:
            print(f'{repo} is not found.')
        except git.exc.GitError as e:
            print(e)
63 +
def main(args):
    """Clone every distinct repository listed in a JSONL file.

    Each non-blank line of ``args.jsonl_file`` is parsed as a JSON object and
    its ``'repo'`` value is collected. Duplicates are dropped, then every
    repository is queued on a ``ClonePooler`` and the call blocks until the
    queue has been fully processed.

    Args:
        args: Parsed command-line arguments; ``repos_dir`` and ``jsonl_file``
            are read here.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a JSON object has no ``'repo'`` key.
    """
    os.makedirs(args.repos_dir, exist_ok=True)

    repos = set()
    with open(args.jsonl_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads('') abort the whole run.
            if not line:
                continue
            repos.add(json.loads(line)['repo'])

    pooler = ClonePooler(
        total_repos=len(repos)
    )
    for repo in repos:
        pooler.set_queue(repo)
    pooler.join_queue()
80 +
if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description="")

    # Required paths: the input JSONL file and the clone destination.
    parser.add_argument(
        "--jsonl_file",
        required=True,
        type=str,
        help="jsonl file path.",
    )
    parser.add_argument(
        "--repos_dir",
        required=True,
        type=str,
        help="directory that all repositories will be downloaded.",
    )

    # Optional tuning knob for the size of the clone thread pool.
    parser.add_argument(
        "--num_worker_threads",
        default=16,
        type=int,
        help="number of threads in a worker",
    )

    # ``args`` stays a module-level name: ClonePooler reads it as a global.
    args = parser.parse_args()
    main(args)
...\ No newline at end of file ...\ No newline at end of file