crawler.py
from github import Github
import time
import calendar

DATASET_MAX = 1000

class GithubCrawler:
    def __init__(self, token):
        self._token = token
        self._g = Github(token)

    def getTimeLimit(self):
        # Seconds until the core API rate limit resets.
        core_rate_limit = self._g.get_rate_limit().core
        reset_timestamp = calendar.timegm(core_rate_limit.reset.timetuple())
        sleep_time = reset_timestamp - calendar.timegm(time.gmtime()) + 1
        return sleep_time

    def search_repo(self, keywords, S=0, E=DATASET_MAX):
        if isinstance(keywords, str):
            keywords = [keywords]  # auto-pack a single keyword into a list
        query = '+'.join(keywords) + '+in:readme+in:description'
        result = self._g.search_repositories(query)
        ret = []
        for i in range(S, E):
            while True:
                try:
                    r = result[i]
                    repoName = r.owner.login + '/' + r.name
                    print("repo found", f"[{i}]:", repoName)
                    ret.append(repoName)
                    break
                except Exception:  # most likely a rate-limit error from the search API
                    print("Rate Limit Exceeded... Retrying", f"[{i}]",
                          "Limit Time:", self.getTimeLimit())
                    time.sleep(1)
        return ret

    def search_files(self, repo_url, downloadLink=False):
        while True:
            try:
                repo = self._g.get_repo(repo_url)
                break
            except Exception as e:
                if '403' in str(e):  # rate limited: wait briefly and retry
                    print("Rate Limit Exceeded... Retrying", repo_url,
                          "Limit Time:", self.getTimeLimit())
                    time.sleep(1)
                    continue
                print(e)
                return []
        try:
            contents = repo.get_contents("")
        except Exception:  # empty repo
            return []
        files = []
        while contents:
            file_content = contents.pop(0)
            if file_content.type == 'dir':
                if 'lib' in file_content.path:  # a bundled python lib in the repo means too many files; skip it
                    return []
                contents.extend(repo.get_contents(file_content.path))
            else:
                if downloadLink:
                    files.append(file_content.download_url)
                else:
                    files.append(file_content.path)
        return files