# crawler.py
from github import Github
import time
import calendar

DATASET_MAX = 1000 # GitHub search returns at most 1000 results per query

class GithubCrawler:
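    # Thin wrapper around PyGithub for crawling repositories and listing their files.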
    def __init__(self, token):
        self._token = token
        self._g = Github(token)

    def getTimeLimit(self):
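        # Seconds until the core API rate limit resets (plus one second of slack).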
        core_rate_limit = self._g.get_rate_limit().core
        reset_timestamp = calendar.timegm(core_rate_limit.reset.timetuple())
        sleep_time = reset_timestamp - calendar.timegm(time.gmtime()) + 1
        return sleep_time

    def search_repo(self, keywords, S = 0, E = DATASET_MAX):
        # Search repositories matching the keywords in their readme/description
        # and return "owner/name" strings for result indices S..E.
        if isinstance(keywords, str):
            keywords = [keywords] # auto packing for one keyword

        # Search terms and qualifiers are space-separated; PyGithub URL-encodes the query.
        query = ' '.join(keywords) + ' in:readme in:description'
        result = self._g.search_repositories(query)

        ret = []
        for i in range(S, E):
            while True:
                try:
                    r = result[i]
                    repoName = r.owner.login + '/' + r.name
                    print("repo found", f"[{i}]:", repoName)
                    ret.append(repoName)
                    break
                except IndexError: # fewer results available than requested
                    return ret
                except Exception:
                    print("Rate Limit Exceeded... Retrying", f"[{i}]", "Limit Time:", self.getTimeLimit())
                    time.sleep(1)
        
        return ret

    def search_files(self, repo_url, downloadLink = False):
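        # List every file in the repo, returning download URLs instead of paths when downloadLink is True.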
        while True:
            try:
                repo = self._g.get_repo(repo_url)
                break
            except Exception as e:
                if '403' in str(e):
                    print("Rate Limit Exceeded... Retrying", repo_url, "Limit Time:", self.getTimeLimit())
                    time.sleep(1)
                    continue
                print(e)
                return []

        try:
            contents = repo.get_contents("")
        except Exception: #empty repo
            return []
            
        files = []

        while contents:
            file_content = contents.pop(0)
            if file_content.type == 'dir':
                if 'lib' in file_content.path: # skip repos that vendor a Python lib directory (too many files)
                    return []
                contents.extend(repo.get_contents(file_content.path))
            else:
                if downloadLink:
                    files.append(file_content.download_url)
                else:
                    files.append(file_content.path)

        return files
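

# A minimal usage sketch, not part of the original module: it assumes a personal
# access token in the GITHUB_TOKEN environment variable and an illustrative
# search keyword; adjust both before running.
if __name__ == "__main__":
    import os

    crawler = GithubCrawler(os.environ["GITHUB_TOKEN"])
    repos = crawler.search_repo("machine learning", S=0, E=5) # small range to stay inside rate limits
    for repo_name in repos:
        print(repo_name, crawler.search_files(repo_name, downloadLink=True))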