Search code examples
python · github-api

Is there a way to avoid API rate limiting from crashing my program with Github?


from github import Github, Auth
import typing as T
from langchain.docstore.document import Document


def load_github_repos():
    """Fetch text documents (.md / .txt) from every repository in REPOS.

    Relies on module-level names defined elsewhere in this file:
    GIT_TOKEN (a GitHub personal-access token string) and REPOS (an
    iterable of repository identifiers) — TODO confirm their shapes.

    Returns:
        T.List[Document]: one langchain Document per matching file
        across all repositories.
    """
    import time

    from github import RateLimitExceededException

    def _get_contents(repo, path, max_retries=5):
        """repo.get_contents(path) with exponential back-off when the
        GitHub API rate limit is hit (the crash reported in the trace).

        Retries with sleeps of 60s, 120s, 240s, ...; re-raises on the
        final attempt so a persistent limit still surfaces.
        """
        for attempt in range(max_retries):
            try:
                return repo.get_contents(path)
            except RateLimitExceededException:
                if attempt == max_retries - 1:
                    raise
                time.sleep(60 * (2 ** attempt))

    def clone_github_repo(org_name=None, repo_name="", files: T.Tuple = (".md", ".txt")) -> T.List[Document]:
        """Walk one repository tree and collect files ending in `files`.

        Args:
            org_name: GitHub organisation. If None, `repo_name` is taken
                to already be a full "org/repo" identifier (the loop
                below calls with `repo_name` only, so REPOS presumably
                holds full identifiers — TODO confirm).
            repo_name: repository name, or "org/repo" when org_name is None.
            files: filename suffixes to keep (str.endswith accepts a tuple).

        Returns:
            A list of Documents; file bytes are decoded as UTF-8.
        """
        auth = Auth.Token(GIT_TOKEN)
        g = Github(auth=auth)
        full_name = f"{org_name}/{repo_name}" if org_name else repo_name
        repo = g.get_repo(full_name)
        docs = []
        # Iterative walk of the repo tree; directories push their children
        # back onto the work list.
        contents = _get_contents(repo, "")
        while contents:
            file_content = contents.pop(0)
            if file_content.type == "dir":
                contents.extend(_get_contents(repo, file_content.path))
            elif file_content.path.endswith(files):
                # decoded_content is bytes; Document.page_content expects str.
                text = file_content.decoded_content.decode("utf-8", errors="replace")
                docs.append(Document(page_content=text, metadata={"filename": file_content.name}))
        return docs

    repo_docs = []
    for repo in REPOS:
        repo_docs += clone_github_repo(repo_name=repo)
    # Bug fix: the original implicitly returned None, discarding all work.
    return repo_docs

Here's the stack trace:

Traceback (most recent call last):
  File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 174, in <module>
    update_vector_database()
  File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 115, in update_vector_database
    load_github_repos())
  File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 63, in load_github_repos
    repo_docs += clone_github_repo(repo_name=repo)
  File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 56, in clone_github_repo
    contents.extend(repo.get_contents(file_content.path))
  File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Repository.py", line 2107, in get_contents
    headers, data = self._requester.requestJsonAndCheck(
  File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Requester.py", line 442, in requestJsonAndCheck
    return self.__check(
  File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Requester.py", line 487, in __check
    raise self.__createException(status, responseHeaders, data)
github.GithubException.RateLimitExceededException: 403 {"message": "API rate limit exceeded for user ID 80288341.", "documentation_url": "https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}

My question is: what should I do to avoid this rate limiting? I thought I was only making one API call per repository via g.get_repo(), but the rate-limiting page says I'm exceeding 5,000 API requests per hour (at least, I believe those are the numbers causing the crash).


Solution

  • You could catch the rate-limit exception (RateLimitExceededException) and back off before retrying:

    for repo in repos:
        try:
            repo_docs += clone_github_repo(repo_name=repo)
        except github.GithubException.RateLimitExceededException:
            time.sleep(600)  # adjust to taste
            # let trying again fail
            repo_docs += clone_github_repo(repo_name=repo)
    

    Otherwise, you could inspect the rate-limit headers (x-ratelimit-remaining, x-ratelimit-reset) returned with each API response and pause proactively before the limit is reached:

    https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limit-headers

    Finally, you might not need so many repositories all at once