from github import Github, Auth
import typing as T
from langchain.docstore.document import Document
def load_github_repos():
    """Collect documents from every repository listed in ``REPOS``.

    Returns:
        One flat list with the ``Document`` objects gathered from all
        repositories, in the order the repositories are listed.
    """

    def clone_github_repo(org_name, repo_name, files:T.Tuple=(".md", ".txt")) -> T.List[Document]:
        """Walk one repository through the GitHub API and wrap matching files.

        Args:
            org_name: Owner part of the ``org/repo`` name given to ``get_repo``.
            repo_name: Repository part of that name.
            files: Path suffixes to keep; files with any other suffix are skipped.

        Returns:
            One ``Document`` per matching file, with the file name stored
            under the ``"filename"`` metadata key.
        """
        auth = Auth.Token(GIT_TOKEN)
        g = Github(auth=auth)
        repo = g.get_repo(f"{org_name}/{repo_name}")
        contents = repo.get_contents("")
        docs = []
        # Breadth-first walk: ``contents`` is the queue of entries left to visit.
        while contents:
            file_content = contents.pop(0)
            if file_content.type == "dir":
                # Every directory costs one more get_contents() request, so the
                # request count grows with the size of the tree, not with the
                # number of repositories.
                contents.extend(repo.get_contents(file_content.path))
            elif file_content.path.endswith(files):
                # NOTE(review): reading decoded_content may trigger a further
                # request per file if listing entries carry no body -- confirm
                # against the PyGithub ContentFile documentation.
                docs.append(Document(page_content=file_content.decoded_content, metadata={"filename":file_content.name}))
        return docs

    repo_docs = []
    for repo in REPOS:
        # NOTE(review): org_name has no default but is not passed here; as
        # posted this call raises TypeError -- presumably the real code
        # supplies the organisation. Confirm before reuse.
        repo_docs += clone_github_repo(repo_name=repo)
    # Hand the accumulated documents back; the caller uses the call's value.
    return repo_docs
Here's the stack trace:
Traceback (most recent call last):
File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 174, in <module>
update_vector_database()
File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 115, in update_vector_database
load_github_repos())
File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 63, in load_github_repos
repo_docs += clone_github_repo(repo_name=repo)
File "/Users/john.eastman/workspace/rfp-monster/data_scripts/update_db.py", line 56, in clone_github_repo
contents.extend(repo.get_contents(file_content.path))
File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Repository.py", line 2107, in get_contents
headers, data = self._requester.requestJsonAndCheck(
File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Requester.py", line 442, in requestJsonAndCheck
return self.__check(
File "/Users/john.eastman/workspace/venv/lib/python3.9/site-packages/github/Requester.py", line 487, in __check
raise self.__createException(status, responseHeaders, data)
github.GithubException.RateLimitExceededException: 403 {"message": "API rate limit exceeded for user ID 80288341.", "documentation_url": "https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}
My question is: what should I do to avoid this rate limiting? I thought I was only making one API call for each repo that I fetch with g.get_repo(), but the rate-limiting page says I'm going above 5,000 API requests in an hour (at least, I think those are the limits that are making me crash).
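Every repo.get_contents(...) call is a separate API request, so you are making one request per directory rather than one per repository. You could catch the rate-limit exception and back off: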
import time

# Import the exception from the package top level: ``github.GithubException``
# resolves to the exception class, not the submodule, so the dotted form
# ``github.GithubException.RateLimitExceededException`` fails on lookup.
from github import RateLimitExceededException

for repo in REPOS:
    try:
        repo_docs += clone_github_repo(repo_name=repo)
    except RateLimitExceededException:
        # Pause, then retry this repository exactly once.
        time.sleep(600)  # adjust to taste
        # let trying again fail
        repo_docs += clone_github_repo(repo_name=repo)
Otherwise, you could check the rate-limit headers described at the link in the response:
https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limit-headers
Finally, you might not need so many repositories all at once.