I am using PyGithub to scrape some repositories, but I get an error while iterating through the search pages.
def scrape_interval(self, interval):
    for repo_number, repo in self.search(interval):
        ...  # code

def search(self, interval):
    try:
        iterator = enumerate(self.github.search_repositories(query="Laravel created:" + interval))
    except:
        print("Going to sleep for 1 hour. The search API hit the limit")
        time.sleep(3600)
        iterator = self.search(interval)
    return iterator
As you can see, I try to catch errors when creating the iterator in search. But the error is thrown on the line for repo_number, repo in self.search(interval):, so presumably it is raised at the point where the iterator fetches the next item?
What are my options for making these errors catchable? I would prefer not to wrap the whole for loop in a try clause, but rather to handle the errors during the iteration itself.
For reference, here is the full traceback:
File "/Users/olofjondelius/Documents/Code/laravel-ai/src/examples/migration-analysis/../../GithubScraper.py", line 47, in scrape_interval
for repo_number, repo in self.search(interval):
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/PaginatedList.py", line 58, in _iter_
newElements = self._grow()
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/PaginatedList.py", line 70, in _grow
newElements = self._fetchNextPage()
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/PaginatedList.py", line 172, in _fetchNextPage
headers=self.__headers
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/Requester.py", line 185, in requestJsonAndCheck
return self.__check(*self.requestJson(verb, url, parameters, headers, input, cnx))
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/Requester.py", line 231, in requestJson
return self.__requestEncode(cnx, verb, url, parameters, headers, input, encode)
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/Requester.py", line 284, in __requestEncode
status, responseHeaders, output = self.__requestRaw(cnx, verb, url, requestHeaders, encoded_input)
File "/anaconda3/envs/laravel-ai/lib/python3.7/site-packages/github/Requester.py", line 309, in __requestRaw
requestHeaders
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 1275, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 1224, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 1016, in _send_output
self.send(msg)
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 956, in send
self.connect()
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 1384, in connect
super().connect()
File "/anaconda3/envs/laravel-ai/lib/python3.7/http/client.py", line 928, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/anaconda3/envs/laravel-ai/lib/python3.7/socket.py", line 707, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "/anaconda3/envs/laravel-ai/lib/python3.7/socket.py", line 748, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known
It sounds like the exception is being raised when you are iterating over the iterator, rather than when you create it. Your current try and except blocks only catch exceptions that are raised immediately when you call self.github.search_repositories, not anything that comes up while you're consuming the results.
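To see the difference, here is a minimal, self-contained sketch (not PyGithub-specific) of the same lazy behaviour: creating the iterator raises nothing, and the error only surfaces once something starts consuming it:

def lazy():
    yield 1
    raise RuntimeError("raised during consumption, not creation")

try:
    it = lazy()  # creating the generator runs none of its body...
except RuntimeError:
    print("never reached")

for x in it:  # ...so the error only surfaces here, on iteration
    print(x)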
To work around that, you could make your search function a generator. That would let you yield values for as long as you have them, but still catch exceptions and retry as often as necessary.
Try something like this:
def search(self, interval):
    while True:
        try:
            it = enumerate(self.github.search_repositories(query="Laravel created:" + interval))
            yield from it
            return  # if we completed the yield from without an exception, we're done!
        except:  # you should probably limit this to specific exception types
            print("Going to sleep for 1 hour. The search API hit the limit")
            time.sleep(3600)
As I noted in a comment, you should probably change the bare except clause to except socket.gaierror or something similar, so that you don't suppress all exceptions, but only the ones you are expecting and that a delay will fix for you. Something genuinely unexpected should still be allowed to stop the program (since it may reflect a bug elsewhere in your code).