python, tornado

How to get URLs that timed out or got an error?


I have a Python class built with Python + Tornado that works like a crawler. I have a lot of links on the same site and I need to get the responses from all of them into my database. The difficulty is this: I can't figure out how to catch the URLs that got an error (a timeout, or a runtime exception). I know how to solve this with newbie code (I've only been coding in Python for a week) by comparing the list of input links with the output, but I want to do it the right way. Can you tell me how I can do this?

import sys
import time
import requests
import json
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue

class Scraper():
    def __init__(self, source='', destinations=None, transform=None, headers={ }, max_clients=20, maxsize=20, connect_timeout=600, request_timeout=600 ):
        """Instantiate a tornado async http client to do many URL requests"""

        if destinations is None or transform is None:
            sys.stderr.write('You must pass both a collection of URLs and a transform function\n')
            raise SystemExit
        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
        self.headers = headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=self.maxsize)
        self.source = source
        self.destinations = destinations
        self.transform = transform
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                connect_timeout=connect_timeout,
                                request_timeout=request_timeout,
                                method="GET",
                                headers = headers
                                )

            future = self.http_client.fetch(request)
            def done_callback(future):
                self.queue.task_done()
                body = future.result().body
                transform(body)


            future.add_done_callback(done_callback)

def transform_data(body, url=''):

    #SOMECODE
    pass

a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)

Solution

  • In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved into a result or an exception:

    try:
        result = yield self.http_client.fetch(request)
    except Exception as exc:
        print("Failure!: %s" % exc)
    else:
        self.queue.task_done() 
        body = result.body
        transform(body)
    

    For more examples, see the Tornado documentation for HTTP clients.
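
    Applied to the get coroutine from the question, a minimal sketch of this approach might look like the following (a drop-in replacement for Scraper.get). It assumes a hypothetical self.failed list, e.g. self.failed = [] set in __init__, to collect the URLs that timed out or raised, so they can be logged or retried later:

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            try:
                # Pause this coroutine until the fetch resolves or raises;
                # timeouts surface here as exceptions.
                result = yield self.http_client.fetch(request)
            except Exception as exc:
                # Remember which URL failed so it can be reported or retried.
                print("Failure fetching %s: %s" % (url, exc))
                self.failed.append(url)
            else:
                transform(result.body)
            finally:
                # Mark the item done in both cases so queue.join() can complete
                # and the IOLoop is stopped even when some fetches fail.
                self.queue.task_done()

    Calling task_done() in the finally block is a deliberate choice: it lets queue.join() resolve even when some URLs fail, so the crawler still shuts down cleanly and self.failed ends up holding everything that needs a retry.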