
Writing a Twisted client to send looping GET requests to multiple APIs and record the responses


I haven't done Twisted programming in a while, so I'm trying to get back into it for a new project. I'm attempting to set up a Twisted client that takes a list of servers as an argument and, for each server, sends an API GET request and writes the response body to a file. This GET request should be repeated every 60 seconds.

I've done it successfully with a single server using Twisted's Agent class:

from StringIO import StringIO

from twisted.internet import reactor
from twisted.internet.protocol import Protocol
from twisted.web.client import Agent
from twisted.web.http_headers import Headers
from twisted.internet.defer import Deferred

import datetime
from datetime import timedelta
import time
import optparse  # used by parse_args() below

count = 1
filename = "test.csv"

class server_response(Protocol):
    def __init__(self, finished):
        print "init server response"
        self.finished = finished
        self.remaining = 1024 * 10

    def dataReceived(self, bytes):
        if self.remaining:
            display = bytes[:self.remaining]
            print 'Some data received:'
            print display
            with open(filename, "a") as myfile:
                myfile.write(display)

            self.remaining -= len(display)


    def connectionLost(self, reason):
        print 'Finished receiving body:', reason.getErrorMessage()

        self.finished.callback(None)

def capture_response(response): 
    print "Capturing response"
    finished = Deferred()
    response.deliverBody(server_response(finished))
    print "Done capturing:", finished

    return finished

def responseFail(err):
    print "error" + err
    reactor.stop()


def cl(ignored):
    print "sending req"
    agent = Agent(reactor)
    headers = {
        'authorization': [<snipped>],
        'cache-control': [<snipped>],
        'postman-token': [<snipped>]
    }

    URL = <snipped>
    print URL

    a = agent.request(
        'GET',
        URL,
        Headers(headers),
        None)

    a.addCallback(capture_response)
    reactor.callLater(60, cl, None)
    #a.addBoth(cbShutdown, count)


def cbShutdown(ignored, count):
    print "reactor stop"
    reactor.stop()

def parse_args():
    usage = """usage: %prog [options] [hostname]:port ...
    Run it like this:
      python test.py hostname1:instanceName1 hostname2:instancename2 ...
    """

    parser = optparse.OptionParser(usage)

    _, addresses = parser.parse_args()

    if not addresses:
        print parser.format_help()
        parser.exit()

    def parse_address(addr):
        if ':' not in addr:
            hostName = '127.0.0.1'
            instanceName = addr
        else:
            hostName, instanceName = addr.split(':', 1)

        return hostName, instanceName

    return map(parse_address, addresses)

if __name__ == '__main__':
    d = Deferred()
    d.addCallbacks(cl, responseFail)
    reactor.callWhenRunning(d.callback, None)

    reactor.run()

However, I'm having a tough time figuring out how to have multiple agents sending calls. Here I'm relying on the reactor.callLater(60, cl, None) at the end of cl() to create the call loop. So how do I create multiple agent protocols (server_response(Protocol)) and keep looping the GET request for each of them once my reactor is started?


Solution

  • Look what the cat dragged in!

    So how do I create multiple call agent

    Use treq. You rarely want to get tangled up with the Agent class.
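
    If the API needs the same authorization headers as in the question, treq.get() accepts a headers argument directly, so there is no need to assemble the request by hand through Agent. A minimal sketch, assuming a placeholder endpoint and token in place of the snipped values:

    import sys

    import treq
    from twisted.internet import task

    def fetch(url, token):
        # Placeholder header values; treq accepts a dict mapping header
        # names to lists of values (or a twisted Headers instance).
        d = treq.get(url, headers={
            'authorization': ['Bearer ' + token],
            'cache-control': ['no-cache'],
        })
        d.addCallback(treq.content)   # body is delivered as bytes
        return d

    def main(reactor):
        d = fetch('https://example.com/api/status', 'my-token')
        d.addCallback(lambda body: sys.stdout.write(repr(body) + '\n'))
        return d

    task.react(main)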

    This API GET call should be repeated every 60 seconds

    Use task.LoopingCall instead of callLater; in this case it's easier and you'll run into fewer problems later.

    import treq
    from twisted.internet import task, reactor

    filename = 'test.csv'

    def writeToFile(content):
        # Append the response body to the output file.
        with open(filename, 'ab') as f:
            f.write(content)

    def everyMinute(*urls):
        # Fire one GET per URL; each Deferred's callbacks run as soon as
        # that particular response arrives.
        for url in urls:
            d = treq.get(url)
            d.addCallback(treq.content)   # read the full response body
            d.addCallback(writeToFile)

    #----- Main -----#
    sites = [
        'https://www.google.com',
        'https://www.amazon.com',
        'https://www.facebook.com']

    # Run everyMinute(*sites) immediately and then every 60 seconds.
    repeating = task.LoopingCall(everyMinute, *sites)
    repeating.start(60)

    reactor.run()
    

    Execution starts in the everyMinute() function, which runs every 60 seconds. Within that function, each endpoint is queried, and once the contents of a response become available, the treq.content function extracts the body. Finally, the contents are written to a file.
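
    To tie this back to the question, the same pattern extends to one URL and one output file per server, built from the hostname:instanceName pairs that the original parse_args() produces. The URL layout, per-server file names, and the logError handler below are illustrative assumptions rather than part of the original code:

    import sys

    import treq
    from twisted.internet import task, reactor

    def writeToFile(content, filename):
        # Append this server's response body to its own file.
        with open(filename, 'ab') as f:
            f.write(content)

    def logError(failure, url):
        # Report the failure but keep the loop (and the other requests) going.
        print('request to %s failed: %s' % (url, failure.getErrorMessage()))

    def everyMinute(servers):
        for hostName, instanceName in servers:
            # Hypothetical URL layout; adjust to the real API.
            url = 'https://%s/api/%s/status' % (hostName, instanceName)
            d = treq.get(url)
            d.addCallback(treq.content)
            d.addCallback(writeToFile, '%s_%s.csv' % (hostName, instanceName))
            d.addErrback(logError, url)

    def parse_address(addr):
        if ':' not in addr:
            return '127.0.0.1', addr
        hostName, instanceName = addr.split(':', 1)
        return hostName, instanceName

    servers = [parse_address(a) for a in sys.argv[1:]]
    task.LoopingCall(everyMinute, servers).start(60)
    reactor.run()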

    PS

    Are you scraping or trying to extract something from those sites? If you are, Scrapy might be a good option for you.
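
    A minimal spider, just to show the shape (the spider name, start URL, and extracted fields are placeholders):

    import scrapy

    class StatusSpider(scrapy.Spider):
        name = 'status'
        start_urls = ['https://www.example.com']

        def parse(self, response):
            # Run with:  scrapy runspider spider.py -o out.csv
            yield {'url': response.url,
                   'title': response.css('title::text').get()}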