python streaming tornado chunked pydrive

How to use the io module and Tornado's self.write at the same time

There is a huge binary file uploaded to Google Drive. I am developing a Tornado-based HTTP proxy server that provides a binary stream of that same file. It seems natural to proxy the huge file in multiple chunks (download each chunk using PyDrive, then send it to the client with self.write(chunk) or something similar).

The problem is that there seems to be only one choice for downloading chunked binary files from Google Drive: googleapiclient.http.MediaIoBaseDownload. However, this class only supports FDs or io.Base objects as its first argument.

https://google.github.io/google-api-python-client/docs/epy/googleapiclient.http.MediaIoBaseDownload-class.html

My code looks something like this:

import tornado.httpserver
import tornado.ioloop
import tornado.web
from googleapiclient.http import MediaIoBaseDownload
import io

class VideoContentHandler(tornado.web.RequestHandler):
    """Download one file from the configured Google Drive folder.

    The handler reads ``drive`` and ``google_drive_folder_id`` from
    ``self.application``.
    """

    def get(self, googledrive_id):
        """Fetch the Drive file whose id is given in the URL, chunk by chunk.

        Args:
            googledrive_id: The URL capture group, including its leading
                ``/``. It is ``None`` when the URL carries no id segment.

        Raises:
            tornado.web.HTTPError: 404 when no id was supplied or when no
                file in the folder has that id.
        """
        # The route's capture group is optional, so this argument may be
        # None; slicing None would raise TypeError and surface as a 500.
        if not googledrive_id:
            raise tornado.web.HTTPError(404)
        googledrive_id = googledrive_id[1:]  # drop the leading "/"

        folder_id = self.application.google_drive_folder_id
        query = "'" + folder_id + "' in parents and trashed=false"
        file_list = self.application.drive.ListFile({'q': query}).GetList()

        # Pick the requested file before touching the local file, so an
        # unknown id does not leave an empty file behind.
        drive_file = next(
            (f for f in file_list if f['id'] == googledrive_id), None)
        if drive_file is None:
            raise tornado.web.HTTPError(404)

        request = self.application.drive.auth.service.files().get_media(
            fileId=drive_file['id'])

        # io.FileIO will save the chunks to local file!
        # This is not what I want.
        # Using something different may solve the problem?
        with io.FileIO('/tmp/bigvid-from-pydrive.mp4', mode='wb') as local_file:
            downloader = MediaIoBaseDownload(
                local_file, request, chunksize=2048*1024)
            done = False

            while not done:
                _status, done = downloader.next_chunk()
                # Flush buffer and self.write(chunk)?

def main():
    """Authenticate with Google Drive and start the proxy on port 8888.

    NOTE(review): ``GoogleAuth``, ``GoogleDrive`` and ``handlers`` are not
    imported in the visible snippet; presumably they come from PyDrive and
    a project-local package -- confirm.
    """
    gauth = GoogleAuth()
    gauth.CommandLineAuth() # Already done

    app = tornado.web.Application([
        (r"^/videocontents(/.+)?$", handlers.api.VideoContentHandler),
    ])
    # main() is a plain function, so there is no ``self`` here. The handler
    # reads these values through ``self.application``, so they are attached
    # to the Application object.
    app.drive = GoogleDrive(gauth)
    app.google_drive_folder_id = '<GOOGLE_DRIVE_FOLDER_ID>'

    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(8888)
    tornado.ioloop.IOLoop.instance().start()

if __name__ == "__main__":
    main()

When should I call self.write(chunk)?

Solution

You can use io.BytesIO instead of io.FileIO: it is an in-memory buffer, so nothing is written to disk, and it will be faster.

I haven't tested it, but this is how your code would look (read the comments for explanation):

from tornado import gen

# You need to make your get method a coroutine so that it can
# yield on self.flush() between chunks.
@gen.coroutine
def get(self, googledrive_id):
    # Sketch only: the `...` lines stand for the unchanged parts of the
    # original handler (building the query, listing files, creating
    # `request`).
    ...

    # with io.FileIO(...) <<<< YOU DON'T NEED THIS LINE NOW
    for f in file_list:
        ...

        buffer = io.BytesIO() # in-memory buffer the downloader writes each chunk into

        downloader = MediaIoBaseDownload(buffer, request, chunksize=2048*1024)

        # Now is the time to set the appropriate headers
        self.set_header('Content-Type', 'video/mp4')
        # If you know the size of the video,
        # write the Content-Length header.
        # NOTE: `<size of file in bytes>` is a placeholder, not valid
        # Python -- replace it with the real size or drop this line.
        self.set_header('Content-Length', <size of file in bytes>)
        # If not, it's still OK

        done = False

        while done is False:
            # NOTE(review): next_chunk() is called synchronously here, so it
            # presumably blocks the IOLoop while the chunk downloads --
            # confirm, and consider running it in an executor.
            status, done = downloader.next_chunk()

            # At this point, the downloader has written
            # the chunk to the buffer.
            # We read that data and write it to the response.
            self.write(buffer.getvalue())

            # Now flush the data to the socket
            yield self.flush()

            # We also need to empty the buffer;
            # otherwise it will keep growing and eat up all the RAM
            buffer.truncate(0)

            # truncate() does not move the stream position, so rewind to the
            # start; otherwise the next chunk would be written at the old offset
            buffer.seek(0)