I coded a small Flask app to download files from Google Drive.
@app.route("/downloadFile/<id>")
def downloadFile(id):
    """Fetch a Google Drive file by its ID and return it as an attachment.

    The file is downloaded completely through ``gdrive.downloadFile`` before
    the response is built.
    """
    stream, filename, mimetype = gdrive.downloadFile(id)
    # Positional arguments after the stream: mimetype, as_attachment flag,
    # and the name offered to the client for the attachment.
    return send_file(stream, mimetype, True, filename)
I used the download method from the example here, with small changes:
def downloadFile(self, file_id):
    """Download a Drive file into memory.

    Returns a tuple ``(buffer, name, mime_type)``: ``buffer`` is an
    ``io.BytesIO`` positioned at its start, while ``name`` and ``mime_type``
    are the ``name`` and ``mimeType`` entries of the file's metadata.
    Progress is printed after every chunk.
    """
    metadata = self.drive.files().get(fileId=file_id).execute()
    media_request = self.drive.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, media_request)
    name = metadata.get('name')
    finished = False
    while not finished:
        progress, finished = downloader.next_chunk()
        percent = int(progress.progress() * 100)
        print("Downloading {} - {}%".format(name, percent))
    # Rewind so the caller reads from the first byte.
    buffer.seek(0)
    return (buffer, name, metadata.get('mimeType'))
It worked as expected and downloaded the file to my computer.
Now, I want to deploy this Flask app to Heroku. My problem is with the HTTP timeouts, as stated here:
HTTP requests have an initial 30 second window in which the web process must return response data
As some of my files might take more than 30 seconds to download, this ends up being a big problem.
I've tried to use the Response class and the yield statement to keep sending empty bytes until I have downloaded and sent the file with the below function:
def sendUntilEndOfRequest(func, args=()):
    """Run ``func(*args)`` in the background and stream filler output meanwhile.

    Returns a streaming ``Response`` whose generator yields an empty string
    once per second until the result of ``func`` is available, then yields
    that result as the last item.

    NOTE(review): leaving the ``with ThreadPoolExecutor()`` block shuts the
    executor down and waits for submitted work, so this function probably
    does not return until ``func`` has finished - confirm against the
    concurrent.futures documentation.
    NOTE(review): the last item yielded is whatever ``func`` returned; the
    traceback reported with this code shows the server insisting on bytes.
    """
    def thread():
        # Push an application context and a test request context around the
        # call - presumably so Flask helpers work off the request thread.
        with app.app_context(), app.test_request_context():
            return func(*args)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # "" doubles as the "no result yet" sentinel; a func that really
        # returns "" would keep the generator below looping forever.
        ret = ""
        # NOTE(review): this local name shadows the built-in exec().
        def exec():
            # Emit an empty keep-alive item every second until ret is set.
            while ret == "":
                yield ""
                time.sleep(1)
            yield ret
        future = executor.submit(thread)
        def getValue():
            # Block on the future in a separate thread and publish the result
            # to the generator through the enclosing ``ret`` variable.
            nonlocal ret
            ret = future.result()
        threading.Thread(target=getValue).start()
        return Response(stream_with_context(exec()))
I tried to make it somewhat generic so that I can reuse it for any other function that takes more than 30 seconds to execute.
Now, my download code is
@app.route("/downloadFile/<id>")
def downloadFile(id):
    """Download a Drive file by ID, delegating the wait to ``sendUntilEndOfRequest``.

    The download-and-send step is wrapped in a local callable and handed to
    ``sendUntilEndOfRequest``, whose return value becomes the view's response.
    """
    def fetch_and_send():
        stream, filename, mimetype = gdrive.downloadFile(id)
        # Positional arguments after the stream: mimetype, as_attachment
        # flag, and the name offered to the client for the attachment.
        return send_file(stream, mimetype, True, filename)

    return sendUntilEndOfRequest(fetch_and_send)
But every time I try to run this code, it gives this error:
127.0.0.1 - - [15/Jan/2020 20:38:06] "[37mGET /downloadFile/1heeoEBZrhW0crgDSLbhLpcyMfvXqSmqi HTTP/1.1[0m" 200 -
Error on request:
Traceback (most recent call last):
File "C:\Users\fsvic\AppData\Local\Programs\Python\Python37\lib\site-packages\werkzeug\serving.py", line 303, in run_wsgi
execute(self.server.app)
File "C:\Users\fsvic\AppData\Local\Programs\Python\Python37\lib\site-packages\werkzeug\serving.py", line 294, in execute
write(data)
File "C:\Users\fsvic\AppData\Local\Programs\Python\Python37\lib\site-packages\werkzeug\serving.py", line 274, in write
assert isinstance(data, bytes), "applications must write bytes"
AssertionError: applications must write bytes
Apparently, the file downloads correctly. I tested replacing the `send_file` call with a `render_template` call to check whether yielding Flask objects is possible, and it worked perfectly. I also tested returning strings, and that worked as well.
In the end, how can I retrieve the file I downloaded?
All `MediaIoBaseDownload` does is call the `write` method of the file handle it is given.
So you can implement your own IO like this:
import io
from googleapiclient import discovery
from httplib2 import Http
from oauth2client import file, client, tools
from googleapiclient.http import MediaIoBaseDownload
from flask import Flask
from flask import Response
# Flask application object that the route decorators below attach to.
app = Flask(__name__)
# OAuth scope requested for Drive: the read-only scope.
SCOPES = 'https://www.googleapis.com/auth/drive.readonly'
# Credentials are looked up in storage.json first.
store = file.Storage('storage.json')
creds = store.get()
# If nothing usable is stored, run the OAuth flow from client_id.json; the
# store is passed to run_flow, presumably so the new credentials are saved.
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_id.json', SCOPES)
    creds = tools.run_flow(flow, store)
# Drive v3 API client built on an HTTP object authorized with the credentials.
drive_service = discovery.build('drive', 'v3', http=creds.authorize(Http()))
class ChunkHolder:
    """Minimal write-only sink that remembers only the most recent chunk.

    It offers just the ``write`` method, so it can stand in for a file
    object wherever ``write`` is the only call made on it.
    """

    def __init__(self):
        # Nothing has been written yet.
        self.chunk = None

    def write(self, chunk):
        """Replace the held chunk with ``chunk``; earlier data is dropped."""
        self.chunk = chunk
@app.route('/<file_id>')
def download_file(file_id):
    """Stream a Google Drive file to the client one chunk at a time.

    Each chunk is forwarded as soon as ``MediaIoBaseDownload`` hands it to
    the ``ChunkHolder``, so earlier chunks are not kept in memory.

    Args:
        file_id: ID of the Drive file, taken from the URL path.

    Returns:
        A streaming ``Response`` carrying the raw file bytes.
    """
    media_request = drive_service.files().get_media(fileId=file_id)

    def download_stream():
        holder = ChunkHolder()
        downloader = MediaIoBaseDownload(holder, media_request)
        done = False
        while not done:
            # Clear the previous chunk first: if next_chunk() finishes
            # without writing anything (e.g. an empty file), we must not
            # resend stale data or yield None, which a WSGI server rejects.
            holder.chunk = None
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))
            if holder.chunk:
                yield holder.chunk

    # Without an explicit mimetype Flask labels the response text/html;
    # the payload is arbitrary file content, so mark it as binary.
    return Response(download_stream(), mimetype='application/octet-stream')
# Start the Flask server on port 5000 only when this file is run as a script.
if __name__ == '__main__':
    app.run(port=5000)
We yield the downloaded chunks as soon as they are downloaded and do not retain previous chunks in memory.