How can I use Google Docs programmatically to extract text from PDF files? I already know that there are other options; however, I am curious whether it's possible to use Google Docs for this purpose.
You can convert a PDF file to text with Python using Drive API v3. Two steps are required: first upload the PDF so that Drive converts it to a Google Document, then export that document as plain text.
This sample is based on the Python Quickstart. The details are at https://developers.google.com/drive/v3/web/quickstart/python. Please read "Step 1: Turn on the Drive API" and "Step 2: Install the Google Client Library". If you already know them, I'm sorry for the repetition.
To use the following sample script, please modify the Quickstart as follows.
Please add the following imports to the Quickstart.
import io
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
Please change SCOPES to the following.
# Full-access Drive scope; the sample script both creates a file and exports it.
SCOPES = 'https://www.googleapis.com/auth/drive'
Please replace main() of the Quickstart with the following.
The sample script converts a PDF file to a TXT file. However, images in the PDF file cannot be converted to text.
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    The PDF is uploaded with a Google Docs target MIME type in the file
    metadata, which asks Drive to convert it to a Google Document. That
    document is then exported as plain text and written to ``txtfile``.

    Note: the converted Google Document is not deleted; it stays in Drive.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = 'sample.pdf'  # PDF file
    txtfile = 'sample.txt'  # Text file
    # The metadata MIME type is the conversion target; the media MIME type
    # must describe the bytes actually being uploaded (a PDF).
    docs_mime = 'application/vnd.google-apps.document'
    source_mime = 'application/pdf'

    upload = MediaFileUpload(pdffile, mimetype=source_mime, resumable=True)
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': docs_mime
        },
        media_body=upload
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes the output file even if a chunk download
    # raises, so the handle is not leaked.
    with io.FileIO(txtfile, 'wb') as out:
        dl = MediaIoBaseDownload(out, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()
If I have misunderstood your question, I'm sorry.
The complete script, with the changes applied to the Quickstart:
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
# Parse the command-line flags understood by oauth2client's tools module.
# When argparse cannot be imported, flags is None and get_credentials()
# falls back to tools.run().
try:
    import argparse

    _flag_parser = argparse.ArgumentParser(parents=[tools.argparser])
    flags = _flag_parser.parse_args()
except ImportError:
    flags = None
# If modifying these scopes, delete your previously saved credentials
# (get_credentials() stores them in ./drive-python-quickstart.json,
# relative to the current working directory).
SCOPES = 'https://www.googleapis.com/auth/drive'
# OAuth client secrets file passed to client.flow_from_clientsecrets().
CLIENT_SECRET_FILE = 'client_secret.json'
# Used as the OAuth flow's user agent in get_credentials().
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Return usable OAuth2 user credentials, running the flow if needed.

    Credentials are looked up in ``drive-python-quickstart.json`` in the
    current directory. If none are stored, or the stored ones are invalid,
    the OAuth2 flow is run and the resulting credentials are saved there.

    Returns:
        Credentials, the obtained credential.
    """
    credential_path = os.path.join("./", 'drive-python-quickstart.json')
    store = Storage(credential_path)
    stored = store.get()

    # Reuse what is on disk whenever it is present and still valid.
    if stored and not stored.invalid:
        return stored

    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        fresh = tools.run_flow(flow, store, flags)
    else:  # Needed only for compatibility with Python 2.6
        fresh = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return fresh
def main():
    """Convert a local PDF to a text file by round-tripping it through Drive.

    The PDF is uploaded with a Google Docs target MIME type in the file
    metadata, which asks Drive to convert it to a Google Document. That
    document is then exported as plain text and written to ``txtfile``.

    Note: the converted Google Document is not deleted; it stays in Drive.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = '../Downloads/sample.pdf'  # PDF file
    txtfile = '../Downloads/sample.txt'  # Text file
    # The metadata MIME type is the conversion target; the media MIME type
    # must describe the bytes actually being uploaded (a PDF).
    docs_mime = 'application/vnd.google-apps.document'
    source_mime = 'application/pdf'

    upload = MediaFileUpload(pdffile, mimetype=source_mime, resumable=True)
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': docs_mime
        },
        media_body=upload
    ).execute()

    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    # The context manager closes the output file even if a chunk download
    # raises, so the handle is not leaked.
    with io.FileIO(txtfile, 'wb') as out:
        dl = MediaIoBaseDownload(out, request)
        done = False
        while not done:
            _, done = dl.next_chunk()
    print("Done.")


if __name__ == '__main__':
    main()