I am trying to download multiple PDF files from Azure and combine them (using the PyPDF2 library) into one PDF for re-upload to Azure.
I am currently getting the error `PyPDF2.utils.PdfReadError: Unsupported PNG filter 4` on the line `pdf = PyPDF2.PdfFileReader(output)`.
# Download every blob named in filename_lst, merge them into a single PDF
# in memory, and upload the result under the consolidated blob name.
#
# NOTE(review): "Unsupported PNG filter 4" is raised from inside PyPDF2 while
# it parses an input file; nothing in this block can work around it.
# Presumably a newer PyPDF2 release is required - confirm against its changelog.
consolidated_pdf = review_level_str.title() + '.pdf'
merger = PyPDF2.PdfFileMerger()
for each_file in filename_lst:
    blob_client = blob_service.get_blob_client(container=f'{flask_env}-downloads', blob=each_file)
    blob_object = blob_client.download_blob()
    bytes_file = blob_object.readall()
    # Seed the buffer with the downloaded bytes; the merger reads from this
    # stream later, so each input needs its own BytesIO that stays alive.
    pdf = PyPDF2.PdfFileReader(io.BytesIO(bytes_file))
    merger.append(pdf)

# Serialize the merged document. The original uploaded `pdf.getvalue()`, but
# `pdf` is only the reader for the last input and has no getvalue() method.
output = io.BytesIO()
merger.write(output)
merger.close()

blob_client_pdf = blob_service.get_blob_client(container=f'{flask_env}-downloads', blob=consolidated_pdf)
blob_client_pdf.upload_blob(output.getvalue())
Try this:
from azure.storage.blob import ContainerClient
from PyPDF2 import PdfFileMerger
import shutil,os
# Download each blob in pdf_list to a scratch directory, merge the files with
# PyPDF2, upload the result as 'merged.pdf', and remove the scratch directory.
pdf_list = ['test1.pdf', 'test2.pdf']
container = 'pdf'
storage_conn_str = ''
tempPath = 'd:/home/temp2/'

# Keep the client under its own name: rebinding `ContainerClient` would shadow
# the imported class for the rest of the module. It is created before the
# scratch directory so a bad connection string leaves nothing behind on disk.
container_client = ContainerClient.from_connection_string(storage_conn_str, container)

# os.mkdir raises if the directory already exists, which guarantees that the
# rmtree below only ever deletes a directory this script created.
os.mkdir(tempPath)
mergedObject = PdfFileMerger()
try:
    for pdf in pdf_list:
        localPdfPath = os.path.join(tempPath, pdf)
        with open(localPdfPath, "wb") as download_file:
            download_file.write(container_client.download_blob(pdf).readall())
        mergedObject.append(localPdfPath)

    mergedPDFPath = os.path.join(tempPath, 'merged.pdf')
    mergedObject.write(mergedPDFPath)

    with open(mergedPDFPath, "rb") as stream:
        container_client.upload_blob('merged.pdf', stream, overwrite=True)
finally:
    # Release the merger's handles on the input files first, then remove all
    # temp files - this now also runs when a download, merge or upload fails.
    mergedObject.close()
    shutil.rmtree(tempPath)
After it runs, check merged.pdf in the container.
Let me know if you have any more questions.