Search code examples
pythonpdfamazon-s3aws-lambdapymupdf

Saving a pymupdf fitz object to s3 as a pdf


I am trying to crop a pdf and save it to s3 with same name using lambda. I am getting error on the data type being a fitz.fitz.page

import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO

OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]


def lambda_handler(event, context):

    textract = boto3.client("textract")
    if event:
        file_obj = event["Records"][0]
        bucketname = str(file_obj["s3"]["bucket"]["name"])
        filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
        
        doc = fitz.open()
        s3 = boto3.resource('s3')
        obj = s3.Object(bucketname, filename)
        fs = obj.get()['Body'].read()
        pdf=fitz.open("pdf", stream=BytesIO(fs))
        #pdf.close()
        
        
        rect=fitz.Rect(0.0, 0.0, 595.0, 842.0)
        #page = pdf[0]
        page1 = doc.new_page(width = rect.width,  # new page with ...
                       height = rect.height)
        page1.show_pdf_page(rect, pdf, 0)  
        print(type(doc))
        print(type(page1))
        
        s3.Bucket(bucketname).put_object(Key=filename, Body=page1)


Solution

  • This is happening because the page1 object is defined using fitz.fitz.page and the type expected by S3 put object is bytes.

    In order to solve the issue, you can use the write function of the new PDF (doc) and get the output of it which is in bytes format that you could pass to S3 then.

    # Save fil first.
    new_bytes = doc.write()
    s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)