Search code examples
python pdf pypdf

merging pdf files with pypdf


I am writing a script that parses a website (maya.tase.co.il) for links, downloads the PDF files, and merges them. It mostly works, but merging gives me different kinds of errors depending on the file, and I can't figure out why. I cut out the relevant code and built a test using only the two specific files that cause a problem. The script uses pyPdf, but I am willing to try anything that works. Some files are encrypted, some are not.

def is_incry(pdf):
    """Return a pyPdf reader for *pdf*, unlocked with an empty password if needed.

    Args:
        pdf: A file object opened in binary mode that contains a PDF document.
            It must stay open for as long as the returned reader is used.

    Returns:
        The ``PdfFileReader`` wrapping *pdf*. The result of ``decrypt("")`` is
        not checked, so a document protected by a non-empty password is
        returned as-is and the caller will see the failure on first page access.
    """
    from pyPdf import PdfFileReader
    reader = PdfFileReader(pdf)
    # Ask the reader directly instead of probing with getNumPages() inside a
    # bare except: that treated *any* failure (e.g. a malformed file) as
    # "encrypted" and then masked the real error with one raised by decrypt().
    if reader.isEncrypted:
        reader.decrypt("")
    return reader

def merg_pdf(to_keep,to_lose):
    """Append the pages of *to_lose* to *to_keep* and return the page total.

    If *to_keep* does not exist yet, *to_lose* is simply renamed to *to_keep*.
    Otherwise both files are merged into "document-output.pdf" in the current
    directory, the two inputs are deleted, and the merged file is renamed to
    *to_keep*. Either way *to_lose* no longer exists after a successful call.

    Args:
        to_keep: Path of the PDF that accumulates pages (may not exist yet).
        to_lose: Path of the PDF whose pages are appended; consumed by the call.

    Returns:
        The number of pages in *to_keep* after the call.
    """
    import os
    from pyPdf import PdfFileWriter

    if not os.path.exists(to_keep):
        # Nothing to merge with yet: the incoming file becomes the kept file.
        os.rename(to_lose, to_keep)
        with open(to_keep, "rb") as only_file:
            return is_incry(only_file).getNumPages()

    merged_name = "document-output.pdf"
    pdflen = 0
    # Both inputs must stay open until write() has finished, because the
    # writer pulls page data from the readers' streams at write time.
    # The with-blocks guarantee the handles are closed even if merging fails.
    with open(to_keep, "rb") as in1:
        with open(to_lose, "rb") as in2:
            readers = (is_incry(in1), is_incry(in2))
            output = PdfFileWriter()
            for reader in readers:
                page_count = reader.getNumPages()
                for i in range(page_count):
                    output.addPage(reader.getPage(i))
                pdflen += page_count
            with open(merged_name, "wb") as output_stream:
                output.write(output_stream)

    # Only replace the originals once the merged file is completely written
    # and every handle is closed (required for remove/rename on Windows).
    os.remove(to_lose)
    os.remove(to_keep)
    os.rename(merged_name, to_keep)
    return pdflen


def test():
    """Download the two PDFs that trigger the merge errors and merge them."""
    import urllib
    # (source URL, local file name) for each problematic document.
    downloads = (
        ('http://mayafiles.tase.co.il/RPdf/487001-488000/P487028-01.pdf', 'temp1.pdf'),
        ('http://mayafiles.tase.co.il/RPdf/488001-489000/P488170-00.pdf', 'temp2.pdf'),
    )
    for url, local_name in downloads:
        urllib.urlretrieve(url, local_name)
    merg_pdf('temp1.pdf', 'temp2.pdf')

# Run the two-file reproduction as soon as the script is executed.
test()

I thank anyone that even took the time to read this. Al.


Solution

  • I once wrote some complex PDF generation/merging code, which I have now open-sourced.

    You can have a look at it: https://github.com/becomingGuru/nikecup/blob/master/reg/models.py#L71

    def merge_pdf(self):
        """Merge the generated values page onto page one of the main PDF and save it.

        The main PDF path comes from ``file_names['main_pdf']`` and the page to
        merge is the first page of the file whose path ``self.make_pdf()``
        returns. Every page of the main PDF (the first one now carrying the
        merged content) is written to the path built from
        ``file_names['dl_done']``.

        Returns:
            The path of the PDF that was written.
        """
        from pyPdf import PdfFileReader,PdfFileWriter

        pdf_file = file_names['main_pdf']%settings.MEDIA_ROOT

        # PDFs are binary, so open them with 'rb'; text mode corrupts reads on
        # platforms that translate line endings. The inputs stay open until
        # write() completes because the writer reads page data from them then,
        # and the with-blocks close every handle even if a step raises.
        with open(pdf_file, 'rb') as main_file:
            pdf_obj = PdfFileReader(main_file)

            with open(self.make_pdf(), 'rb') as values_file:
                values_page = PdfFileReader(values_file).getPage(0)

                mergepage = pdf_obj.pages[0]
                mergepage.mergePage(values_page)

                signed_pdf = PdfFileWriter()
                for page in pdf_obj.pages:
                    signed_pdf.addPage(page)

                signed_pdf_name = file_names['dl_done']%(settings.MEDIA_ROOT,self.phash)
                with open(signed_pdf_name, mode='wb') as signed_pdf_file:
                    signed_pdf.write(signed_pdf_file)

        return signed_pdf_name
    

    It then works like a charm. Hope it helps.