Search code examples
python pdf python-docx

Get starting (and ending) page of an MS Word paragraph using python-docx


I'm automating the creation of an MS Word document. When it's complete, I need to be able to save it as a PDF and insert some other/external PDF pages into the PDF version of the Word document. To do this I was planning on leaving markers (e.g. "[pdfGoesHere]") in the Word document on their own pages.

To insert/replace the new PDF pages, I need to know what pages the markers are on. Does python-docx have a way to determine what page number a paragraph starts (and ends) on? I've read through the python-docx documentation and there doesn't seem to be anything for this. I know I can cycle through all of the paragraphs and find the paragraph(s) I'm interested in, but I can't find a deterministic way to get the paragraph's page number.

Is there a way to do this that I have overlooked? If not, are there other suggestions for how to accomplish the main goal of inserting PDF pages?


Solution

  • I appreciate the feedback given by @scanny. Since there isn't a way to do this in python-docx, and I'm turning the document into a PDF anyway, I decided to use pdfminer to get the page numbers after the Word document was converted into a PDF. This code (written for Python 2) may be long, but it gets the job done:

    import re
    from cStringIO import StringIO
    
    from pdfminer.converter import XMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    
    def xmlToLines(xml):
        """Concatenate the pieces of ``xml`` and split the result on newlines.

        ``xml`` may be a single string or any iterable of strings; the
        return value is the list of lines without their '\\n' terminators.
        """
        return ''.join(xml).split('\n')
    
    def convert_pdf_to_xml(path):
        """Convert the PDF at ``path`` to XML and return it as a list of lines.

        path -- full path of the PDF file to read

        Every page of the file is run through pdfminer's XMLConverter into an
        in-memory buffer; the buffer's contents are then split on newlines
        with xmlToLines().  A progress message is printed before conversion.

        The file handle, the converter and the buffer are released even if
        parsing raises, and the exception is propagated to the caller.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = XMLConverter(rsrcmgr, retstr, codec='utf-8',
                                  laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # open() instead of the Python 2-only file() builtin
                with open(path, 'rb') as fp:
                    # Single parenthesised argument: same output whether this
                    # is parsed as a print statement or a print() call.
                    print('Converting following file from PDF to XML: \n - ' + str(path))
                    # Empty page set / maxpages=0: presumably "all pages" --
                    # TODO confirm against the pdfminer version in use.
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
                # Read the buffer before closing the converter, preserving
                # the original order of operations (and thus its output).
                text = retstr.getvalue()
            finally:
                device.close()
        finally:
            # Closed last because the converter still holds a reference to it.
            retstr.close()

        return xmlToLines(text)
    
    def getPagesWithField(wordPdfPath, field):
        """Return the sorted, de-duplicated page numbers on which ``field`` occurs.

        wordPdfPath -- path of the PDF (the converted Word document) to search
        field       -- literal marker text to look for, e.g. "[pdfGoesHere]"

        The PDF is converted to XML lines with convert_pdf_to_xml() and those
        lines are scanned one rendered text line at a time; a page is reported
        when ``field`` is a substring of any text line on it.  Returns an
        empty list when the marker is not found.
        """
        return _pagesWithFieldInLines(convert_pdf_to_xml(wordPdfPath), field)

    def _pagesWithFieldInLines(lines, field):
        """Scan converter XML ``lines`` and return sorted page ids containing ``field``."""
        # Function-scope import so this block does not depend on file-level imports.
        from xml.sax.saxutils import unescape

        page_re = re.compile(r'page id="([0-9]+)"')
        content_re = re.compile(r'>([^\r\n]*)</text>')
        # unescape() already handles &amp;, &lt; and &gt;.  The extra entries
        # presumably cover the remaining escapes the converter emits --
        # TODO confirm against the pdfminer version in use.
        extra_entities = {'&quot;': '"', '&#x27;': "'", '&apos;': "'"}

        pages_found = set()
        page = None       # no <page> tag seen yet
        pieces = []       # decoded contents of the current text line
        for line in lines:
            page_match = page_re.search(line)
            if page_match:
                # Start of a new page: remember its id for later hits.
                page = int(page_match.group(1))
            elif line == '<text>':
                # A bare '<text>' line is the opening half of the newline
                # element ('<text>\n</text>' split on '\n'), i.e. the end of
                # a rendered text line.
                if page is not None and field in ''.join(pieces):
                    pages_found.add(page)
                pieces = []
            elif '<text font=' in line:
                content_match = content_re.search(line)
                if content_match:
                    # Keep the whole element content (not just its first
                    # character) and decode entities, so the marker is
                    # compared against the literal text.
                    pieces.append(unescape(content_match.group(1),
                                           extra_entities))

        return sorted(pages_found)
    

    After this, PyPDF2 can be used for simple PDF page insertion/merging.