Search code examples
python python-docx

Copy a DOCX table cell with styling and images to a new document


I am having trouble copying a table cell from a DOCX file to a new, empty document with all relevant styles and images.

The styling should include bullet points and numbered lists, too. The autoformatted arrows (==> or -->) also seem to be missing entirely.

This is my starting point:

from docx import Document

def copy_cell_content(cell, outfile):
    """
    Copy a formatted table cell into a new DOCX document and save it.

    Every paragraph of the cell is recreated run by run in a fresh document.
    Paragraph alignment, paragraph style, character style and the basic font
    attributes of each run are carried over. List numbering and inline images
    are not copied (see the TODO notes at the end).

    Parameters
    ----------
    * cell : docx.table._Cell
    * outfile : str output file path

    Returns
    -------
    None.
    """
    ## Create a new document for output
    output_doc = Document()

    ## Iterate over paragraphs of the cell
    for paragraph in cell.paragraphs:
        ## Create a new paragraph
        new_paragraph = output_doc.add_paragraph()

        ## Copy paragraph alignment
        new_paragraph.alignment = paragraph.alignment

        ## Copy paragraph style by name. The style object itself belongs to
        ## the source document; a name the new document does not define is
        ## skipped so the paragraph keeps its default style.
        try:
            new_paragraph.style = paragraph.style.name
        except KeyError:
            pass

        ## Iterate over runs of the paragraph
        for run in paragraph.runs:
            ## Copy text
            new_run = new_paragraph.add_run(run.text)

            ## Copy character style by name. Assigning to `style.name` would
            ## rename the style in the output document instead of applying it.
            try:
                new_run.style = run.style.name
            except KeyError:
                pass

            ## Copy formatting
            new_run.bold = run.bold
            new_run.italic = run.italic
            new_run.underline = run.underline
            new_run.font.size = run.font.size
            new_run.font.name = run.font.name
            new_run.font.underline = run.font.underline
            new_run.font.subscript = run.font.subscript
            new_run.font.superscript = run.font.superscript
            new_run.font.color.rgb = run.font.color.rgb
            new_run.font.small_caps = run.font.small_caps

            ## TODO: copy list styles
            ## TODO: copy inline images

    ## Write the assembled document to the requested path
    output_doc.save(outfile)

Solution

  • Unfortunately, I have not fully achieved what I hoped for, but I have made some progress. Best results are achieved with a "raw" copy of entire paragraphs. Images are still not copied, though, only a broken placeholder. Styling is incomplete, but good enough.

    This raw copying works fine when the target is a document. However, when a table cell is copied to a new table, this results in a broken document. For this, I must resort to a "by run" approach, which is less complete.

    def _copy_cell_raw(source_cell, target, skiplines=0, debug=False):
        """
        Copy cell content with formatting to a target cell or document.

        Paragraphs are copied at the XML level: a deep copy of each source
        paragraph element is inserted into the target, so the source cell
        keeps its own paragraphs. Tables nested in the cell are rebuilt cell
        by cell through `_copy_cell_byruns`.

        --> https://github.com/python-openxml/python-docx/issues/182
        --> https://github.com/python-openxml/python-docx/issues/156
        --> https://github.com/python-openxml/python-docx/issues/270
        --> https://stackoverflow.com/questions/48713465/python-docx-copy-table
        --> https://stackoverflow.com/questions/24965042/python-docx-insertion-point

        Parameters
        ----------
        * source_cell : docx.table._Cell
        * target : docx.table._Cell *or* docx.document.Document
        * skiplines : int initial paragraphs to skip
        * debug : bool print progress information while copying

        Returns
        -------
        None.
        """
        ## Local import: only this function needs the element deep copy
        import copy

        ## TODO: Fix indentation level formatting of bullet points and numbered lists
        ## TODO: Fix raw table copying
        ## TODO: Fix broken images

        ## Pick the XML container that receives the paragraphs once, up front
        if isinstance(target, docx.document.Document):
            container = target._body._body
        else:
            container = target._element

        paragraphs = source_cell.paragraphs
        total = len(paragraphs)

        ## Iterate over paragraphs of the cell
        for i, paragraph in enumerate(paragraphs[skiplines:]):
            if debug:
                print(f'>> {i:3d}/{total} : {paragraph.style.name}/{paragraph.alignment}')

            ## Insert a *copy* of the paragraph element; inserting the original
            ## element would re-parent it and strip it from the source cell.
            ## TODO: remove leading newline
            new_paragraph = container._insert_p(copy.deepcopy(paragraph._p))

            ## Re-apply the style name on numbered/bulleted paragraphs. The
            ## check is read-only so no properties element is added to the source.
            ## NOTE(review): `_insert_p` appears to return the low-level element,
            ## not a Paragraph proxy -- confirm a style *name* is valid here.
            pPr = paragraph._p.pPr
            if pPr is not None and pPr.numPr is not None:
                new_paragraph.style = paragraph.style.name

        ## Iterate over tables of the cell
        for i, table in enumerate(source_cell.tables):
            if debug:
                print(f'\n>> {i:3d} : {table}')

            ## A low-level deep copy of `table._tbl` was tried first but gives
            ## corrupted data when the table is embedded in a table, so the
            ## table is rebuilt cell by cell instead.
            rows = len(table.rows)
            cols = len(table.columns)

            ## Create a new table in the target (cell or document)
            new_table = target.add_table(rows, cols)
            for row in range(rows):
                for col in range(cols):
                    _copy_cell_byruns(table.cell(row, col), new_table.cell(row, col))
        if debug:
            print("<< Cell copied!\n")
    
    
    def _copy_cell_byruns(source_cell, target, skiplines=0, debug=False):
        """
        Copy cell content run by run, with formatting, to a target cell or document.

        Each source paragraph is recreated in the target by adding one new run
        per source run and copying its character formatting, followed by the
        paragraph's alignment and style. Tables nested in the cell are
        recreated with the same layout and filled recursively.

        --> https://groups.google.com/g/python-docx/c/yEuOJRB1dpY

        Parameters
        ----------
        * source_cell : docx.table._Cell
        * target : docx.table._Cell *or* docx.document.Document
        * skiplines : int initial paragraphs to skip
        * debug : bool print progress information while copying

        Returns
        -------
        None.
        """
        ## TODO: Handle list styles
        ## TODO: Handle images

        ## For a cell target the first copied paragraph goes into the cell's
        ## last existing paragraph instead of a newly added one
        target_is_cell = isinstance(target, docx.table._Cell)

        ## Iterate over paragraphs of the cell
        for i, paragraph in enumerate(source_cell.paragraphs[skiplines:]):
            if debug:
                print(f'\n>> {i:3d} : {paragraph.style.name}/{paragraph.alignment}')

            ## Pick or create the paragraph in the target (cell or document)
            if i == 0 and target_is_cell:
                new_paragraph = target.paragraphs[-1]
            else:
                new_paragraph = target.add_paragraph()

            ## Iterate over runs of the source paragraph
            for run in paragraph.runs:
                ## Copy text
                new_run = new_paragraph.add_run(run.text)

                ## Copy character and font styles
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                new_run.font.size = run.font.size
                new_run.font.name = run.font.name
                new_run.font.underline = run.font.underline
                new_run.font.subscript = run.font.subscript
                new_run.font.superscript = run.font.superscript
                new_run.font.color.rgb = run.font.color.rgb
                new_run.font.small_caps = run.font.small_caps

                ## Best effort: a character style the target does not define
                ## is skipped and the run keeps its default style
                try:
                    new_run.style = run.style.name
                except KeyError:
                    pass

                if debug:
                    print(f' > {i:3d} : {run.style.name}')
                    print(f' <       {new_run.style.name}')

            ## Copy paragraph alignment & style (style is best effort, as above)
            new_paragraph.alignment = paragraph.alignment
            try:
                new_paragraph.style = paragraph.style.name
            except KeyError:
                pass
            if debug:
                print(f'<<       {new_paragraph.style.name}/{new_paragraph.alignment}')

        ## Iterate over tables of the cell
        for i, table in enumerate(source_cell.tables):
            if debug:
                print(f'\n>> {i:3d} : {table}')

            ## Get the table layout
            rows = len(table.rows)
            cols = len(table.columns)

            ## Create a new table in the target (cell or document) and fill
            ## it cell by cell
            new_table = target.add_table(rows, cols)
            for row in range(rows):
                for col in range(cols):
                    _copy_cell_byruns(table.cell(row, col), new_table.cell(row, col))
        if debug:
            print("<< Cell copied!\n")