I am having trouble copying a table cell from a DOCX file to a new, empty document together with all relevant styles and images.
The styling should include bullet points and numbered lists, too. In addition, the autoformatted arrows (==> or -->) seem to be missing entirely.
This is my starting point:
from docx import Document
def copy_cell_content(cell, outfile):
    """
    Copy the formatted text of a table cell into a new DOCX file.

    A fresh document is created, every paragraph of ``cell`` is rebuilt in
    it run by run (text, character style and font settings), and the
    result is saved to ``outfile``.

    Parameters
    ----------
    * cell : docx.table._Cell
    * outfile : str output file path

    Returns
    -------
    None.
    """
    ## Font properties copied one-to-one from the source run. The order of
    ## 'subscript' before 'superscript' is kept from the original code.
    ## 'bold', 'italic' and 'underline' on a run are the same settings as
    ## on its font, so they are copied only once here.
    font_attributes = (
        'bold',
        'italic',
        'underline',
        'size',
        'name',
        'subscript',
        'superscript',
        'small_caps',
    )
    ## Create a new document for output
    output_doc = Document()
    ## Iterate over paragraphs of the cell
    for paragraph in cell.paragraphs:
        ## Create a new paragraph and copy its alignment
        ## (the paragraph style is deliberately not copied, as before)
        new_paragraph = output_doc.add_paragraph()
        new_paragraph.alignment = paragraph.alignment
        ## Iterate over runs of the paragraph
        for run in paragraph.runs:
            ## Copy text
            new_run = new_paragraph.add_run(run.text)
            ## Assign the character style by name. Writing to
            ## ``new_run.style.name`` would rename the style object the new
            ## run currently resolves to instead of switching styles.
            try:
                new_run.style = run.style.name
            except KeyError:
                ## The output document has no style of that name;
                ## keep the run's default style (best effort).
                pass
            ## Copy formatting
            for attribute in font_attributes:
                setattr(new_run.font, attribute, getattr(run.font, attribute))
            new_run.font.color.rgb = run.font.color.rgb
    ## TODO: copy list styles
    ## TODO: copy inline images
    ## Write the result; without this the ``outfile`` argument was unused
    output_doc.save(outfile)
Unfortunately, I have not fully achieved what I hoped for, but I have made some progress. Best results are achieved with a "raw" copy of entire paragraphs. Images are still not copied, though, only a broken placeholder. Styling is incomplete, but good enough.
This raw copying works fine when the target is a document. However, when a table cell is copied into a new table, it results in a broken document. For that case, I must resort to a "by run" approach, which is less complete.
def _copy_cell_raw(source_cell, target, skiplines=0, debug=False):
    """
    Copy cell content with formatting to a target cell or document.

    Paragraphs are copied at XML level: each ``w:p`` element of the source
    cell is deep-copied and inserted into the target. Tables nested in the
    source cell are rebuilt with ``target.add_table`` and filled cell by
    cell through ``_copy_cell_byruns``. All paragraphs are copied first,
    then all tables.

    --> https://github.com/python-openxml/python-docx/issues/182
    --> https://github.com/python-openxml/python-docx/issues/156
    --> https://github.com/python-openxml/python-docx/issues/270
    --> https://stackoverflow.com/questions/48713465/python-docx-copy-table
    --> https://stackoverflow.com/questions/24965042/python-docx-insertion-point

    Parameters
    ----------
    * source_cell : docx.table._Cell
    * target : docx.table._Cell *or* docx.document.Document
    * skiplines : int initial paragraphs to skip
    * debug : bool print progress information

    Returns
    -------
    None.
    """
    ## Local imports so the function does not depend on a module-level
    ## ``import docx`` / ``import copy`` being present
    import copy

    from docx.document import Document as _DocumentClass

    ## TODO: Fix indentation level formatting of bullet points and numbered lists
    ## TODO: Fix raw table copying
    ## TODO: Fix broken images

    ## Low-level XML container that receives the copied paragraphs
    if isinstance(target, _DocumentClass):
        container = target._body._body
    else:
        container = target._element

    ## Iterate over paragraphs of the cell
    source_paragraphs = source_cell.paragraphs
    for i, paragraph in enumerate(source_paragraphs[skiplines:]):
        if debug:
            print(f'>> {i:3d}/{len(source_paragraphs)} : {paragraph.style.name}/{paragraph.alignment}')
        ## Copy paragraph from source to target (cell or document).
        ## An lxml element has exactly one parent, so inserting the source
        ## element itself would move it out of the source document;
        ## insert a deep copy instead.
        ## TODO: remove leading newline
        new_p = container._insert_p(copy.deepcopy(paragraph._p))
        ## Numbered / bulleted paragraph: set the style reference explicitly.
        ## ``new_p`` is the raw XML element, whose ``style`` holds the value
        ## of ``w:pStyle`` (a style id), so pass the id and not the name.
        ## The check reads the copy, so the source gets no empty ``w:pPr``.
        pPr = new_p.pPr
        if pPr is not None and pPr.numPr is not None:
            new_p.style = paragraph.style.style_id

    ## Iterate over tables of the cell
    for i, table in enumerate(source_cell.tables):
        if debug:
            print(f'\n>> {i:3d} : {table}')
        ## 1) Copying the table low-level (deep copy of ``table._tbl``)
        ##    results in corrupted data when the table is embedded in a
        ##    table, so it is not used.
        ## 2) Copy table by runs
        ## Get the table layout
        rows = len(table.rows)
        cols = len(table.columns)
        ## Create a new table in the target (cell or document)
        new_table = target.add_table(rows, cols)
        for row in range(rows):
            for col in range(cols):
                _copy_cell_byruns(table.cell(row, col), new_table.cell(row, col))
    if debug:
        print("<< Cell copied!\n")
def _copy_cell_byruns(source_cell, target, skiplines=0, debug=False):
    """
    Copy cell content with formatting to a target cell or document.

    Every paragraph of the source cell is rebuilt in the target run by run:
    text, font settings, character style, then paragraph alignment and
    paragraph style. Styles missing in the target are skipped. Tables
    nested in the source cell are rebuilt after all paragraphs by calling
    this function again for each of their cells.

    --> https://groups.google.com/g/python-docx/c/yEuOJRB1dpY

    Parameters
    ----------
    * source_cell : docx.table._Cell
    * target : docx.table._Cell *or* docx.document.Document
    * skiplines : int initial paragraphs to skip
    * debug : bool print progress information

    Returns
    -------
    None.
    """
    ## Local import so the function does not depend on a module-level
    ## ``import docx`` being present
    from docx.table import _Cell

    ## TODO: Handle list styles
    ## TODO: Handle images

    ## Font properties copied one-to-one from the source run. The order of
    ## 'subscript' before 'superscript' is kept from the original code.
    ## 'bold', 'italic' and 'underline' on a run are the same settings as
    ## on its font, so they are copied only once here.
    font_attributes = (
        'bold',
        'italic',
        'underline',
        'size',
        'name',
        'subscript',
        'superscript',
        'small_caps',
    )
    target_is_cell = isinstance(target, _Cell)

    ## Iterate over paragraphs of the cell
    for i, paragraph in enumerate(source_cell.paragraphs[skiplines:]):
        if debug:
            print(f'\n>> {i:3d} : {paragraph.style.name}/{paragraph.alignment}')
        ## For a cell target, the first copied paragraph goes into the
        ## cell's last existing paragraph; otherwise add a new one
        if i == 0 and target_is_cell:
            new_paragraph = target.paragraphs[-1]
        else:
            new_paragraph = target.add_paragraph()
        ## Iterate over runs of the source paragraph
        for run in paragraph.runs:
            ## Copy text
            new_run = new_paragraph.add_run(run.text)
            ## Copy character and font styles
            for attribute in font_attributes:
                setattr(new_run.font, attribute, getattr(run.font, attribute))
            new_run.font.color.rgb = run.font.color.rgb
            try:
                new_run.style = run.style.name
            except KeyError:
                ## Style name unknown in the target: keep the default style
                pass
            if debug:
                print(f' > {i:3d} : {run.style.name}')
                print(f' < {new_run.style.name}')
        ## Copy paragraph alignment & style
        new_paragraph.alignment = paragraph.alignment
        try:
            new_paragraph.style = paragraph.style.name
        except KeyError:
            ## Style name unknown in the target: keep the default style
            pass
        if debug:
            print(f'<< {new_paragraph.style.name}/{new_paragraph.alignment}')

    ## Iterate over tables of the cell
    for i, table in enumerate(source_cell.tables):
        if debug:
            print(f'\n>> {i:3d} : {table}')
        ## Get the table layout
        rows = len(table.rows)
        cols = len(table.columns)
        ## Create a new table in the target (cell or document)
        new_table = target.add_table(rows, cols)
        for row in range(rows):
            for col in range(cols):
                ## Distinct local names: the ``source_cell`` parameter is
                ## no longer overwritten while its tables are processed
                _copy_cell_byruns(table.cell(row, col), new_table.cell(row, col))
    if debug:
        print("<< Cell copied!\n")