I'm using python-docx to create a document with a table I want to populate from textual data. My text looks like this:
01:02:10.3
a: Lorem ipsum dolor sit amet,
b: consectetur adipiscing elit.
a: Mauris a turpis erat.
01:02:20.4
a: Vivamus dignissim aliquam
b: Nam ultricies
(etc.)
I need to organize it in a table like this (using ASCII for visualization):
+---+--------------------+---------------------------------+
| | A | B |
+---+--------------------+---------------------------------+
| 1 | 01:02:10.3 | a: Lorem ipsum dolor sit amet, |
| 2 | | b: consectetur adipiscing elit. |
| 3 | | a: Mauris a turpis erat. |
| 4 | ------------------ | ------------------------------- |
| 5 | 01:02:20.4 | a: Vivamus dignissim aliquam |
| 6 | | b: Nam ultricies |
+---+--------------------+---------------------------------+
However, I need everything after "a: " to be bold and everything after "b: " to stay regular, even though both kinds of line occupy the same cell. It's pretty easy to iterate over the lines and organize them the way I want, but I'm really unsure how to make only some of the lines within a cell bold:
IS_BOLD = {
'a': True
'b': False
}
row_cells = table.add_row().cells
for line in lines:
if is_timestamp(line): # function that uses regex to discern between columns
if row_cells[1]:
row_cells = table.add_row().cells
row_cells[0].text = line
else
row_cells[1].text += line
if IS_BOLD[ line.split(":")[0] ]:
# make only this line within the cell bold, somehow.
(This is sort of pseudo-code; I'm doing some more textual processing, but that's irrelevant here.) I found one probably relevant question where someone uses something called a run, but I'm finding it hard to understand how to apply it to my case.
Any help? Thanks.
You need to add a run to the cell's paragraph. Each run carries its own formatting, so this lets you control exactly which text is bold.
Full example:
from docx import Document
from docx.shared import Inches
import os
import re
def is_timestamp(line):
    """Return True when ``line`` begins with an ``HH:MM:SS``-shaped prefix.

    Only the start of the line is examined, so anything may follow the
    seconds field (for example the ``.3`` in ``01:02:10.3``).

    NOTE: this is a deliberately loose check; swap in a stricter parser if
    your input needs one.
    """
    found = re.match(r'^\d{2}:\d{2}:\d{2}', line)
    # A match object is always truthy, so bool() maps match/no-match to True/False.
    return bool(found)
def parse_raw_script(raw_script):
    """Group the lines of a raw script under the timestamp that precedes them.

    Each line is stripped of surrounding whitespace. Blank lines are skipped
    so they cannot show up as empty entries inside a group's content. Lines
    that appear before the first timestamp belong to no group and are
    dropped.

    Args:
        raw_script: The whole script as a single string.

    Yields:
        One dict per timestamp, in input order, with the keys
        ``'timestamp'`` (the timestamp line) and ``'content'`` (the lines
        that followed it, joined with ``'\\n'``; empty if none followed).
    """
    current_timestamp = ''
    content_lines = []
    for line in raw_script.splitlines():
        line = line.strip()
        if not line:
            # Separator/blank lines carry no text; keeping them would later
            # produce empty runs in the generated document.
            continue
        if is_timestamp(line):
            if current_timestamp:
                yield {
                    'timestamp': current_timestamp,
                    'content': '\n'.join(content_lines),
                }
            current_timestamp = line
            # Also discards any lines collected before the first timestamp.
            content_lines = []
            continue
        content_lines.append(line)
    # Flush the final group, which has no following timestamp to trigger it.
    if current_timestamp:
        yield {
            'timestamp': current_timestamp,
            'content': '\n'.join(content_lines),
        }
def should_bold(line):
    """Tell whether ``line`` should be rendered in bold.

    A line qualifies when it opens with the literal prefix ``a:``.
    This is a placeholder rule; replace it with your own logic.
    """
    prefix = 'a:'
    return line[:len(prefix)] == prefix
def load_raw_script():
    """Return the sample script text used by this example.

    The text is the example from the question, hard-coded here; a real
    program would presumably read it from a file instead.
    """
    sample_lines = (
        '01:02:10.3',
        'a: Lorem ipsum dolor sit amet,',
        'b: consectetur adipiscing elit.',
        'a: Mauris a turpis erat.',
        '01:02:20.4',
        'a: Vivamus dignissim aliquam',
        'b: Nam ultricies',
    )
    return '\n'.join(sample_lines)
def convert_raw_script_to_docx(raw_script, output_file_path):
    """Render a raw script as a three-column table and save it as a .docx file.

    The table gets a header row ('', 'A', 'B') followed by one row per
    timestamp group produced by ``parse_raw_script``. The timestamp goes in
    column A; the group's lines go in column B as separate runs of a single
    paragraph, so each line can be bold or regular independently (decided by
    ``should_bold``). The first column is left empty.

    Args:
        raw_script: The whole script as a single string.
        output_file_path: Where the generated document is saved.
    """
    document = Document()
    table = document.add_table(rows=1, cols=3, style="Table Grid")

    # add header row (fetch the cell list once instead of on every access)
    header_cells = table.rows[0].cells
    header_cells[0].text = ''
    header_cells[1].text = 'A'
    header_cells[2].text = 'B'

    # create a row for each timestamp group
    for script_row in parse_raw_script(raw_script):
        cells = table.add_row().cells
        cells[1].text = script_row['timestamp']

        # use the cell's default paragraph instead of creating a new one
        content_paragraph = cells[2].paragraphs[0]
        content_lines = script_row['content'].splitlines()
        last_index = len(content_lines) - 1
        for index, line in enumerate(content_lines):
            # one run per line: formatting such as bold is set per run
            run = content_paragraph.add_run(line)
            if should_bold(line):
                run.bold = True
            # separate lines with a break, but do not leave a trailing
            # empty line after the last one
            if index < last_index:
                run.add_break()

    # resize table columns (optional); the width is set on every cell
    column_widths = (Inches(0.2), Inches(1.9), Inches(3.9))
    for row in table.rows:
        for cell, width in zip(row.cells, column_widths):
            cell.width = width

    document.save(output_file_path)
def main():
    """Build the sample document at ``dist/so-template.docx`` beside this script.

    The ``dist`` directory is created if it does not exist yet.
    """
    script_dir = os.path.dirname(__file__)
    dist_dir = os.path.join(script_dir, 'dist')
    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(dist_dir, exist_ok=True)
    output_file_path = os.path.join(dist_dir, 'so-template.docx')
    raw_script = load_raw_script()
    convert_raw_script_to_docx(raw_script, output_file_path)


if __name__ == '__main__':
    main()
Result (the file should be in ./dist/so-template.docx):
BTW - if you prefer sticking with your own example, this is what needs to be changed:
# Maps a speaker prefix (the text before the first ":") to whether that
# speaker's lines are rendered in bold.
IS_BOLD = {
    'a': True,
    'b': False,
}

row_cells = table.add_row().cells
for line in lines:
    if is_timestamp(line):
        # Start a new row only once the current row already holds content.
        # Test the cell's text: the cell object itself presumably never
        # evaluates as false, which would add a row for every timestamp and
        # leave the first row empty.
        if row_cells[1].text:
            row_cells = table.add_row().cells
        row_cells[0].text = line
    else:
        paragraph = row_cells[1].paragraphs[0]
        # Put the line break between lines rather than after each one, so
        # the cell does not end with an empty line.
        if paragraph.runs:
            paragraph.runs[-1].add_break()
        # One run per line: bold is a per-run setting.
        run = paragraph.add_run(line)
        # Unknown prefixes fall back to regular text instead of raising KeyError.
        if IS_BOLD.get(line.split(":")[0], False):
            run.bold = True