I have some .docx files that are very specifically formatted.
I have copied the file 5 times, one copy for each of the 5 strings that I need to be "found", with everything else removed from that copy.
#! python 3
import docx
import os
import shutil
import readDocx as rD
def delete_paragraph(paragraph):
    """Remove *paragraph* from the document tree it is attached to.

    The paragraph's underlying XML element is detached from its parent
    element, and the references the paragraph object holds to that element
    are then cleared so the stale object cannot be used by accident.

    Args:
        paragraph: An object exposing the underlying XML element as
            ``_element`` (as python-docx paragraph objects do). The element
            must provide ``getparent()`` and its parent ``remove()``.

    Returns:
        None. Calling this again on an already-deleted paragraph is a no-op.
    """
    element = paragraph._element
    if element is None:
        # Already deleted by a previous call.
        return
    parent = element.getparent()
    if parent is not None:
        # Detaching from the parent is what actually removes the paragraph;
        # clearing attributes alone leaves it in the document.
        parent.remove(element)
    paragraph._p = paragraph._element = None
# Select the file you want to work with.
# NOTE(review): rD.file is presumably a path string - confirm in readDocx.
fP = rD.file

# Absolute directory that contains the selected file.
nfP = os.path.dirname(os.path.abspath(fP))

# File name only. os.path.basename honours the platform's path separators,
# whereas splitting on '/' alone returns the whole path for a
# backslash-separated Windows path.
fileCode = os.path.basename(fP)

# Separate the course code: the first space-delimited token of the file name.
nameSplit = fileCode.split(' ')
courseCode = nameSplit[0]
# Titles of the files that need to be created; one copy of the original
# document is made per title.
a1 = "Assessment Summary"
a2 = "Back to Business project"
a3 = "Back to Business Checklist"
a4 = "Skills Demonstration"
a5 = "Skills Demonstration Checklist"
names = [a1, a2, a3, a4, a5]

# Paths of the copies, in the same order as `names`.
newFiles = []

# Create the copies next to the original file.
for name in names:
    # Pass the directory and file name as separate arguments so that
    # os.path.join inserts the correct separator for the current platform
    # instead of a hard-coded backslash.
    fileName = os.path.join(
        nfP, '{} {} Version 1.0.docx'.format(courseCode, name)
    )
    shutil.copy(fP, fileName)
    # Record the copy so later steps can iterate over the new files.
    newFiles.append(fileName)
# Next step: iterate through the copied files and start deleting data.
# h1..h5 are the heading texts to look for; h3 and h5 hold the same text.
h1, h2, h3, h4, h5 = (
    "Learner Declaration",
    "Back to Business Project",
    "Assessor Observation Checklist / Marking Guide",
    "Skills Demonstration",
    "Assessor Observation Checklist / Marking Guide",
)
This is where my limited skill starts to fail me. The h1-h5 variables hold the headings of the document sections that I want to keep. How can I iterate through the document, find each heading, and delete everything before and after the section it starts? I don't necessarily need the full answer, just more of a "look in this direction".
Try this. I have explained in the comments what the code does.
from docx import Document #Package "Python-docx" needs to be installed to import this
import pandas as pd
# Read the document into a python-docx Document object.
document = Document('Path/to/your/input/.docx/document')

# Collect one record per paragraph (its text and its style name). The records
# are gathered in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
paragraph_rows = []
for para in document.paragraphs:
    # Start from the paragraph's own style name.
    style = str(para.style.name)

    # Headings are sometimes written as ordinary paragraphs that are entirely
    # bold. Concatenate the text of the bold runs; if it equals the whole
    # (non-empty) paragraph text, treat the paragraph as a heading too.
    runboldtext = ''.join(run.text for run in para.runs if run.bold)
    if runboldtext != '' and runboldtext == str(para.text):
        print("Bold True for:", runboldtext)
        style = 'Heading'

    paragraph_rows.append({'para_text': para.text, 'style': style})

# One row per paragraph, indexed 0..n-1 in document order.
document_text_dataframe = pd.DataFrame(
    paragraph_rows, columns=['para_text', 'style']
)
# Heading texts whose sections should be pulled out of the document.
h1 = "Learner Declaration"
h2 = "Back to Business Project"
h4 = "Skills Demonstration"
# h3 and h5 hold the same heading text.
h3 = h5 = "Assessor Observation Checklist / Marking Guide"

# Headings that are actually searched for.
# NOTE(review): h5 is defined but not included here - confirm whether that
# is intended.
h_list = [h1, h2, h3, h4]
# List of DataFrames: one per heading in `h_list` that was found in the
# document, holding the heading row and the rows that follow it up to (not
# including) the next row whose style contains 'Heading'.
extracted_content = []

# Assumes document_text_dataframe has a default 0..n-1 index, so index labels
# and row positions coincide.
row_count = len(document_text_dataframe)

for h in h_list:
    # Reset for every heading so that a heading which is not found can never
    # reuse the bounds of the previous one. None (rather than 0) marks "not
    # found" because row 0 is a perfectly valid match.
    start_index = None
    for index, row in document_text_dataframe.iterrows():
        if h == row['para_text']:
            print("Found match in document for: ", h)
            start_index = index
            print("Matching index=", index)
    # If the heading text occurs more than once, the last occurrence is used.

    if start_index is None:
        # Heading not present in the document: nothing to extract.
        continue

    # The section ends at the first heading after the match; if there is
    # none, it runs to the end of the document (last row included).
    end_index = row_count
    for i in range(start_index + 1, row_count):
        if 'Heading' in document_text_dataframe.loc[i, 'style']:
            end_index = i
            break

    # Slice once instead of appending row by row (DataFrame.append was
    # removed in pandas 2.0).
    df_temp = document_text_dataframe.iloc[start_index:end_index].copy()

    # Append every extracted section to the list "extracted_content".
    extracted_content.append(df_temp)
Now, using `extracted_content`, you can write every entry in the list
`extracted_content` to a separate .docx document using your own code.