I have a .docx file with a table in it. There are cells which contain an embedded Acrobat Document Object.
I'm using python-docx
module to read and extract data from the .docx file, but it doesn't get those embedded documents (it returns an empty string when asked for the cell value).
As I don't see myself good enough to try to modify the module source code, I've thought of extracting the embedded documents from the .docx file itself (by [changing extension to .zip][1]), but the pdf file appears to be in a .bin format.
So I'm thinking about four possible solutions you could guys help me with:
python-docx
module to recognise the Acrobat Document Object embedded.The method has to be done with Python (or another not-so-complex automated method) since this procedure will have to be done multiple times and manual extraction is completely discarded.
Thank you a lot in advance.
Visual Studio code I saw on the Internet (copy-pasted):
Sub ExtractEmbeddedDocs()
Dim MyObj As Object
Dim xlApp As Object
Dim xlWkb As Object
Dim myshape As Word.InlineShape
Dim myFormat As Word.WdSaveFormat
Dim WDDoc As Word.Document
' Dim embedObj As OLEObject
Dim FileExtStr As String
Dim FileFormatNum As Long
Dim StrInFold As String, StrOutFold As String
Dim StrDocFile As String, Obj_App As Object, i As Long
Dim StrFile As String, StrFileList As String, StrMediaFile As String, j As Long
Dim outFileName As String
Dim SBar As Boolean
Dim exten As String
Dim embedCount As Integer, wordCount As Integer, excelCount As Integer, visioCount As Integer, pptCount As Integer
Dim pdfCount As Integer
Dim msg As String, temp As String
Dim ok As Boolean
Dim docs As Variant, doc As Variant ', temp As Variant
Dim AcroApp As Acrobat.CAcroApp
Dim AcroPDDoc As Acrobat.CAcroPDDoc
Dim AcroAVDoc As Acrobat.CAcroAVDoc
Dim jso As Object
StrInFold = ActiveDocument.Path
If StrInFold = "" Then Exit Sub
' Store current Status Bar status, then switch on
SBar = Application.DisplayStatusBar
Application.DisplayStatusBar = True
StrOutFold = StrInFold & "\Embedded Files"
Application.ScreenUpdating = False
' On Error GoTo error_handler
'Test for existing output folder, create if they don't already exist
If Dir(StrOutFold, vbDirectory) = "" Then MkDir StrOutFold
embedCount = ActiveDocument.InlineShapes.Count
' This opens the embedded documents, each in their own instance of the program
For Each myshape In ActiveDocument.InlineShapes
If (myshape.Type = wdInlineShapeEmbeddedOLEObject) Then
If (InStr(myshape.OLEFormat.ClassType, "Word") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded word doc
' Now I want to save it
Set MyObj = GetObject(, "Word.Application")
If MyObj Is Nothing Then
' Word is not running, create new instance
Set MyObj = CreateObject("Word.Application")
End If
Set WDDoc = MyObj.ActiveDocument
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
If Right(temp, 3) = "doc" Then
myFormat = wdFormatDocument
Else
Select Case Right(temp, 4)
Case "docx"
myFormat = wdFormatXMLDocument
Case "docm"
myFormat = wdFormatXMLDocumentMacroEnabled
Case Else
myFormat = wdFormatDocument
End Select
End If
outFileName = StrOutFold & "\" + temp '
WDDoc.SaveAs2 FileName:=outFileName, FileFormat:=myFormat 'CompatibilityMode:=12 '
WDDoc.Close savechanges:=False 'MyObj.Application.Documents.Item(1)
wordCount = wordCount + 1
Set WDDoc = Nothing
' MyObj.Quit ' will quit the existing instance of Word, don't do!
Set MyObj = Nothing
End If
If (InStr(myshape.OLEFormat.ClassType, "Visio") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded Visio file
' Now I want to save it
Set MyObj = GetObject(, "Visio.Application")
If MyObj Is Nothing Then
' Visio is not running, create new instance
Set MyObj = CreateObject("Visio.Application")
End If
'Debug.Print MyObj.Application.Documents.Item(1) ' Prints the filename
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
outFileName = StrOutFold & "\" + temp '
MyObj.Application.Documents.Item(1).SaveAs FileName:=outFileName
MyObj.Application.Documents.Item(1).Close 'savechanges:=False
visioCount = visioCount + 1
MyObj.Quit
Set MyObj = Nothing
End If
If (InStr(myshape.OLEFormat.ClassType, "Excel") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded Excel
' Now I want to save it
Set xlApp = GetObject(, "Excel.Application")
If xlApp Is Nothing Then
' Excel is not running, create new instance
Set xlApp = CreateObject("Excel.Application")
End If
Set xlWkb = xlApp.Workbooks(1)
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
outFileName = StrOutFold & "\" + temp '
' find file format from extension
With xlApp.ActiveWorkbook
If Val(xlApp.Application.Version) < 12 Then
'You use Excel 97-2003
FileExtStr = ".xls": FileFormatNum = -4143
Else
'You use Excel 2007-2013
Select Case .FileFormat
Case 51: FileExtStr = ".xlsx": FileFormatNum = 51
Case 52:
If .HasVBProject Then
FileExtStr = ".xlsm": FileFormatNum = 52
Else
FileExtStr = ".xlsx": FileFormatNum = 51
End If
Case 56: FileExtStr = ".xls": FileFormatNum = 56
Case Else: FileExtStr = ".xlsb": FileFormatNum = 50
End Select
End If
End With
xlApp.Application.DisplayAlerts = False
xlApp.ActiveWorkbook.SaveAs FileName:=outFileName, FileFormat:=FileFormatNum 'xlWorkbookNormal -4143 xlWorkbookDefault 51
xlApp.ActiveWorkbook.Close savechanges:=False
Set xlWkb = Nothing
xlApp.Quit
Set xlApp = Nothing
excelCount = excelCount + 1
End If
If (InStr(myshape.OLEFormat.ClassType, "PowerPoint") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first powerpoint file
' Now I want to save it
Set MyObj = GetObject(, "PowerPoint.Application")
If MyObj Is Nothing Then
' Powerpoint is not running, create new instance
Set MyObj = CreateObject("PowerPoint.Application")
End If
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
outFileName = StrOutFold & "\" + temp '
myshape.OLEFormat.Object.SaveAs FileName:=outFileName
myshape.OLEFormat.Object.Close 'savechanges:=False
pptCount = pptCount + 1
MyObj.Quit
Set MyObj = Nothing
End If
If (InStr(myshape.OLEFormat.ClassType, "Acro") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded pdf
myshape.OLEFormat.Activate ' probably not needed
Set AcroAVDoc = CreateObject("AcroExch.AVDoc")
Set AcroApp = CreateObject("AcroExch.App")
' If AcroApp Is Nothing Then
' ' Acrobat is not running, create new instance
' Set AcroApp = CreateObject("AcroExch.App")
' End If
Set AcroAVDoc = AcroApp.GetActiveDoc ' get the logical doc
Set AcroPDDoc = AcroAVDoc.GetPDDoc ' get the physical doc
'some code I found (KHK) for working with the javascript bridge, not needed here
' Set jso = AcroPDDoc.GetJSObject ' get the javascript bridge
' docs = jso.app.activeDocs ' get array of active docs
'
' For Each doc In docs
' If doc.documentFileName = AcroPDDoc.GetFileName Then
' ' insert template document
' End If
'
' Next
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
outFileName = StrOutFold & "\" + temp '
If AcroPDDoc.Save(PDSaveFull, outFileName) = False Then
MsgBox "Cannot save document"
End If
AcroAVDoc.Close (1)
AcroPDDoc.Close
pdfCount = pdfCount + 1
AcroApp.Exit
Set AcroApp = Nothing
Set AcroAVDoc = Nothing
Set AcroPDDoc = Nothing
End If
End If
Next myshape
' Clear the Status Bar
Application.StatusBar = False
' Restore original Status Bar status
Application.DisplayStatusBar = SBar
Application.ScreenUpdating = True
temp = "Embedded file counts" & vbCrLf & "Total " & vbTab & vbTab & embedCount & vbCrLf & "Word Files " & vbTab & wordCount & vbCrLf & _
"Excel Files " & vbTab & vbTab & excelCount & vbCrLf & "Visio Files " & vbTab & vbTab & visioCount & vbCrLf & "PowerPoint Files " & vbTab & pptCount & vbCrLf
temp = temp & "PDF Files " & vbTab & vbTab & pdfCount & vbCrLf & "Unknown files" & vbTab & embedCount - (wordCount + excelCount + visioCount)
msg = temp
msg = msg & vbCrLf & vbCrLf & "You should have " & vbTab & (wordCount + excelCount + visioCount + pptCount + pdfCount) & " files"
MsgBox msg, vbInformation + vbOKOnly
Exit Sub
error_handler:
If Err.Number Then
MsgBox Err.Number & " " & Err.Description, vbCritical + vbOKOnly
End If
If Err.Number = 1004 Then
If Err.Description = "No cells were found." Then
' GoTo get_filename
ElseIf Err.Description = "You cannot save this workbook with the " & _
"same name as another open workbook or " & _
"add-in. Choose a different name, or " & _
"close the other workbook or add-in " & _
"before saving." Then
MsgBox "There is another file with the same name " & _
"already open. Please chose a different name " & _
"for this file."
' GoTo get_filename
End If
End If
Exit Sub
pdfError:
MsgBox "PDF files require Adobe Acrobat (not Reader) to work"
Resume
End Sub
[1]: https://www.howtogeek.com/50628/easily-extract-images-text-and-embedded-files-from-an-office-2007-document/
If you are still looking for an answer, I found a python package called olefile. Try something like this:
#%% Pull pdfs from word document
import olefile
from zipfile import ZipFile
from glob import glob
# How many PDF documents have we saved
pdf_count = 0
# Loop through all the .docx files in the current folder
for filename in glob("*.docx"):
try:
# Try to open the document as ZIP file
with ZipFile(filename, "r") as zip:
# Find files in the word/embeddings folder of the ZIP file
for entry in zip.infolist():
if not entry.filename.startswith("word/embeddings/"):
continue
# Try to open the embedded OLE file
with zip.open(entry.filename) as f:
if not olefile.isOleFile(f):
continue
ole = olefile.OleFileIO(f)
# CLSID for Adobe Acrobat Document
if ole.root.clsid != "B801CA65-A1FC-11D0-85AD-444553540000":
continue
if not ole.exists("CONTENTS"):
continue
# Extract the PDF from the OLE file
pdf_data = ole.openstream('CONTENTS').read()
# Does the embedded file have a %PDF- header?
if pdf_data[0:5] == b'%PDF-':
pdf_count += 1
pdf_filename = "Document %d.pdf" % pdf_count
# Save the PDF
with open(pdf_filename, "wb") as output_file:
output_file.write(pdf_data)
except:
print("Unable to open '%s'" % filename)
print("Extracted %d PDF documents" % pdf_count)