Search code examples
pythonpython-docx

Extract Acrobat Document Object from a table in .docx file in Python


I have a .docx file with a table in it. There are cells which contain an embedded Acrobat Document Object.

I'm using python-docx module to read and extract data from the .docx file, but it doesn't get those embedded documents (it returns an empty string when asked for the cell value).

As I don't see myself good enough to try to modify the module source code, I've thought of extracting the embedded documents from the .docx file itself (by [changing extension to .zip][1]), but the pdf file appears to be in a .bin format.

So I'm thinking about four possible solutions you could guys help me with:

  1. Get the python-docx module to recognise the Acrobat Document Object embedded.
  2. Get Python to convert the .bin files to pdf.
  3. Maybe recommend me another Python module which can extract tables (and obviously its data) from .docx files and supports Acrobat Document Objects as well.
  4. (the one I don't really fancy) I've seen some Visual Studio or macro code (not sure what it was, I'll paste it at the end) which is said to actually work well extracting pdf embedded files. The thing is I'm not really into Visual Studio stuff so I'd need help in executing a macro from Python (with plain language and having in mind I'm not a Visual Studio expert).

The method has to be done with Python (or another not-so-complex automated method) since this procedure will have to be done multiple times and manual extraction is completely discarded.

Thank you a lot in advance.

Visual Studio code I saw on the Internet (copy-pasted):

Sub ExtractEmbeddedDocs()
Dim MyObj As Object
Dim xlApp As Object
Dim xlWkb As Object

Dim myshape As Word.InlineShape
Dim myFormat As Word.WdSaveFormat
Dim WDDoc As Word.Document
'    Dim embedObj As OLEObject

Dim FileExtStr As String
Dim FileFormatNum As Long
Dim StrInFold As String, StrOutFold As String
Dim StrDocFile As String, Obj_App As Object, i As Long
Dim StrFile As String, StrFileList As String, StrMediaFile As String, j As Long
Dim outFileName As String
Dim SBar As Boolean
Dim exten As String
Dim embedCount As Integer, wordCount As Integer, excelCount As Integer, visioCount As Integer, pptCount As Integer
Dim pdfCount As Integer
Dim msg As String, temp As String
Dim ok As Boolean

Dim docs As Variant, doc As Variant    ', temp As Variant
Dim AcroApp As Acrobat.CAcroApp
Dim AcroPDDoc As Acrobat.CAcroPDDoc
Dim AcroAVDoc As Acrobat.CAcroAVDoc
Dim jso As Object


StrInFold = ActiveDocument.Path
If StrInFold = "" Then Exit Sub
' Store current Status Bar status, then switch on
SBar = Application.DisplayStatusBar
Application.DisplayStatusBar = True
StrOutFold = StrInFold & "\Embedded Files"

Application.ScreenUpdating = False

'    On Error GoTo error_handler

'Test for existing output folder, create if they don't already exist
If Dir(StrOutFold, vbDirectory) = "" Then MkDir StrOutFold

embedCount = ActiveDocument.InlineShapes.Count

' This opens the embedded documents, each in their own instance of the program
For Each myshape In ActiveDocument.InlineShapes
If (myshape.Type = wdInlineShapeEmbeddedOLEObject) Then
If (InStr(myshape.OLEFormat.ClassType, "Word") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded word doc
' Now I want to save it

Set MyObj = GetObject(, "Word.Application")
If MyObj Is Nothing Then
' Word is not running, create new instance
Set MyObj = CreateObject("Word.Application")
End If

Set WDDoc = MyObj.ActiveDocument


' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))
If Right(temp, 3) = "doc" Then
myFormat = wdFormatDocument
Else
Select Case Right(temp, 4)
Case "docx"
myFormat = wdFormatXMLDocument
Case "docm"
myFormat = wdFormatXMLDocumentMacroEnabled
Case Else
myFormat = wdFormatDocument
End Select
End If

outFileName = StrOutFold & "\" + temp '
WDDoc.SaveAs2 FileName:=outFileName, FileFormat:=myFormat   'CompatibilityMode:=12    '
WDDoc.Close savechanges:=False 'MyObj.Application.Documents.Item(1)

wordCount = wordCount + 1

Set WDDoc = Nothing
'                MyObj.Quit ' will quit the existing instance of Word, don't do!
Set MyObj = Nothing
End If

If (InStr(myshape.OLEFormat.ClassType, "Visio") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded Visio file
' Now I want to save it
Set MyObj = GetObject(, "Visio.Application")
If MyObj Is Nothing Then
' Visio is not running, create new instance
Set MyObj = CreateObject("Visio.Application")
End If

'Debug.Print MyObj.Application.Documents.Item(1) ' Prints the filename
' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))

outFileName = StrOutFold & "\" + temp '
MyObj.Application.Documents.Item(1).SaveAs FileName:=outFileName
MyObj.Application.Documents.Item(1).Close 'savechanges:=False

visioCount = visioCount + 1
MyObj.Quit
Set MyObj = Nothing
End If

If (InStr(myshape.OLEFormat.ClassType, "Excel") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded Excel
' Now I want to save it
Set xlApp = GetObject(, "Excel.Application")
If xlApp Is Nothing Then
' Excel is not running, create new instance
Set xlApp = CreateObject("Excel.Application")
End If
Set xlWkb = xlApp.Workbooks(1)

' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))

outFileName = StrOutFold & "\" + temp '

' find file format from extension
With xlApp.ActiveWorkbook
If Val(xlApp.Application.Version) < 12 Then
'You use Excel 97-2003
FileExtStr = ".xls": FileFormatNum = -4143
Else
'You use Excel 2007-2013
Select Case .FileFormat
Case 51: FileExtStr = ".xlsx": FileFormatNum = 51
Case 52:
If .HasVBProject Then
FileExtStr = ".xlsm": FileFormatNum = 52
Else
FileExtStr = ".xlsx": FileFormatNum = 51
End If
Case 56: FileExtStr = ".xls": FileFormatNum = 56
Case Else: FileExtStr = ".xlsb": FileFormatNum = 50
End Select
End If
End With

xlApp.Application.DisplayAlerts = False
xlApp.ActiveWorkbook.SaveAs FileName:=outFileName, FileFormat:=FileFormatNum 'xlWorkbookNormal -4143 xlWorkbookDefault 51
xlApp.ActiveWorkbook.Close savechanges:=False
Set xlWkb = Nothing
xlApp.Quit
Set xlApp = Nothing

excelCount = excelCount + 1
End If

If (InStr(myshape.OLEFormat.ClassType, "PowerPoint") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first powerpoint file
' Now I want to save it
Set MyObj = GetObject(, "PowerPoint.Application")
If MyObj Is Nothing Then
' Powerpoint is not running, create new instance
Set MyObj = CreateObject("PowerPoint.Application")
End If

' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))

outFileName = StrOutFold & "\" + temp '
myshape.OLEFormat.Object.SaveAs FileName:=outFileName
myshape.OLEFormat.Object.Close 'savechanges:=False

pptCount = pptCount + 1
MyObj.Quit
Set MyObj = Nothing
End If

If (InStr(myshape.OLEFormat.ClassType, "Acro") > 0) Then
myshape.OLEFormat.DoVerb (wdOLEVerbOpen) ' Open the first embedded pdf

myshape.OLEFormat.Activate  ' probably not needed

Set AcroAVDoc = CreateObject("AcroExch.AVDoc")

Set AcroApp = CreateObject("AcroExch.App")
'                If AcroApp Is Nothing Then
'                ' Acrobat is not running, create new instance
'                    Set AcroApp = CreateObject("AcroExch.App")
'                End If

Set AcroAVDoc = AcroApp.GetActiveDoc    ' get the logical doc
Set AcroPDDoc = AcroAVDoc.GetPDDoc      ' get the physical doc

'some code I found (KHK) for working with the javascript bridge, not needed here
'                Set jso = AcroPDDoc.GetJSObject     ' get the javascript bridge

'                docs = jso.app.activeDocs       ' get array of active docs
'
'                For Each doc In docs
'                    If doc.documentFileName = AcroPDDoc.GetFileName Then
'                        ' insert template document
'                    End If
'
'                Next

' older Word files had " on each end of the IconLabel of the embedded file, resulting in a path name with "" in it
' this was causing an error 4148 when the SaveAs line executed. Go figure :)
temp = Trim(Replace(Replace(myshape.OLEFormat.IconLabel, Chr(34), ""), Chr(34), ""))

outFileName = StrOutFold & "\" + temp '

If AcroPDDoc.Save(PDSaveFull, outFileName) = False Then
MsgBox "Cannot save document"
End If
AcroAVDoc.Close (1)
AcroPDDoc.Close

pdfCount = pdfCount + 1

AcroApp.Exit
Set AcroApp = Nothing
Set AcroAVDoc = Nothing
Set AcroPDDoc = Nothing
End If

End If
Next myshape

' Clear the Status Bar
Application.StatusBar = False
' Restore original Status Bar status
Application.DisplayStatusBar = SBar
Application.ScreenUpdating = True

temp = "Embedded file counts" & vbCrLf & "Total " & vbTab & vbTab & embedCount & vbCrLf & "Word Files " & vbTab & wordCount & vbCrLf & _
"Excel Files " & vbTab & vbTab & excelCount & vbCrLf & "Visio Files " & vbTab & vbTab & visioCount & vbCrLf & "PowerPoint Files " & vbTab & pptCount & vbCrLf
temp = temp & "PDF Files " & vbTab & vbTab & pdfCount & vbCrLf & "Unknown files" & vbTab & embedCount - (wordCount + excelCount + visioCount)
msg = temp

msg = msg & vbCrLf & vbCrLf & "You should have " & vbTab & (wordCount + excelCount + visioCount + pptCount + pdfCount) & " files"

MsgBox msg, vbInformation + vbOKOnly

Exit Sub

error_handler:

If Err.Number Then
MsgBox Err.Number & "  " & Err.Description, vbCritical + vbOKOnly
End If

If Err.Number = 1004 Then
If Err.Description = "No cells were found." Then
'            GoTo get_filename
ElseIf Err.Description = "You cannot save this workbook with the " & _
"same name as another open workbook or " & _
"add-in. Choose a different name, or " & _
"close the other workbook or add-in " & _
"before saving." Then
MsgBox "There is another file with the same name " & _
"already open.  Please chose a different name " & _
"for this file."
'            GoTo get_filename
End If
End If

Exit Sub

pdfError:
MsgBox "PDF files require Adobe Acrobat (not Reader) to work"
Resume
End Sub

  [1]: https://www.howtogeek.com/50628/easily-extract-images-text-and-embedded-files-from-an-office-2007-document/

Solution

  • If you are still looking for an answer, I found a python package called olefile. Try something like this:

    #%% Pull pdfs from word document
    
    import olefile
    from zipfile import ZipFile
    from glob import glob
    
    # How many PDF documents have we saved
    pdf_count = 0
    
    # Loop through all the .docx files in the current folder
    for filename in glob("*.docx"):
      try:
        # Try to open the document as ZIP file
        with ZipFile(filename, "r") as zip:
    
          # Find files in the word/embeddings folder of the ZIP file
          for entry in zip.infolist():
            if not entry.filename.startswith("word/embeddings/"):
              continue
    
            # Try to open the embedded OLE file
            with zip.open(entry.filename) as f:
              if not olefile.isOleFile(f):
                continue
    
              ole = olefile.OleFileIO(f)
    
              # CLSID for Adobe Acrobat Document
              if ole.root.clsid != "B801CA65-A1FC-11D0-85AD-444553540000":
                continue
    
              if not ole.exists("CONTENTS"):
                continue
    
              # Extract the PDF from the OLE file
              pdf_data = ole.openstream('CONTENTS').read()
    
              # Does the embedded file have a %PDF- header?
              if pdf_data[0:5] == b'%PDF-':
                pdf_count += 1
    
                pdf_filename = "Document %d.pdf" % pdf_count
    
                # Save the PDF
                with open(pdf_filename, "wb") as output_file:
                  output_file.write(pdf_data)
    
      except:
        print("Unable to open '%s'" % filename)
    
    print("Extracted %d PDF documents" % pdf_count)