Search code examples
vbapdftextacrobatpdftotext

PDF to plain text, Some difficult pages were encountered Adobe Acrobat XI


Basic Problem: For this PDF: https://1drv.ms/u/s!AsrLaUgt0KCLhXtP-jYDd4Z0ujKQ?e=xSu2ZR

I am unable to convert/Save manually as plain text using Adobe Acrobat XI standard or the batch conversion script (below). The generated file is blank.

Full problem: As part of my attempts to batch convert PDFs to text, I have run into a strange error where acrobat XI returns the following:

enter image description here

Disappointingly clicking ok generates the text file blank.

The following script to loop through PDF files and convert them to text files using acrobat: It works fine for most PDFs except ones with figures like above.

Sub LoopThroughFiles()
    Dim StrFile As String
    Dim pdfPath As String
    
    StrFile = Dir("C:\temp\PDFs\")
    fileRoot = "C:\temp\PDFs\"
    If Right(fileRoot, 1) <> "\" Then fileRoot = fileRoot & "\" 'ensure terminating \
    
    Do While Len(StrFile) > 0
        Debug.Print StrFile
        pdfPath = fileRoot & StrFile
        
        Debug.Print pdfPath
        
        success = ConvertPdf2(pdfPath, fileRoot & StrFile & ".txt")
        
        StrFile = Dir
        
        On Error Resume Next
        
        
    Loop
End Sub


'returns true if conversion was successful (based on whether `Open` succeeded or not)
Function ConvertPdf2(pdfPath As String, textPath As String) As Boolean
    Dim AcroXApp As Acrobat.AcroApp
    Dim AcroXAVDoc As Acrobat.AcroAVDoc
    Dim AcroXPDDoc As Acrobat.AcroPDDoc
    Dim jsObj As Object, success As Boolean

    Set AcroXApp = CreateObject("AcroExch.App")
    Set AcroXAVDoc = CreateObject("AcroExch.AVDoc")
    success = AcroXAVDoc.Open(pdfPath, "Acrobat") '<<< returns false if fails
    If success Then
    
Application.Wait (Now + TimeValue("0:00:2")) 'Helps PC have some time to go through data, can cause PC to freeze without


        Set AcroXPDDoc = AcroXAVDoc.GetPDDoc
        Set jsObj = AcroXPDDoc.GetJSObject
        jsObj.SaveAs textPath, "com.adobe.acrobat.plain-text"
        AcroXAVDoc.Close False
    End If
    AcroXApp.Hide
    AcroXApp.Exit
    ConvertPdf2 = success 'report success/failure
End Function

The error appears to be jsObj.SaveAs textPath, "com.adobe.acrobat.plain-text" If instead I use jsObj.SaveAs textPath, "com.adobe.acrobat.accesstext" the text file is generated but for my needs it is important the file generates is in the plain text format.

The reason for this can be seen below in a different PDF. These are the different types of text files generated:

Plain text (extends as sentences in the horizontal direction - this is required): enter image description here

Access Text: (creates more of a body of text - this separated sentences by carriage return and is problematic) enter image description here

I reckon this is a lost cause for these sorts of PDFs; disappointing, though, as many of the PDFs I need to convert are in this format. Appear to have been plagued with issues trying to solve this one.

Anyway just wondered if it may be possible to disable the popup message, and maybe this will allow the plain-text write to occur?

Alternatively can't think of much else.


Solution

  • From: Plain Text From PDF without inserting line breaks but retaining carriage returns using VBA. Working solution but requires improvement

    Change: Encoding:=1252 to 65001 for unusual characters.

    Sub LoopThroughFiles()
        
        Dim StrFile As String
        Dim pdfPath As String
        
        StrFile = Dir("C:\temp\PDFs\")
        fileRoot = "C:\temp\PDFs\"
        If Right(fileRoot, 1) <> "\" Then fileRoot = fileRoot & "\" 'ensure terminating \
        
        Do While Len(StrFile) > 0
            
            Debug.Print StrFile
            n = StrFile
            pdfPath = fileRoot & StrFile
            
            Debug.Print pdfPath
            
        'Convert to WordDoc
        success = ConvertPdf2(pdfPath, fileRoot & StrFile & ".doc")
        StrFile = Dir
        On Error Resume Next
            
        oWd.Quit
            
        'Convert to PlainText
        Debug.Print pdfPath & ".doc"
    
        success2 = GetTextFromWord(pdfPath & ".doc", n)
        
    Loop
    End Sub
    
    'returns true if conversion was successful (based on whether `Open` succeeded or not)
    Function ConvertPdf2(pdfPath As String, textPath As String) As Boolean
        Dim AcroXApp As Acrobat.AcroApp
        Dim AcroXAVDoc As Acrobat.AcroAVDoc
        Dim AcroXPDDoc As Acrobat.AcroPDDoc
        Dim jsObj As Object, success As Boolean
    
        Set AcroXApp = CreateObject("AcroExch.App")
        Set AcroXAVDoc = CreateObject("AcroExch.AVDoc")
        success = AcroXAVDoc.Open(pdfPath, "Acrobat") '<<< returns false if fails
        If success Then
        
    Application.Wait (Now + TimeValue("0:00:2")) 'Helps PC have some time to go through data, can cause PC to freeze without
    
            Set AcroXPDDoc = AcroXAVDoc.GetPDDoc
            Set jsObj = AcroXPDDoc.GetJSObject
            jsObj.SaveAs textPath, "com.adobe.acrobat.doc"
            AcroXAVDoc.Close False
        End If
        AcroXApp.Hide
        AcroXApp.Exit
        ConvertPdf2 = success 'report success/failure
    End Function
    
    Function GetTextFromWord(DocStr As String, n)
    
        Dim filePath As String
        Dim fso As FileSystemObject
        Dim fileStream As TextStream
        Dim oWd As Object, oDoc As Object, fileRoot As String
        Const wdFormatText As Long = 2, wdCRLF As Long = 0
        
        Set fso = New FileSystemObject
        Set oWd = CreateObject("word.application")
        
        fileRoot = "C:\temp\PDFs" 'read this once
        If Right(fileRoot, 1) <> "\" Then fileRoot = fileRoot & "\" 'ensure terminating \
        
                Set oDoc = Nothing
                On Error Resume Next 'ignore error if no document...
                Set oDoc = oWd.Documents.Open(DocStr)
                On Error GoTo 0      'stop ignoring errors
                
                Debug.Print n
                If Not oDoc Is Nothing Then
                    filePath = fileRoot & n & ".txt"  'filename
                    Debug.Print filePath
                    
                    
            oDoc.SaveAs2 Filename:=filePath, _
            FileFormat:=wdFormatText, LockComments:=False, Password:="", _
            AddToRecentFiles:=True, WritePassword:="", ReadOnlyRecommended:=False, _
            EmbedTrueTypeFonts:=False, SaveNativePictureFormat:=False, SaveFormsData _
            :=False, SaveAsAOCELetter:=False, Encoding:=1252, InsertLineBreaks:=False _
            , AllowSubstitutions:=True, LineEnding:=wdCRLF, CompatibilityMode:=0
            
        oDoc.Close False
        
        End If
        oWd.Quit
                    
       
        GetTextFromWord = success2
        
    End Function