Search code examples
vbaloopsforeachms-wordnested-loops

Determine if a word is within 125 words of a match


I want to loop through a document, and for each word, see if there is a match within 250 words (125 behind and 125 ahead).

If there is a match(s), highlight it. Certain words are excluded. These are stored in a dictionary.

To test the loop I am using,

For Each para In ActiveDocument.Paragraphs
    For Each wrd In para.Range.Words
        Debug.Print wrd & "----" & wrd.Start
    Next wrd
Next para

The problem:

"World" in the sentence "I hate traveling to the spirit world", prints 32 when I am looking for 7.

I want to do something like:

If wrd < 125 Then
    Set wrdRng = ActiveDocument.Range(Start:=wrd - 125, End:=ActiveDocument.Words(wrd + 125).End)
Else 
    Set wrdRng = ActiveDocument.Range(Start:=0, End:=ActiveDocument.Words(250 - wrd).End)
End if

Edit:

The current code I'm using completes a loop on a 50,000 word document in about 13 minutes. That is entirely too long. Anyone have a better alternative?

Sub Demo()
Application.ScreenUpdating = False
Dim i As Long
Dim StartTime As Double
Dim MinutesElapsed As String


StartTime = Timer

For Each Para In ActiveDocument.Paragraphs
For Each wrd In Para.Range.Words
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Text = "<(McKnight)*\1>"
    .Replacement.Text = ""
    .Forward = True
    .Wrap = wdFindStop
    .Format = False
    .MatchCase = False
    .MatchWildcards = True
  End With
  Do While .Find.Execute
    If .ComputeStatistics(wdStatisticWords) < 100 Then
      i = i + 1
      .Words.First.HighlightColorIndex = wdBrightGreen
      .Words.Last.HighlightColorIndex = wdBrightGreen
    End If
    .End = .End - Len(.Words.Last)
    .Collapse wdCollapseEnd
  Loop
End With
Application.ScreenUpdating = True
'MsgBox i & " instances found."
Debug.Print wrd
    Next wrd
Next Para
MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")
MsgBox "This code ran successfully in " & MinutesElapsed & " minutes", vbInformation

End Sub

EDIT:

Sub Demo()
Application.ScreenUpdating = False
Dim i As Long
Dim StartTime As Double
Dim MinutesElapsed As String


StartTime = Timer

For Each para In ActiveDocument.Paragraphs
For Each wrd In para.Range.Words
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Text = "<(wrd)*\1>"
    .Replacement.Text = ""
    .Forward = True
    .Wrap = wdFindStop
    .Format = False
    .MatchCase = False
    .MatchWildcards = True
  End With
  Do While .Find.Execute
    If .ComputeStatistics(wdStatisticWords) < 100 Then
      i = i + 1
      .Words.First.HighlightColorIndex = wdBrightGreen
      .Words.Last.HighlightColorIndex = wdBrightGreen
    End If
    .End = .End - Len(.Words.Last)
    .Collapse wdCollapseEnd
  Loop
End With
Application.ScreenUpdating = True
'MsgBox i & " instances found."
'Debug.Print wrd
    Next wrd
Next para
MinutesElapsed = Format((Timer - StartTime) / 86400, "hh:mm:ss")
MsgBox "This code ran successfully in " & MinutesElapsed & " minutes", vbInformation

End Sub

Solution

  • Try the following. Amongst other things, it allows you to specify words to ignore (e.g. prepositions, articles, etc.). Additionally different highlights are used to identify all 'hits' on a given word. A progress report is given on the status bar. On my laptop, it takes about 6:40 for a 50,000 word 'lorem' document.

    Option Explicit
    Dim ArrOut() As String
    
    Sub Demo()
    ' Turn Off Screen Updating
    Application.ScreenUpdating = False
    Dim eTime As Single
    ' Start Timing
    eTime = Timer
    Dim wdDoc As Document, StrFnd As String, StrTmp As String, Rng As Range
    Dim SBar As Boolean, bTrk As Boolean, h As Long, i As Long, j As Long
    ' Store current Status Bar status, then switch on
    SBar = Application.DisplayStatusBar
    Application.DisplayStatusBar = True
    Set wdDoc = ActiveDocument
    With wdDoc
      ' Store current Track Changes status, then switch off
      bTrk = .TrackRevisions: .TrackRevisions = False
      'Display status
       Application.StatusBar = "Building word list"
      'Compile the Find list
      Call BuildWordList(.Range.Text)
      With .Range.Find
        .ClearFormatting
        .Replacement.ClearFormatting
        .Text = ""
        .Replacement.Text = ""
        .Format = False
        .MatchCase = False
        .MatchWholeWord = True
        .Wrap = wdFindStop
        .Execute
      End With
    End With
    'Process all words in the concordance
    For i = 0 To UBound(ArrOut())
      StrFnd = ArrOut(i)
      h = i Mod 14
      If h < 6 Then
        h = h + 2
      Else
        h = h + 3
      End If
      'Display current word
      Application.StatusBar = "Processing: " & StrFnd
      'Use wildcards, if possible, for extra speed
      If Len(StrFnd) < 4 Then
        StrTmp = ""
        For j = 1 To Len(StrFnd)
          StrTmp = StrTmp & "[" & UCase(Mid(StrFnd, j, 1)) & Mid(StrFnd, j, 1) & "]"
        Next
        StrFnd = StrTmp
        With wdDoc.Range
          With .Find
            .MatchWildcards = True
            .Text = "<(" & StrFnd & ")>*<(" & StrFnd & ")>"
            .Forward = True
            .Wrap = wdFindStop
          End With
          Do While .Find.Execute
            If .ComputeStatistics(wdStatisticWords) < 100 Then
              If .Words.First.HighlightColorIndex <> h Then .Words.First.HighlightColorIndex = h
              .Words.Last.HighlightColorIndex = h
            End If
            .End = .End - Len(.Words.Last)
            .Collapse wdCollapseEnd
          Loop
        End With
      Else
        With wdDoc.Range
          With .Find
            .MatchWildcards = False
            .Text = StrFnd
            .Forward = True
            .Wrap = wdFindStop
            .Execute
          End With
          Set Rng = .Duplicate
          Do While .Find.Execute
            Rng.End = .Duplicate.End
            With Rng
              If .ComputeStatistics(wdStatisticWords) < 100 Then
                If .Words.First.HighlightColorIndex <> h Then .Words.First.HighlightColorIndex = h
                .Words.Last.HighlightColorIndex = h
              End If
            End With
            Set Rng = .Duplicate
            .Collapse wdCollapseEnd
          Loop
        End With
      End If
      DoEvents
    Next
    ' Restore original Track Changes status
    wdDoc.TrackRevisions = bTrk
    ' Clear the Status Bar
    Application.StatusBar = False
    ' Restore original Status Bar status
    Application.DisplayStatusBar = SBar
    ' Restore Screen Updating
    Application.ScreenUpdating = True
    ' Calculate elapsed time
    eTime = (Timer - eTime + 86400) Mod 86400 ' Just in case execution time spans midnight
    MsgBox "Execution took " & Format(eTime / 86400, "hh:mm:ss") & " to process"
    End Sub
    
    Sub BuildWordList(StrIn As String)
    Dim StrFnd As String, i As Long, j As Long, k As Long
    'Define the exlusions list
    Const StrExcl As String = "a,am,and,are,as,at,be,but,by,can,cm,did,do,does,eg," & _
              "en,eq,etc,for,get,go,got,has,have,he,her,him,how,i,ie,if,in,into,is," & _
              "it,its,me,mi,mm,my,na,nb,no,not,of,off,ok,on,one,or,our,out,re,she," & _
              "so,the,their,them,they,t,to,was,we,were,who,will,would,yd,you,your"
     'Strip out unwanted characters
    For i = 1 To 255
      Select Case i
        Case 1 To 31, 33 To 64, 91 To 96, 123 To 144, 147 To 191, 247
        Do While InStr(StrIn, Chr(i)) > 0
          StrIn = Replace(StrIn, Chr(i), " ")
        Loop
      End Select
    Next
    'Convert smart single quotes to plain single quotes & delete any at the start/end of a word
    StrIn = Replace(Replace(Replace(Replace(StrIn, Chr(145), "'"), Chr(146), "'"), "' ", " "), " '", " ")
    'Convert to lowercase
    StrIn = " " & LCase(StrIn) & " "
    'Process the exclusions list
    For i = 0 To UBound(Split(StrExcl, ","))
      StrFnd = " " & Split(StrExcl, ",")(i) & " "
      Do While InStr(StrIn, StrFnd) > 0
        StrIn = Replace(StrIn, " " & Split(StrExcl, ",")(i) & " ", " ")
      Loop
    Next
    'Clean up any duplicate spaces
    Do While InStr(StrIn, "  ") > 0
      StrIn = Replace(StrIn, "  ", " ")
    Loop
    i = 0
    Do While UBound(Split(StrIn, " ")) > 1
      StrFnd = " " & Split(StrIn, " ")(1) & " ": j = Len(StrIn)
      'Find how many occurences of each word there are in the document
      StrIn = Replace(StrIn, StrFnd, " ")
      k = (j - Len(StrIn)) / (Len(StrFnd) - 1)
      'If there's more than one occurence, add the word to our Find list
      If k > 1 Then
        ReDim Preserve ArrOut(i)
        ArrOut(i) = Trim(StrFnd)
        i = i + 1
      End If
    Loop
    WordBasic.SortArray ArrOut()
    End Sub