Tags: html, vba, ms-word

Converting text with HTML tags to formatted text in Word using VBA


Boxes with question mark from page breaks. I want to convert text in Word that contains HTML tags into formatting that Word recognizes.

I am looking to modify the code below to include the following:

  1. Format everything to a specific size and font (Calibri size 11 for example).
  2. Recognize line breaks tags (<br>) to swap with an actual line break.
  3. Not just recognize bold or underline, but instances where both are used simultaneously.
' ReformatHTML
' Converts HTML-style tags in the active document into Word formatting using
' wildcard Find/Replace, then sets the whole document to Calibri 11.
' Tags handled, in this order: <p>, <u>, <strong>, <i>, <h>, <br>.
' Takes no arguments and returns nothing; it edits ActiveDocument in place.
Sub ReformatHTML()
    ' Suppress redraws while the replacements run; switched back on at the end.
    Application.ScreenUpdating = False
    With ActiveDocument.Range.Find
        .ClearFormatting
        .Format = True
        .Forward = True
        .MatchWildcards = True
        .Wrap = wdFindContinue
        ' Remove <p>...</p>: group 1 captures "p>", group 2 captures the enclosed
        ' text, and \1 re-matches "p>" in the closing tag. Replacing with "\2"
        ' keeps only the enclosed text; no formatting is applied for this tag.
        .Replacement.Text = "\2"
        .Replacement.ClearFormatting
        .Text = "\<(p\>)(*)\</\1"
        .Execute Replace:=wdReplaceAll
        ' <u>...</u> -> underlined text.
        ' Replacement.Text is set to "\2" here and is not reassigned until the
        ' <br> step, so the <strong>, <i> and <h> steps below rely on it.
        .Replacement.Text = "\2"
        .Replacement.ClearFormatting
        .Text = "\<(u\>)(*)\</\1"
        .Replacement.Font.Underline = True
        .Replacement.Font.Name = "Calibri"
        .Replacement.Font.Size = 11
        .Execute Replace:=wdReplaceAll
        ' <strong>...</strong> -> the "Strong" style.
        ' Replacement formatting is cleared first so the underline setting from
        ' the previous step is not carried into this one.
        .Replacement.ClearFormatting
        .Text = "\<(strong\>)(*)\</\1"
        .Replacement.Style = "Strong"
        .Replacement.Font.Name = "Calibri"
        .Replacement.Font.Size = 11
        .Execute Replace:=wdReplaceAll
        ' <i>...</i> -> italic text.
        .Replacement.ClearFormatting
        .Text = "\<(i\>)(*)\</\1"
        .Replacement.Font.Italic = True
        .Replacement.Font.Name = "Calibri"
        .Replacement.Font.Size = 11
        .Execute Replace:=wdReplaceAll
        ' <h>...</h> -> highlighted text.
        ' NOTE(review): <h> is not a standard HTML tag; presumably a custom
        ' highlight marker in the source text - confirm.
        .Replacement.ClearFormatting
        .Text = "\<(h\>)(*)\</\1"
        .Replacement.Highlight = True
        .Replacement.Font.Name = "Calibri"
        .Replacement.Font.Size = 11
        .Execute Replace:=wdReplaceAll
        ' <br> -> line break.
        ' NOTE(review): vbCrLf inserts both a carriage return and a line feed;
        ' the extra character may be what shows up as a box in the document -
        ' confirm whether vbCr alone is intended.
        .Replacement.Text = vbCrLf
        .Replacement.ClearFormatting
        .Text = "\<br\>"
        .Execute Replace:=wdReplaceAll
    End With
    
    ' Apply the target font to the entire document range.
    With ActiveDocument.Range
        .Font.Name = "Calibri"
        .Font.Size = 11
    End With
    
    Application.ScreenUpdating = True
End Sub

The code works, aside from the modifications listed above. The post titled "Rendering text with HTML tags to Formatted text in a Word table using VBA" has helped.

Here is a sample of text I am trying to modify (I can't figure out how to do it with text and not have the HTML converted to formatting):
enter image description here


Solution

  • Please try this.

    ' ReformatHTML
    ' Two wildcard Find/Replace passes over the active document:
    '   1. <u>...</u> becomes the enclosed text, underlined, in Calibri 11.
    '   2. <br> becomes a carriage return (vbCr).
    ' Takes no arguments; edits ActiveDocument in place.
    Sub ReformatHTML()
        Dim tagFinder As Find

        Application.ScreenUpdating = False
        Set tagFinder = ActiveDocument.Range.Find

        ' Search options shared by both passes.
        tagFinder.ClearFormatting
        tagFinder.Format = True
        tagFinder.MatchWildcards = True
        tagFinder.Forward = True
        tagFinder.Wrap = wdFindContinue

        ' Pass 1: underline tags. Group 2 of the pattern is the enclosed text,
        ' so replacing with "\2" drops the opening and closing tags.
        With tagFinder.Replacement
            .Text = "\2"
            .ClearFormatting
            .Font.Name = "Calibri"
            .Font.Size = 11
            .Font.Underline = True
        End With
        tagFinder.Text = "\<(u\>)(*)\</\1"
        tagFinder.Execute Replace:=wdReplaceAll

        ' Pass 2: line-break tags, with replacement formatting cleared so the
        ' underline/font settings from pass 1 are not applied.
        With tagFinder.Replacement
            .Text = vbCr
            .ClearFormatting
        End With
        tagFinder.Text = "\<br\>"
        tagFinder.Execute Replace:=wdReplaceAll

        Application.ScreenUpdating = True
    End Sub
    

    enter image description here


    Update

    Remove the <p> tags, and integrate this with the code from the original post (OP):

    ' ReformatHTML
    ' Converts HTML-style tags in the active document into Word formatting
    ' using wildcard Find/Replace. Tags handled, in this order:
    '   <p>      removed, enclosed text kept
    '   <u>      underline
    '   <strong> "Strong" style
    '   <i>      italic
    '   <h>      highlight
    '   <br>     carriage return
    ' Takes no arguments; edits ActiveDocument in place. Any error raised by a
    ' replace step is re-raised to the caller after screen updating is restored.
    Sub ReformatHTML()
        Application.ScreenUpdating = False
        ' Make sure screen updating is switched back on even if a step fails.
        On Error GoTo RestoreAndFail
        With ActiveDocument.Range.Find
            .ClearFormatting
            .Format = True
            .Forward = True
            .MatchWildcards = True
            .Wrap = wdFindContinue
            ' Remove <p>...</p>: group 1 is "p>", group 2 is the enclosed text,
            ' and \1 re-matches "p>" in the closing tag. "\2" keeps only the text.
            .Replacement.Text = "\2"
            .Replacement.ClearFormatting
            .Text = "\<(p\>)(*)\</\1"
            .Execute Replace:=wdReplaceAll
            ' <u>...</u> -> underlined text. Replacement.Text stays "\2" for the
            ' <strong>, <i> and <h> steps as well; only the formatting changes.
            .Replacement.Text = "\2"
            .Replacement.ClearFormatting
            .Text = "\<(u\>)(*)\</\1"
            ' Font.Underline is a WdUnderline value, so use the named constant
            ' rather than the Boolean True.
            .Replacement.Font.Underline = wdUnderlineSingle
            .Replacement.Font.Name = "Calibri"
            .Replacement.Font.Size = 11
            .Execute Replace:=wdReplaceAll
            ' <strong>...</strong> -> "Strong" style.
            .Replacement.ClearFormatting
            .Text = "\<(strong\>)(*)\</\1"
            .Replacement.Style = "Strong"
            .Replacement.Font.Name = "Calibri"
            .Replacement.Font.Size = 11
            .Execute Replace:=wdReplaceAll
            ' <i>...</i> -> italic text.
            .Replacement.ClearFormatting
            .Text = "\<(i\>)(*)\</\1"
            .Replacement.Font.Italic = True
            .Replacement.Font.Name = "Calibri"
            .Replacement.Font.Size = 11
            .Execute Replace:=wdReplaceAll
            ' <h>...</h> -> highlighted text.
            .Replacement.ClearFormatting
            .Text = "\<(h\>)(*)\</\1"
            .Replacement.Highlight = True
            .Replacement.Font.Name = "Calibri"
            .Replacement.Font.Size = 11
            .Execute Replace:=wdReplaceAll
            ' <br> -> line break. Use vbCr only: Word ends a paragraph with a
            ' single carriage return, and vbCrLf would insert an extra line-feed
            ' character alongside it.
            .Replacement.Text = vbCr
            .Replacement.ClearFormatting
            .Text = "\<br\>"
            .Execute Replace:=wdReplaceAll
        End With
        Application.ScreenUpdating = True
        Exit Sub

    RestoreAndFail:
        ' Restore the UI state, then hand the original error back to the caller.
        Application.ScreenUpdating = True
        Err.Raise Err.Number, Err.Source, Err.Description
    End Sub
    

    Alternatively, set the font for the whole document once; the .Replacement.Font.Name and .Replacement.Font.Size lines are then no longer necessary.

        ' Apply Calibri 11 to the entire document range in one step.
        Dim wholeDoc As Range
        Set wholeDoc = ActiveDocument.Range
        wholeDoc.Font.Name = "Calibri"
        wholeDoc.Font.Size = 11