Search code examples
xmlvbanotepad++xmldom

Find multiple lines in TXT/xml file and remove if criteria met


Been wondering if it would be possible to make a simple script to check if multiple criteria are met and make necessary amendments to file.

Moving on to example of what I have and what I want to achieve.

I have an xml file with 4 lines - number, year, model and man.

If <man> is Ford or Dodge, I want no amendments to be made. But if <man> is anything other than that, then I want to check if <year> or <model> are "NA" and remove the line with "NA".

<?xml version="1.0" encoding="UTF-8"?>
<CarStuff>
    <fileName>CarExpor201217.xml</fileName>
    <numberCars>5</numberCars>
    <ref>2017XY</ref>
    <carExo id="CAR0001_01">
        <dealVen id="CAR0001_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0001_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>NA</year>             - Line must be removed
            <model>NA</model>           - Line must be removed
            <man>Acura</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0002_01">
        <dealVen id="CAR0002_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0002_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>NA</year>         - Line must be kept
            <model>NA</model>       - Line must be kept
            <man>Ford</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0003_01">
        <dealVen id="CAR0003_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0003_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>1997</year>       - Line must be kept
            <model>NA</model>       - Line must be removed
            <man>Bugati</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0004_01">
        <dealVen id="CAR0004_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0004_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>1997</year>       - Line must be kept
            <model>NA</model>       - Line must be kept
            <man>Dodge</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0005_01">
        <dealVen id="CAR0005_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0005_03">
            <amount>1811.10</amount>
            <lotNumber>2</lotNumber>
            <year>NA</year>         - Line must be kept
            <model>Charger</model>  - Line must be kept
            <man>Dodge</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0005_01">
        <dealVen id="CAR0005_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0005_03">
            <amount>1811.10</amount>
            <lotNumber>3</lotNumber>
            <year>NA</year>         - Line must be removed
            <model>Dot</model>      - Line must be kept
            <man>Datsun</man>
        </soldCar>
    </carExo>
</CarStuff>

Grateful for all the comments and ideas.


Solution

  • Solution via XMLDom

    You can use XMLDom and XPath to search in a so called NodeList for <man> tags not containing Dodge or Ford strings and check all siblings if they contain "NA" in order to delete them. The code below uses late binding. BTW, your xml in OP wasn't well formed (closing tag </carStuf> instead of </carStuff> - I added a little parse error routine to check this when loading.

    Code

    Option Explicit
    
    Sub checkNA()
    Dim xDoc        As Object    ' xml document
    Dim noli, noli2 As Object    ' node list
    Dim no, no2     As Object    ' node
    Dim noMan       As Object    ' node <man> to check if no Dodge or Ford
    Dim s           As String
    Dim sFile       As String    ' xml file name
    
      sFile = ThisWorkbook.Path & "\xml\na_test.xml"  ' <<< change to your xml file name
    
    ' late binding xml
      Set xDoc = CreateObject("MSXML2.DOMDocument.6.0")
      xDoc.async = False: xDoc.validateOnParse = False
      xDoc.setProperty "SelectionLanguage", "XPath"
    ' load xml
      If xDoc.Load(sFile) Then
        Debug.Print "Loaded successfully"
      Else
        Dim xPE        As Object    ' Set xPE = CreateObject("MSXML2.IXMLDOMParseError")
        Dim strErrText As String
        Set xPE = xDoc.parseError
        With xPE
        strErrText = "Load error " & .ErrorCode & " xml file " & vbCrLf & _
                     Replace(.URL, "file:///", "") & vbCrLf & vbCrLf & _
                     xPE.reason & _
                     "Source Text: " & .srcText & vbCrLf & vbCrLf & _
                     "Line No.:    " & .Line & vbCrLf & _
                     "Line Pos.: " & .linepos & vbCrLf & _
                    "File Pos.:  " & .filepos & vbCrLf & vbCrLf
        End With
        MsgBox strErrText, vbExclamation
        Set xPE = Nothing
        Exit Sub
      End If
    
    ' check items
      s = "carExo/soldCar"
      Set noli = xDoc.DocumentElement.SelectNodes(s)
      For Each no In noli
          Set noMan = no.SelectSingleNode("man")
          If Not noMan Is Nothing Then
             If InStr("Ford.Dodge" & ".", noMan.Text & ".") = 0 Then
                Debug.Print "delete", noMan.Text
                ' delete all subtags containing "NA" as text
                Set noli2 = no.SelectNodes("*")
                For Each no2 In noli2
                    If no2.Text = "NA" Then
                       ' delete item
                         Debug.Print , no2.nodename & "=" & no2.Text
                         no2.ParentNode.RemoveChild no2
                    End If
                Next no2
    
             Else
                ' Debug.Print "keep", noman.Text
             End If
          End If
      Next no
    
    ' save
      ' Debug.Print xDoc.XML
      xDoc.Save sFile      
    ' close
      Set xDoc = Nothing
    End Sub
    

    Edit 12/29 - Addendum

    I added a second workable version of the ' check items part using some extra XPath. This alternative simply avoids two If conditions in normal code as it narrows the range of found nodes in the two node lists.

    ' check items
      s = "carExo/soldCar[man!='Ford'][man!='Dodge']"   ' << (1) added condition to XPath
      Set noli = xDoc.DocumentElement.SelectNodes(s)
      For Each no In noli
          Set noMan = no.SelectSingleNode("man")
          If Not noMan Is Nothing Then
             Debug.Print "delete", noMan.Text
           ' delete all subtags containing "NA" as text
             Set noli2 = no.SelectNodes("*[.='NA']")    ' << (2)added condition to XPath
             For Each no2 In noli2
               ' delete item
                 Debug.Print , no2.nodename & "=" & no2.Text
                 no2.ParentNode.RemoveChild no2
             Next no2
          End If
      Next no
    

    Hint

    Of course there are many streets leading to Rome, see @Parfait 's XSLT approach below.