Search code examples
javascript regex typescript

Extract content from an XML file


I have XML content as follows:

<Artificial name="Artifical name">
    <Machine>
        <MachineEnvironment uri="environment" />
    </Machine>
    <Mobile>taken phone, test

when r1
    100m SUV
then
    FireFly is High
end


when r2
    Order of the Phonenix 
    
then
    Magic is High
end


</Mobile>
</Artificial>

I want to write a function that accepts a line (string) and content (string) and returns the content of the closest tag which the provided line belongs to.

For example, if I provide the line `FireFly is High`, it should return the following, as it is the closest tag the provided line belongs to:

<Mobile>taken phone, test

when r1
    100m SUV
then
    FireFly is High
end


when r2
    Order of the Phonenix 

then
    Magic is High
end


</Mobile>

Following is my code

/**
 * Regex-based attempt to return the markup of the element that `line` belongs to
 * inside `content`.
 *
 * @param line    One line of the document (a tag line or a plain-text line); it is trimmed before use.
 * @param content The full document text that is searched.
 * @returns The trimmed line itself for self-closing tags and when nothing matches,
 *          the matched markup otherwise, or `undefined` (no `return` is reached)
 *          when the line is a closing tag such as `</Mobile>`.
 */
getLineContent(line: string, content: string) {
    const trimmedLine = line.trim()
    // True when the line ends with "/>" (optionally with whitespace between "/" and ">").
    const isSelfClosingTag = /\/\s*>$/.test(trimmedLine)
    // True when the line contains neither "<" nor ">".
    const isPlainTextLine = !/<|>/.test(trimmedLine)
    // Pattern for "opening tag ... closing tag": group 1 is the line up to the next ">",
    // group 2 is the lazily matched body, group 3 is the tag name (first space-separated
    // token of the line with its first character removed) followed by ">".
    // NOTE: the line is interpolated into the pattern unescaped, so regex
    // metacharacters in it are interpreted as pattern syntax.
    const regex = new RegExp(`(${trimmedLine}[^>]*>)([\\s\\S]*?)</(${trimmedLine.split(' ')[0].substr(1)}>)`)
    // True when the whole line is a closing tag of the form "</word>".
    const isClosingTag = /^<\/\w+>$/.test(trimmedLine)
    const match = content.match(regex)

    if (!isClosingTag) {
      if (isSelfClosingTag) {
        return trimmedLine
      }

      // Only taken when the element body (group 2) is non-empty.
      // NOTE: the literal "</" in the pattern sits outside the capture groups, so it
      // is not part of the concatenated result.
      if (match && match[2]) {
        return match[1] + match[2] + match[3]
      }
      if (isPlainTextLine) {
        // Group 1 is some tag, group 2 runs lazily from right after that tag to the first
        // word of the line (minus its first character) and on to the next closing tag.
        // Nothing ties group 1 to the tag that directly encloses the text, so the match
        // can begin at an earlier tag and group 2 can include preceding elements.
        const regex = new RegExp(`(<[^>]*>)([\\s\\S]*?${trimmedLine.split(' ')[0].substr(1)}[\\s\\S]*?</[a-zA-Z]+>)`)
        const match = content.match(regex)
        // Debug output of the plain-text match.
        console.log('isPlainTextLine', match)
        if (match && match[1] && match[2]) {
          return match[2]
        }
      }
      // Fallback: nothing matched, echo the trimmed line back.
      return trimmedLine
    }
  }

It works almost perfectly but not quite. The problem lies in this part of the code

if (isPlainTextLine) {
        const regex = new RegExp(`(<[^>]*>)([\\s\\S]*?${trimmedLine.split(' ')[0].substr(1)}[\\s\\S]*?</[a-zA-Z]+>)`)
        const match = content.match(regex)
        console.log('isPlainTextLine', match)
        if (match && match[1] && match[2]) {
          return match[2]
        }
      }

For example, if I provide `FireFly is High`, then the returned value is:

<Machine>
        <MachineEnvironment uri="environment" />
    </Machine>
    <Mobile>taken phone, test

when r1
    100m SUV
then
    FireFly is High
end


when r2
    Order of the Phonenix 

then
    Magic is High
end


</Mobile>

Regex is not my strong suit. Any help is appreciated.


Solution

  • Regex isn't the right tool for this task. Instead use an XML parser for this. There are many to choose from. For instance, you could use fast-xml-parser. It converts XML to a nested object structure. Demo:

    const { XMLParser } = require("fast-xml-parser");
    
    function findText(obj, find, key="") {
        if (typeof obj === "string" && obj.includes(find)) {
            return { [key]: obj };
        }
        if (Object(obj) === obj) {
            for (const key in obj) {
                const result = findText(obj[key], find, key);
                if (result) return result;
           }
        }
    }
    
    // Sample document used by the demo; the "..." lines stand in for omitted text.
    const xml = `<Artificial name="Artifical name">
        <Machine>
            <MachineEnvironment uri="environment" />
        <\/Machine>
        <Mobile>taken phone, test
        ...
        FireFly is High
        ...
        </Mobile>
    <\/Artificial>`;
    
    // Parse the XML into a nested object, then print the entry whose text contains "FireFly".
    const parser = new XMLParser();
    const obj = parser.parse(xml);
    console.log(findText(obj, "FireFly")); // { Mobile: "taken phone, ....... " }
    

    As a second example, in a browser context, you can use DOMParser from the WebAPI:

    function *iterNodes(doc, whatToShow) { // Generator for createTreeWalker
        const walk = doc.createTreeWalker(doc.documentElement, whatToShow, null, false);
        for (let node; node = walk.nextNode(); null) yield node;
    }
    
    /**
     * Parses `xml` and returns the serialized element that directly contains the
     * first text node whose text includes `content`.
     *
     * @param xml     XML source text.
     * @param content Substring to look for inside text nodes.
     * @returns The `outerHTML` of the parent element of the first matching text node,
     *          or `undefined` when the XML is malformed or no text node matches.
     */
    function findTagByContent(xml, content) {
        const doc = new DOMParser().parseFromString(xml, "text/xml");
        // For malformed XML the parser does not throw; it returns a document that
        // contains a <parsererror> element. Searching that document would match
        // against the error message instead of the caller's data.
        if (doc.getElementsByTagName("parsererror").length > 0) return undefined;

        for (const node of iterNodes(doc, NodeFilter.SHOW_TEXT)) {
            if (node.textContent.includes(content)) return node.parentNode.outerHTML;
        }
        return undefined;
    }
    
    // Example run
    
    // Sample document used by the demo; the "..." lines stand in for omitted text.
    const xml = `<Artificial name="Artifical name">
        <Machine>
            <MachineEnvironment uri="environment" />
        </Machine>
        <Mobile>taken phone, test
        ...
        FireFly is High
        ...
        </Mobile>
    </Artificial>`;
    
    // Print the markup of the element whose text contains "FireFly".
    const matchedTag = findTagByContent(xml, "FireFly");
    console.log(matchedTag);