Search code examples
javascript, html, string, text, extract

Extract the text out of HTML string using JavaScript


I am trying to get the inner text of an HTML string using a JS function (the string is passed as an argument). Here is the code:

/**
 * Attempts to collect the text that sits between the tags of an HTML string
 * and log it to the console.
 *
 * As written it always logs an empty string: the `continue` statement skips
 * the only code that appends to `content_holder`.
 *
 * @param {string} value - HTML markup to scan character by character.
 */
function extractContent(value) {
  var content_holder = "";

  for (var i = 0; i < value.length; i++) {
    if (value.charAt(i) === '>') {
      // BUG: `continue` jumps straight to the next iteration of the for loop,
      // so the while loop below is unreachable and nothing is ever appended.
      continue;
      // Even if this loop were reached it never advances `i`; the character
      // at `i` is '>' here, so the condition would stay true forever.
      while (value.charAt(i) != '<') {
        content_holder += value.charAt(i);
      }
    }

  }
  console.log(content_holder);
}

// Logs an empty string because of the unreachable loop described above.
extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");

The problem is that nothing gets printed on the console (`content_holder` stays empty). I think the problem is caused by the === operator.


Solution

  • Create an element, store the HTML in it, and get its textContent:

    /**
     * Returns the plain text contained in an HTML string.
     *
     * The markup is parsed inside a detached document created with
     * `document.implementation.createHTMLDocument`, not the live page document.
     * A document created this way has no browsing context, so parsing `s`
     * does not fetch images or run inline handlers such as `<img onerror>`.
     *
     * @param {string} s - HTML markup to extract the text from.
     * @returns {string} The concatenated text of all nodes in `s`.
     */
    function extractContent(s) {
      const inertDocument = document.implementation.createHTMLDocument('');
      // Same <span> context as before, so the markup is parsed the same way.
      const span = inertDocument.createElement('span');
      span.innerHTML = s;
      return span.textContent;
    }
        
    // Shows "HelloW3C": the text of both elements, with no separator between them.
    alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));


    Here's a version that allows you to have spaces between nodes, although you'd probably want that for block-level elements only:

    /**
     * Returns the plain text contained in an HTML string, optionally putting
     * a space after the text of every element.
     *
     * The markup is parsed inside a detached document created with
     * `document.implementation.createHTMLDocument`, not the live page document.
     * A document created this way has no browsing context, so parsing `s`
     * does not fetch images or run inline handlers such as `<img onerror>`.
     *
     * Runs of spaces in the result are always collapsed to a single space,
     * whether or not `space` is set.
     *
     * @param {string} s - HTML markup to extract the text from.
     * @param {boolean} [space] - When truthy, a space is added after each
     *   element's content so adjacent nodes do not run together.
     * @returns {string} The extracted text with consecutive spaces collapsed.
     */
    function extractContent(s, space) {
      const inertDocument = document.implementation.createHTMLDocument('');
      const span = inertDocument.createElement('span');
      span.innerHTML = s;
      if (space) {
        // Append a text node rather than assigning to textContent. Assigning
        // replaces all of an element's children, which would detach nested
        // elements before they are visited and lose their separators.
        for (const element of span.querySelectorAll('*')) {
          element.appendChild(inertDocument.createTextNode(' '));
        }
      }
      return span.textContent.replace(/ +/g, ' ');
    }
        
    // Without `space`, adjacent nodes run together: logs "HelloW3C. Nice to seeyou!"
    console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>.  Nice to <em>see</em><strong><em>you!</em></strong>"));
    
    // With `space`, each element's text is followed by a space: logs "Hello W3C . Nice to see you! "
    console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>.  Nice to <em>see</em><strong><em>you!</em></strong>",true));