Search code examples
screen-scrapingjsoup

How to replace words with span tag using jsoup?


Assume I have the following html:

<html>
<head>
</head>
<body>
    <div id="wrapper" >
         <div class="s2">I am going <a title="some title" href="">by flying</a>
           <p>mr tt</p>
         </div> 
    </div>
</body>    
</html>

Any word in the text nodes that is 4 or more characters long (for example, the word 'going') should be replaced with HTML content (not text), i.e. <span>going</span>, in the original HTML, without changing anything else.

If I try to do something like element.html(replacement), the problem is that if, say, the current element is <div class="s2">, it will also wipe out the nested <a title="some title"> element.


Solution

  • In this case you must traverse your document as suggested by this answer. Here's a way of doing it using Jsoup APIs:

    • NodeTraversor and NodeVisitor allow you to traverse the DOM
    • Node.replaceWith(...) allows for replacing a node in the DOM

    Here's the code:

    /**
     * Demonstrates how to wrap every "long" word found in the text nodes of an
     * HTML document in a {@code <span>} element using jsoup, without touching
     * the surrounding markup (sibling elements, attributes, short words).
     *
     * <p>A word is a maximal run of non-whitespace characters; it is "long" when
     * it has at least {@link #MIN_WORD_LENGTH} characters.
     */
    public class JsoupReplacer {

      /** Minimum number of characters a word needs in order to be wrapped. */
      private static final int MIN_WORD_LENGTH = 4;

      public static void main(String[] args) {
        so6527876();
      }

      /**
       * Parses the sample document, wraps the long words of every text node
       * below {@code <body>} in {@code <span>} tags and prints the result.
       */
      public static void so6527876() {
        String html =
        "<html>" +
        "<head>" +
        "</head>" +
        "<body>" +
        "    <div id=\"wrapper\" >" +
        "         <div class=\"s2\">I am going <a title=\"some title\" href=\"\">by flying</a>" +
        "           <p>mr tt</p>" +
        "         </div> " +
        "    </div>" +
        "</body>    " +
        "</html>";
        Document doc = Jsoup.parse(html);

        // Collect the affected text nodes first and replace them afterwards, so
        // the tree is not modified while it is being walked.
        final List<TextNode> nodesToChange = new ArrayList<TextNode>();

        NodeTraversor nd  = new NodeTraversor(new NodeVisitor() {

          @Override
          public void tail(Node node, int depth) {
            if (node instanceof TextNode) {
              TextNode textNode = (TextNode) node;
              if (containsLongWord(textNode.getWholeText())) {
                nodesToChange.add(textNode);
              }
            }
          }

          @Override
          public void head(Node node, int depth) {
            // Nothing to do when entering a node; all work happens in tail().
          }
        });

        nd.traverse(doc.body());

        for (TextNode textNode : nodesToChange) {
          Node newNode = buildElementForText(textNode);
          textNode.replaceWith(newNode);
        }

        System.out.println("result: ");
        System.out.println();
        System.out.println(doc);
      }

      /**
       * Builds the node that replaces {@code textNode}: a {@code DataNode}
       * holding the node's text with every long word wrapped in a span.
       *
       * @param textNode the text node whose content is converted
       * @return a node carrying the generated markup and the same base URI
       */
      private static Node buildElementForText(TextNode textNode) {
        // The markup is carried by a DataNode so that the generated <span> tags
        // are not escaped again on output; wrapLongWords() therefore escapes the
        // original text itself.
        return new DataNode(wrapLongWords(textNode.getWholeText()), textNode.baseUri());
      }

      /**
       * Tells whether {@code text} contains at least one word of
       * {@link #MIN_WORD_LENGTH} or more characters.
       *
       * @param text the text to inspect
       * @return {@code true} if a long word is present
       */
      static boolean containsLongWord(String text) {
        int runLength = 0;
        for (int i = 0; i < text.length(); i++) {
          if (Character.isWhitespace(text.charAt(i))) {
            runLength = 0;
          } else if (++runLength >= MIN_WORD_LENGTH) {
            return true;
          }
        }
        return false;
      }

      /**
       * Converts plain text to HTML in which every long word is surrounded by
       * {@code <span>...</span>}.
       *
       * <p>The text is processed in a single left-to-right pass, so each word is
       * wrapped exactly once, a word is never matched inside another word, and
       * characters that are special in regular expressions need no treatment.
       * Whitespace is copied through unchanged; {@code &}, {@code <} and
       * {@code >} in the text are escaped.
       *
       * @param text the plain (unescaped) text
       * @return the HTML fragment with long words wrapped
       */
      static String wrapLongWords(String text) {
        int length = text.length();
        StringBuilder html = new StringBuilder(length + 16);
        int pos = 0;
        while (pos < length) {
          // Consume one maximal run of either whitespace or word characters.
          int start = pos;
          boolean whitespace = Character.isWhitespace(text.charAt(pos));
          while (pos < length && Character.isWhitespace(text.charAt(pos)) == whitespace) {
            pos++;
          }
          String run = text.substring(start, pos);
          if (whitespace) {
            html.append(run);
          } else if (run.length() >= MIN_WORD_LENGTH) {
            // Length is measured on the unescaped word, escaping happens on output.
            html.append("<span>").append(escapeHtml(run)).append("</span>");
          } else {
            html.append(escapeHtml(run));
          }
        }
        return html.toString();
      }

      /** Escapes the characters that would otherwise be read as markup. */
      private static String escapeHtml(String text) {
        // '&' must be handled first so the entities added below stay intact.
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
      }

    }