Search code examples
html jsoup

JSoup query: how to find all instances of an HTML tag in another tag


I have an HTML page to scan for headings and figure elements. The code of the HTML file is:

    <body>
    <section>
        <h2>H2-1</h2>
        <div>
            <section>
                <h3>H3-1</h3>
                <div>
                    <figure><img src="Fig 1.png"></figure>
                </div>
            </section>
            <section>
                <h3>H3-2</h3>
                <div></div>
            </section>
        </div>
    </section>
    <section>
        <h2>H2-2</h2>
        <div>
            <figure><img src="Fig 2.png"></figure>
            <figure><img src="Fig 3.png"></figure>
            <figure><img src="Fig 4.png"></figure>
        </div>
    </section>
    <section>
        <h2>H2-3</h2>
        <div>
            <figure><img src="Fig 5.png"></figure>
            <section>
                <h3>H3-3</h3>
                <div><figure><img src="Fig 6.png"></figure></div>
            </section>
        </div>
    </section>
    <section>
        <h2>H2-4</h2>
        <div>
        </div>
    </section>
    </body>

The output I want is:

H2-1
  H3-1
    Fig 1
H2-2
  Fig 2
  Fig 3
  Fig 4
H2-3
  Fig 5
  H3-3
    Fig 6

I want to show all headings that have a child figure element. I do not want to display a heading if it does not have an immediate child figure element.

The JSoup code I am trying is:

    // Parse the page, then visit every <section> found under <body>.
    Document doc = Jsoup.parse(sourceCode);
    for (Element section : doc.body().getElementsByTag("section"))
    {
      // The first child of the section is taken to be its heading.
      Element heading = section.children().first();
      System.out.println (heading.ownText());

      // NOTE: this lookup is made on the whole section rather than on its
      // direct children, so it presumably also returns figures that sit
      // inside nested sections (Fig 1 is listed under both H2-1 and H3-1
      // in the output shown below).
      for (Element figure : section.getElementsByTag("figure"))
      {
        // Print the image name taken from the src attribute.
        String imageSrc = figure.getElementsByTag("img").attr("src");
        System.out.println (imageSrc);
      }
    }

but I am not getting the desired output. The output that I get is:

H2-1
  Fig 1
  H3-1
    Fig 1
H2-2
  Fig 2
H2-3
  Fig 5
  H3-3
    Fig 6

Any help? I am new to JSoup and appreciate any suggestion or tip that can work.

Thanks in advance.


Solution

  • You can try this

    /**
     * Parses the given HTML and reports on every {@code <section>} element in it.
     *
     * <p>Each element matched by the {@code "section"} selector is handed, in the
     * order the selector returns it, to {@code printSectionDetails}.
     *
     * @param szHTML the HTML source to parse
     */
    private void parseHTML(String szHTML){
        for (Element section : Jsoup.parse(szHTML).select("section")) {
            printSectionDetails(section);
        }
    }
    
    /**
     * Prints the heading text and figure image sources that belong directly to
     * one section.
     *
     * <p>A section with no {@code <figure>} anywhere beneath it prints nothing,
     * so headings of figure-less sections are not shown. Otherwise the section's
     * direct children are visited in order:
     * a child accepted by {@code isHeader} has its text printed, and a
     * {@code <div>} child has the {@code src} of every {@code <img>} inside each
     * of its direct {@code <figure>} children printed. Figures that sit inside a
     * nested section are not direct children of this section's {@code <div>} and
     * are therefore not printed here.
     *
     * @param section the {@code <section>} element to report on
     */
    private void printSectionDetails(Element section){
        // A heading is only wanted when the section actually leads to a figure,
        // either directly or through a nested section.
        if (section.select("figure").isEmpty()) {
            return;
        }

        for (Element child : section.children()) {
            if (isHeader(child.tagName())) {
                System.out.println(child.text());
            } else if ("div".equals(child.tagName())) {
                // Only figures that are direct children of the div are handled
                // here; deeper ones belong to a nested section.
                for (Element grandchild : child.children()) {
                    if ("figure".equals(grandchild.tagName())) {
                        for (Element img : grandchild.select("img")) {
                            System.out.println(img.attr("src"));
                        }
                    }
                }
            }
        }
    }
    
    /**
     * Checks whether a tag name is one of the HTML heading tags {@code h1} to
     * {@code h6}.
     *
     * @param tagName the tag name to test; compared case-insensitively, may be
     *                {@code null}
     * @return {@code true} if {@code tagName} is {@code h1} through {@code h6}
     *         in any letter case; {@code false} otherwise, including for
     *         {@code null}
     */
    private boolean isHeader(String tagName) {
        // One comparison per heading level replaces the hand-written chain,
        // which tested "h1" twice.
        for (int level = 1; level <= 6; level++) {
            if (("h" + level).equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }