Search code examples
html, xml, xslt, ibm-watson, watson-explorer

Avoid crawling a portion of the HTML using HTML-XSL parsing


I'm using Watson Explorer FC 11.0.2 and I'm trying to keep some HTML tags away from the Watson crawlers. At the moment I'm using an XSLT parser to extract the metadata, title, and body from an HTML page with the following path:

"/html/body/div[@class='page-wrapper']/div[@id='main']/ul[@class='sidebar grid-25']"

The parser that I'm using is the following:

<!-- Root template: emits a <document> element containing the title, one <content> per usable meta tag, and the body snippet. -->
<xsl:template match="/">
<document>

<!-- NOTE(review): xsl:apply-templates defines "select" (and "mode"), not "match"; confirm how the processor treats this attribute. -->
<xsl:apply-templates match="h2[@class='entry-title']" />

<xsl:for-each select="html/head/meta">

<!-- Keep only meta tags that have a non-empty name and a content attribute different from the literal string 'null'. -->
<xsl:if test="@name != '' and @content != 'null'">
<content>
<!-- The meta tag's name becomes the name attribute of the emitted <content>; its content becomes the text. -->
<xsl:attribute name="name">
<xsl:value-of select="@name" />
</xsl:attribute>
<xsl:value-of select="@content" />
</content>
</xsl:if>

</xsl:for-each>

<!-- NOTE(review): same "match" vs "select" concern as for the title above. -->
<xsl:apply-templates match="div[@class='entry-content']" />

</document>

<!-- NOTE(review): this call sits after the closing </document> tag, so anything it produces lands outside the document element. -->
<xsl:apply-templates match="ul[@class='sidebar grid-25']" />


</xsl:template>

<!-- Title template: writes the string value of the matched h2 into a <content name="title"> element. -->
<xsl:template match="h2[@class='entry-title']">
<content name="title">
<xsl:value-of select="." />
</content>
</xsl:template>

<!-- Body template: writes the string value of the matched div into a <content name="snippet"> element. -->
<!-- The weight, output-action and type attributes are copied to the output as written here. -->
<xsl:template match="div[@class='entry-content']">
<content name="snippet" weight="1" output-action="summarize" type="html">
<xsl:value-of select="." />
</content>
</xsl:template>


<!-- Sidebar template: outputs the string value (concatenated text) of the matched ul, with no wrapping element. -->
<xsl:template match="ul[@class='sidebar grid-25']">
<xsl:value-of select="." />
</xsl:template>

So, how can I handle this problem? I really don't know where I have to insert the "xsl:apply-templates" element inside my parser to reach this goal.

Thanks in advance, guys!


Solution

  • One way to clean up the HTML tags from the body using XSLT is to use Tidy from org.w3c.

    e.g.

    <!--
        Root template: emits a <document> element containing the title, one <content>
        per usable meta tag, and the body snippet.

        xsl:apply-templates takes a "select" attribute ("match" belongs to xsl:template),
        and the context node here is the document root, so the title and body elements
        are selected with descendant paths (//) in order to reach them wherever they are
        nested inside the page.
    -->
    <xsl:template match="/">
        <document>
            <xsl:apply-templates select="//h2[@class='entry-title']" />
    
            <xsl:for-each select="html/head/meta">
                <!-- Keep only meta tags that have a non-empty name and a content attribute different from the literal string 'null'. -->
                <xsl:if test="@name != '' and @content != 'null'">
                    <content>
                        <xsl:attribute name="name">
                            <xsl:value-of select="@name" />
                        </xsl:attribute>
                        <xsl:value-of select="@content" />
                    </content>
                </xsl:if>
            </xsl:for-each>
    
            <xsl:apply-templates select="//div[@class='entry-content']" />
        </document>
    
        <!--
            NOTE(review): evaluated from the document root, this relative path only matches
            a ul that is a direct child of the root, so for a normal html page it selects
            nothing. It also sits outside <document>. Confirm whether the sidebar should be
            emitted at all before changing it.
        -->
        <xsl:apply-templates select="ul[@class='sidebar grid-25']" />
    </xsl:template>
    
    <!--
        Title template: passes the matched h2 to the htmlparser:parseHTMLtoDocument extension
        function and writes the string value of the result into <content name="title">.
        Assumes the htmlparser prefix is bound to the extension class in the stylesheet
        header, which is not shown here.
    -->
    <xsl:template match="h2[@class='entry-title']">
        <content name="title">
            <xsl:value-of select="htmlparser:parseHTMLtoDocument(.)" />
        </content>
    </xsl:template>
    
    <!--
        Body template: passes the matched div to the htmlparser:parseHTMLtoDocument extension
        function and writes the string value of the result into <content name="snippet">.
        Assumes the htmlparser prefix is bound to the extension class in the stylesheet
        header, which is not shown here.
    -->
    <xsl:template match="div[@class='entry-content']">
        <content name="snippet" weight="1" output-action="summarize" type="html">
            <xsl:value-of select="htmlparser:parseHTMLtoDocument(.)" />
        </content>
    </xsl:template>
    
    <!--
        Sidebar template: passes the matched ul to the htmlparser:parseHTMLtoDocument extension
        function and outputs the string value of the result with no wrapping element.
        Assumes the htmlparser prefix is bound to the extension class in the stylesheet
        header, which is not shown here.
    -->
    <xsl:template match="ul[@class='sidebar grid-25']">
        <xsl:value-of select="htmlparser:parseHTMLtoDocument(.)" />
    </xsl:template>
    

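    You can create a class named com.xyz.commons.xsl.HtmlDocumentParser like the one below and call its method from the stylesheet; the htmlparser namespace prefix must be bound to that class (the exact namespace URI syntax depends on your XSLT processor):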

    /**
     * Helper with static methods that wrap an HTML fragment in a minimal HTML page, run it
     * through Tidy and return the resulting DOM document.
     */
    public class HtmlDocumentParser {
        private static final Logger log = Logger.getLogger(HtmlDocumentParser.class);
        // Writer handed to Tidy as its error output; constructed with Level.WARN.
        private static final Log4jPrintWriter log4j = new Log4jPrintWriter(log, Level.WARN);

        /**
         * Parses an HTML fragment using UTF-8 as both the input and the output encoding.
         *
         * @param input the HTML fragment to clean up; {@code null} is treated as an empty fragment
         * @return the DOM document returned by Tidy
         */
        public static Document parseHTMLtoDocument(final String input) {
            return parseHTMLtoDocument(input, "UTF-8");
        }

        /**
         * Parses an HTML fragment with the given encoding.
         *
         * <p>The fragment is placed inside the body of a minimal HTML page, encoded to bytes with
         * {@code encoding}, and parsed by a freshly configured {@code Tidy} instance.
         *
         * @param input the HTML fragment to clean up; {@code null} is treated as an empty fragment
         * @param encoding charset name used for Tidy's input and output encoding and for turning
         *     the wrapped page into bytes
         * @return the DOM document returned by Tidy
         */
        public static Document parseHTMLtoDocument(final String input, final String encoding) {
            // String.format renders a null argument as the literal text "null", which would end up
            // as visible body text in the parsed document; use an empty fragment instead.
            final String fragment = (input == null) ? "" : input;

            final String htmlInput = String
                    .format("<!DOCTYPE HTML><html>\n<head>\n<title>\n</title>\n</head>\n<body>\n%s</body></html>", fragment);

            final Tidy tidy = new Tidy();
            tidy.setInputEncoding(encoding);
            tidy.setOutputEncoding(encoding);
            // Output format switches.
            tidy.setXHTML(true);
            tidy.setXmlOut(true);
            // Clean-up switches (see the JTidy documentation for the exact effect of each flag).
            tidy.setEncloseBlockText(true);
            tidy.setEncloseText(true);
            tidy.setMakeBare(true);
            tidy.setMakeClean(true);
            tidy.setWord2000(true);
            tidy.setDropFontTags(true);
            // Diagnostics: quiet mode on, error output redirected to the log4j-backed writer.
            tidy.setQuiet(true);
            tidy.setErrout(log4j);

            final byte[] htmlBytes = htmlInput.getBytes(Charset.forName(encoding));
            // No output stream is supplied (null); only the returned DOM is used.
            return tidy.parseDOM(new ByteArrayInputStream(htmlBytes), null);
        }
    }