Search code examples
java-me lwuit

HTML text extraction in j2me without any HTML tags and special characters


I have a string from an RSS file after parsing.

String htmlString=

<p><img border="1" align="left" width="200" vspace="2" hspace="2" height="133" alt="Prime Minister Manmohan Singh will leave for Iran on August   28, 2012 to attend the Non-Aligned Movement summit, which will   be preceded by crucial bilateral talks with Iran&rsquo;s supreme   leader Ayotollah Ali Khamenei and Iranian President Mahmoud   Ahmadinejad." src="/tmdbuserfiles/manmohan ahmadi(3).jpg" />Prime Minister summit, which will be preceded by crucial bilateral talks with Iran&rsquo;s supreme leader place at a time when the U.S. is pushing India to reduce engagement with Iran and implement sanctions imposed by some countries over its controversial nuclear programme.<br />
    <br />
    &nbsp;</p>

I need to display the text from the above htmlString on my LWUIT Form without any HTML tags or HTML special characters, like this:

Prime Minister Manmohan Singh will leave for Iran on August 28, 2012 to attend the Non-Aligned Movement summit, which will
be preceded by crucial bilateral talks with Iran supreme leader Ayotollah Ali Khamenei and Iranian ... etc.?

Solution

  • It also helps to open the HttpConnection input stream with UTF-8 encoding, like this:

    String encoding = "UTF-8";
    Reader reader = new InputStreamReader(in, encoding);
    

    Use this suite of string utilities to get clean, well-formatted text:

     /**
     * Removes HTML tags from the given string and decodes the entities handled
     * by {@code decodeEntities}.
     *
     * <p>Closing paragraph tags and line-break tags ({@code </p>}, {@code <br>},
     * {@code <br/>}, {@code <br />}) are turned into {@code "\r\n"} before the
     * remaining tags are stripped. The stripped text is trimmed and then
     * entity-decoded.
     *
     * <p>Text following the last tag is kept. A {@code '<'} that is never closed
     * by a {@code '>'} is treated as literal text instead of as the start of a
     * tag, so malformed input cannot stall the scan.
     *
     * @param text  Input parameter containing HTML tags (eg. {@code <b>cat</b>});
     *              may be {@code null}
     * @return      String without HTML tags (eg. cat); {@code null} if
     *              {@code text} is {@code null}; the unmodified input if an
     *              unexpected error occurs
     */
    public static String removeHtml(String text) {
        if (text == null) {
            return null;
        }
        try {
            if (text.indexOf('<') == -1) {
                // No markup at all: only entities can need decoding.
                return decodeEntities(text);
            }

            // Turn block/line-break markup into real line breaks before stripping.
            String htmlText = StringUtils.replace(text, "</p>", "\r\n");
            htmlText = StringUtils.replace(htmlText, "<br />", "\r\n");
            htmlText = StringUtils.replace(htmlText, "<br/>", "\r\n");
            htmlText = StringUtils.replace(htmlText, "<br>", "\r\n");

            StringBuffer plainText = new StringBuffer(htmlText.length());
            int pos = 0;
            int tagStart = htmlText.indexOf('<', pos);
            while (tagStart >= 0) {
                int tagEnd = htmlText.indexOf('>', tagStart);
                if (tagEnd == -1) {
                    // Unterminated '<': not a tag, keep the rest as plain text.
                    break;
                }
                plainText.append(htmlText.substring(pos, tagStart));
                pos = tagEnd + 1;
                tagStart = htmlText.indexOf('<', pos);
            }
            // Whatever follows the last complete tag is still content.
            plainText.append(htmlText.substring(pos));

            return decodeEntities(plainText.toString().trim());
        } catch (Exception e) {
            // Best effort: report the problem and hand back the input untouched.
            System.err.println("Error while removing HTML: " + e.toString());
            return text;
        }
    }
    
    /**
     * Decodes a small, fixed set of HTML entities into plain characters.
     *
     * <p>Handled entities: {@code &lt;}, {@code &gt;}, {@code &nbsp;},
     * {@code &auml;}, {@code &ouml;}, {@code &quot;}, the typographic quotes
     * {@code &lsquo;}, {@code &rsquo;}, {@code &ldquo;}, {@code &rdquo;}
     * (mapped to their plain ASCII counterparts), {@code &#xd;} and
     * {@code &amp;}. The non-standard names {@code &lquot;} and {@code &rquot;}
     * are still accepted for backward compatibility. Any other entity is left
     * as it is.
     *
     * @param html  text that may contain HTML entities
     * @return      the text with the entities listed above replaced
     */
    public static String decodeEntities(String html) {
        String result = StringUtils.replace(html, "&lt;", "<");
        result = StringUtils.replace(result, "&gt;", ">");
        result = StringUtils.replace(result, "&nbsp;", " ");
        // Unicode escapes keep the source independent of the file encoding.
        result = StringUtils.replace(result, "&auml;", "\u00e4");
        result = StringUtils.replace(result, "&ouml;", "\u00f6");
        // &quot; is the double quotation mark, not an apostrophe.
        result = StringUtils.replace(result, "&quot;", "\"");
        result = StringUtils.replace(result, "&lsquo;", "'");
        result = StringUtils.replace(result, "&rsquo;", "'");
        result = StringUtils.replace(result, "&ldquo;", "\"");
        result = StringUtils.replace(result, "&rdquo;", "\"");
        // Non-standard names kept so existing inputs decode as before.
        result = StringUtils.replace(result, "&lquot;", "'");
        result = StringUtils.replace(result, "&rquot;", "'");
        result = StringUtils.replace(result, "&#xd;", "\r");
        // &amp; must be decoded last; otherwise an escaped entity such as
        // "&amp;auml;" would be decoded twice.
        result = StringUtils.replace(result, "&amp;", "&");
        return result;
    }
    
    /**
     * Replaces all instances of a String in a String.
     *
     * <p>Matches are found left to right and do not overlap; text inserted as a
     * replacement is never searched again.
     *
     * @param  s  String to alter; returned unchanged if {@code null}.
     * @param  f  String to look for; if {@code null} or empty, {@code s} is
     *            returned unchanged.
     * @param  r  String to replace it with, or {@code null} to just remove it.
     * @return    {@code s} with every occurrence of {@code f} replaced by
     *            {@code r}, or {@code s} itself when there is nothing to replace.
     */
    public static String replace(String s, String f, String r) {
        // An empty search string matches at every index and would never finish.
        if (s == null || f == null || f.length() == 0) {
            return s;
        }
        if (r == null) {
            r = "";
        }
        int match = s.indexOf(f);
        if (match == -1) {
            return s;
        }
        // Build the result in one pass instead of re-copying the string per match.
        StringBuffer out = new StringBuffer(s.length());
        int from = 0;
        while (match != -1) {
            out.append(s.substring(from, match)).append(r);
            from = match + f.length();
            match = s.indexOf(f, from);
        }
        out.append(s.substring(from));
        return out.toString();
    }
    
    /**
     * Re-reads the bytes of a string through a UTF-8 reader and rebuilds a
     * string from the low byte of every character that was read.
     *
     * <p>The input is converted to bytes and the collected bytes are converted
     * back to a string using the platform default charset in both directions.
     * NOTE(review): presumably meant to repair text that was decoded with the
     * wrong charset - confirm against callers.
     *
     * @param str  the string to re-decode; must not be {@code null}
     * @return     the re-decoded string, or {@code str} itself if reading fails
     */
    public static String cleanEncodedString(String str) {
        final InputStream source = new ByteArrayInputStream(str.getBytes());
        try {
            final InputStreamReader utf8Reader = new InputStreamReader(source, "UTF-8");
            final ByteArrayOutputStream collected = new ByteArrayOutputStream();
            for (int ch = utf8Reader.read(); ch != -1; ch = utf8Reader.read()) {
                // Only the low eight bits of each decoded character are kept.
                collected.write((byte) ch);
            }
            return collected.toString();
        } catch (Exception readFailure) {
            readFailure.printStackTrace();
            return str;
        }
    }