Search code examples
python, html, html-entities

Convert HTML entities to Unicode and vice versa


How do you convert HTML entities to Unicode and vice versa in Python?


Solution

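  • You need to have BeautifulSoup installed. The code below imports from the legacy `BeautifulSoup` module (BeautifulSoup 3), which provides `BeautifulStoneSoup`.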

    from BeautifulSoup import BeautifulStoneSoup
    import cgi
    
    def HTMLEntitiesToUnicode(text):
        """Return *text* as a unicode string with its HTML entities decoded.

        For example, '&amp;' becomes '&'.

        The input is handed to BeautifulStoneSoup with entity conversion set
        to ALL_ENTITIES, and the resulting soup is rendered back to unicode.
        """
        soup = BeautifulStoneSoup(
            text,
            convertEntities=BeautifulStoneSoup.ALL_ENTITIES,
        )
        return unicode(soup)
    
    def unicodeToHTMLEntities(text):
        """Convert unicode text to an ASCII byte string using HTML entities.

        For example, '&' becomes '&amp;'.

        '&', '<' and '>' are replaced by their named entities; quote
        characters are left untouched.  The escaped text is then encoded to
        ASCII with the 'xmlcharrefreplace' error handler, so every non-ASCII
        character is emitted as a numeric character reference such as
        '&#174;'.

        Returns the encoded byte string.
        """
        # cgi.escape was deprecated in Python 3.2 and removed in 3.8;
        # html.escape is its replacement.  Fall back to cgi.escape where the
        # html module is not available (Python 2).
        try:
            from html import escape
        except ImportError:
            from cgi import escape
        # quote=False keeps the original behaviour of cgi.escape(text):
        # only '&', '<' and '>' are escaped, quotes are not.
        escaped = escape(text, quote=False)
        return escaped.encode('ascii', 'xmlcharrefreplace')
    
    # Sample input written with HTML entities so that both conversion
    # directions are actually exercised by the round trip below.
    text = "&amp;, &reg;, &lt;, &gt;, &cent;, &pound;, &yen;, &euro;, &sect;, &copy;"

    uni = HTMLEntitiesToUnicode(text)     # entities -> unicode characters
    htmlent = unicodeToHTMLEntities(uni)  # unicode -> ASCII with entities

    # Single-argument print(...) parses on both Python 2 and Python 3.
    print(uni)
    print(htmlent)
    # Expected output:
    # &, ®, <, >, ¢, £, ¥, €, §, ©
    # &amp;, &#174;, &lt;, &gt;, &#162;, &#163;, &#165;, &#8364;, &#167;, &#169;