Search code examples
javascript php html xml entities

Is there some function I can use in PHP or javascript to convert ALL HTML entities to their decimal equivalents?


Question is in the title. I am using jQuery's XML parsing capability to process XML that routinely contains named HTML entities such as `&nbsp;`. These routinely break my application, because jQuery will not recognise the document as valid XML.

To avoid this at the moment I just process the XML using PHP before it is passed to the client side - here is a fragment of my code:

// htmlentities() emits named HTML entities; each str_replace() below rewrites one
// of those names as its decimal numeric character reference. The search string
// must be the entity *name* (e.g. '&pound;') and the replacement the numeric
// form (e.g. '&#163;'); identical search/replace strings would be a no-op.
$fixedmessage = str_replace('&Acirc;', '&#194;', htmlentities($MessageText[$j], ENT_COMPAT, "UTF-8" ));
$fixedmessage = str_replace('&pound;', '&#163;', $fixedmessage);
$fixedmessage = str_replace('&Atilde;', '&#195;', $fixedmessage);
$fixedmessage = str_replace('&iexcl;', '&#161;', $fixedmessage);
$fixedmessage = str_replace('&aacute;', '&#225;', $fixedmessage);
$fixedmessage = str_replace('&iacute;', '&#237;', $fixedmessage);
...

As PHP's htmlentities function seems to be next to useless for all but the absolute basics, I simply run a manual replace on each special character as it becomes an issue, but this is not particularly elegant or, I would imagine, a particularly efficient way of doing things. Is there a better way?


Solution

  • How about decoding them first and then re-encoding in XML mode:

    // Step 1: turn every entity in $str back into the character it stands for.
    //         ENT_HTML5 selects the widest named-entity table and ENT_QUOTES also
    //         decodes quote entities; the charset is explicit so the result does
    //         not depend on the PHP version or the default_charset ini setting.
    // Step 2: re-escape in XML mode. NOTE(review): with ENT_XML1 PHP is expected
    //         to escape only the markup-significant characters (& < > and, with
    //         ENT_QUOTES, both quote styles) and to leave other characters as
    //         literal UTF-8 - confirm against the PHP manual for your version.
    htmlentities(
        html_entity_decode($str, ENT_QUOTES | ENT_HTML5, 'UTF-8'),
        ENT_QUOTES | ENT_XML1,
        'UTF-8');
    

    And here's a straightforward solution:

    /**
     * Rewrite named character entities (e.g. "&nbsp;") as numeric character
     * references (e.g. "&#x00A0;"), using the name => code point table below.
     *
     * Names are matched case-sensitively ("Dagger" and "dagger" differ). Any
     * "&name;" sequence whose name is not in the table is returned untouched,
     * as are existing numeric references such as "&#160;".
     *
     * @param string $str     Text that may contain named entities.
     * @param bool   $decimal When true, emit decimal references ("&#160;");
     *                        the default (false) keeps the hexadecimal form
     *                        ("&#x00A0;") with the table's four-digit padding.
     * @return string The text with every known named entity replaced.
     */
    function decode_named_entities($str, $decimal = false) {
        // Entity name => Unicode code point as a four-digit hexadecimal string.
        static $entities = array( "Aacute"=>"00C1", "aacute"=>"00E1", "Acirc"=>"00C2", "acirc"=>"00E2",
        "acute"=>"00B4", "AElig"=>"00C6", "aelig"=>"00E6", "Agrave"=>"00C0", "agrave"=>"00E0",
        "alefsym"=>"2135", "Alpha"=>"0391", "alpha"=>"03B1", "amp"=>"0026", "and"=>"2227", "ang"=>"2220",
        "apos"=>"0027", "Aring"=>"00C5", "aring"=>"00E5", "asymp"=>"2248", "Atilde"=>"00C3",
        "atilde"=>"00E3", "Auml"=>"00C4", "auml"=>"00E4", "bdquo"=>"201E", "Beta"=>"0392", "beta"=>"03B2",
        "brvbar"=>"00A6", "bull"=>"2022", "cap"=>"2229", "Ccedil"=>"00C7", "ccedil"=>"00E7",
        "cedil"=>"00B8", "cent"=>"00A2", "Chi"=>"03A7", "chi"=>"03C7", "circ"=>"02C6", "clubs"=>"2663",
        "cong"=>"2245", "copy"=>"00A9", "crarr"=>"21B5", "cup"=>"222A", "curren"=>"00A4", "dagger"=>"2020",
        "Dagger"=>"2021", "darr"=>"2193", "dArr"=>"21D3", "deg"=>"00B0", "Delta"=>"0394", "delta"=>"03B4",
        "diams"=>"2666", "divide"=>"00F7", "Eacute"=>"00C9", "eacute"=>"00E9", "Ecirc"=>"00CA",
        "ecirc"=>"00EA", "Egrave"=>"00C8", "egrave"=>"00E8", "empty"=>"2205", "emsp"=>"2003",
        "ensp"=>"2002", "Epsilon"=>"0395", "epsilon"=>"03B5", "equiv"=>"2261", "Eta"=>"0397",
        "eta"=>"03B7", "ETH"=>"00D0", "eth"=>"00F0", "Euml"=>"00CB", "euml"=>"00EB", "euro"=>"20AC",
        "exist"=>"2203", "fnof"=>"0192", "forall"=>"2200", "frac12"=>"00BD", "frac14"=>"00BC",
        "frac34"=>"00BE", "frasl"=>"2044", "Gamma"=>"0393", "gamma"=>"03B3", "ge"=>"2265", "gt"=>"003E",
        "harr"=>"2194", "hArr"=>"21D4", "hearts"=>"2665", "hellip"=>"2026", "Iacute"=>"00CD",
        "iacute"=>"00ED", "Icirc"=>"00CE", "icirc"=>"00EE", "iexcl"=>"00A1", "Igrave"=>"00CC",
        "igrave"=>"00EC", "image"=>"2111", "infin"=>"221E", "int"=>"222B", "Iota"=>"0399", "iota"=>"03B9",
        "iquest"=>"00BF", "isin"=>"2208", "Iuml"=>"00CF", "iuml"=>"00EF", "Kappa"=>"039A", "kappa"=>"03BA",
        "Lambda"=>"039B", "lambda"=>"03BB", "lang"=>"2329", "laquo"=>"00AB", "larr"=>"2190",
        "lArr"=>"21D0", "lceil"=>"2308", "ldquo"=>"201C", "le"=>"2264", "lfloor"=>"230A", "lowast"=>"2217",
        "loz"=>"25CA", "lrm"=>"200E", "lsaquo"=>"2039", "lsquo"=>"2018", "lt"=>"003C", "macr"=>"00AF",
        "mdash"=>"2014", "micro"=>"00B5", "middot"=>"00B7", "minus"=>"2212", "Mu"=>"039C", "mu"=>"03BC",
        "nabla"=>"2207", "nbsp"=>"00A0", "ndash"=>"2013", "ne"=>"2260", "ni"=>"220B", "not"=>"00AC",
        "notin"=>"2209", "nsub"=>"2284", "Ntilde"=>"00D1", "ntilde"=>"00F1", "Nu"=>"039D", "nu"=>"03BD",
        "Oacute"=>"00D3", "oacute"=>"00F3", "Ocirc"=>"00D4", "ocirc"=>"00F4", "OElig"=>"0152",
        "oelig"=>"0153", "Ograve"=>"00D2", "ograve"=>"00F2", "oline"=>"203E", "Omega"=>"03A9",
        "omega"=>"03C9", "Omicron"=>"039F", "omicron"=>"03BF", "oplus"=>"2295", "or"=>"2228",
        "ordf"=>"00AA", "ordm"=>"00BA", "Oslash"=>"00D8", "oslash"=>"00F8", "Otilde"=>"00D5",
        "otilde"=>"00F5", "otimes"=>"2297", "Ouml"=>"00D6", "ouml"=>"00F6", "para"=>"00B6", "part"=>"2202",
        "permil"=>"2030", "perp"=>"22A5", "Phi"=>"03A6", "phi"=>"03C6", "Pi"=>"03A0", "pi"=>"03C0",
        "piv"=>"03D6", "plusmn"=>"00B1", "pound"=>"00A3", "prime"=>"2032", "Prime"=>"2033", "prod"=>"220F",
        "prop"=>"221D", "Psi"=>"03A8", "psi"=>"03C8", "quot"=>"0022", "radic"=>"221A", "rang"=>"232A",
        "raquo"=>"00BB", "rarr"=>"2192", "rArr"=>"21D2", "rceil"=>"2309", "rdquo"=>"201D", "real"=>"211C",
        "reg"=>"00AE", "rfloor"=>"230B", "Rho"=>"03A1", "rho"=>"03C1", "rlm"=>"200F", "rsaquo"=>"203A",
        "rsquo"=>"2019", "sbquo"=>"201A", "Scaron"=>"0160", "scaron"=>"0161", "sdot"=>"22C5",
        "sect"=>"00A7", "shy"=>"00AD", "Sigma"=>"03A3", "sigma"=>"03C3", "sigmaf"=>"03C2", "sim"=>"223C",
        "spades"=>"2660", "sub"=>"2282", "sube"=>"2286", "sum"=>"2211", "sup"=>"2283", "sup1"=>"00B9",
        "sup2"=>"00B2", "sup3"=>"00B3", "supe"=>"2287", "szlig"=>"00DF", "Tau"=>"03A4", "tau"=>"03C4",
        "there4"=>"2234", "Theta"=>"0398", "theta"=>"03B8", "thetasym"=>"03D1", "thinsp"=>"2009",
        "THORN"=>"00DE", "thorn"=>"00FE", "tilde"=>"02DC", "times"=>"00D7", "trade"=>"2122",
        "Uacute"=>"00DA", "uacute"=>"00FA", "uarr"=>"2191", "uArr"=>"21D1", "Ucirc"=>"00DB",
        "ucirc"=>"00FB", "Ugrave"=>"00D9", "ugrave"=>"00F9", "uml"=>"00A8", "upsih"=>"03D2",
        "Upsilon"=>"03A5", "upsilon"=>"03C5", "Uuml"=>"00DC", "uuml"=>"00FC", "weierp"=>"2118",
        "Xi"=>"039E", "xi"=>"03BE", "Yacute"=>"00DD", "yacute"=>"00FD", "yen"=>"00A5", "yuml"=>"00FF",
        "Yuml"=>"0178", "Zeta"=>"0396", "zeta"=>"03B6", "zwj"=>"200D", "zwnj"=>"200C");

        // A name starts with a letter but may continue with digits: frac12,
        // frac14, frac34, sup1, sup2, sup3 and there4 are all in the table, and
        // a letters-only pattern would silently skip every one of them.
        return preg_replace_callback('~&([A-Za-z][A-Za-z0-9]*);~',
            function($m) use($entities, $decimal) {
                $name = $m[1];
                if (!isset($entities[$name])) {
                    // Unknown name: hand back the matched text unchanged.
                    return $m[0];
                }
                if ($decimal) {
                    // hexdec() turns the stored hex digits into the integer code point.
                    return '&#' . hexdec($entities[$name]) . ';';
                }
                return "&#x{$entities[$name]};";
            },
            $str
        );
    }
    

    A faster way would be to generate two arrays from the above:

    $search = ["&Aacute;", "&aacute;" etc
    $replac = ["&#x00C1;", "&#x00E1;" etc
    

    and apply str_replace.