Search code examples
html xml xslt unicode percent-encoding

How do I convert HTML percent-encoding to Unicode, with XSLT?


There are tons of entries and answers online about this, but they all go in the opposite direction of what I need. From my iTunes XML, I have thousands of percent-encoded entries, in multiple languages, that I'm trying to convert to Unicode text with an XSLT stylesheet. Is there any function or process that I'm missing, other than tracking down every single character and doing a replace? Here is a small sample showing the variety I'm working with: in each pair, the first line is the XML string value, and the following line is the plain text that I'm trying to generate and output to a text file.

<string>/iTunes/iTunes%20Music/Droit%20devant/L'odysse%CC%81e.mp3</string>

/iTunes/iTunes Music/Droit devant/L'odyssée.mp3

<string>A%CC%80%20la%20Pe%CC%82che</string>

À la Pêche

<string>%D0%97%D0%B0%D0%BF%D0%BE%D0%BC%D0%B8%D0%BD%D0%B0%D0%B8%CC%86</string>

Запоминай

<string>%CE%9A%CE%BF%CC%81%CF%84%CF%83%CC%8C%CE%B1%CF%81%CE%B9</string>

Κότσ̌αρι

This last one may not display properly for some, because of the overstriking hacek/caron.

Thanks in advance for any advice or leads


Solution

  • A pure XSLT 2.0 solution could make use of the string-to-codepoints() and the codepoints-to-string() functions. The UTF-8 decoding is a bit messy, but it can be done.

    This XSLT 2.0 style-sheet...

    <xsl:stylesheet version="2.0"
      xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      xmlns:xs="http://www.w3.org/2001/XMLSchema"
      xmlns:so="http://stackoverflow.com/questions/13768754"
      exclude-result-prefixes="xsl xs so">
    <!-- Serialize the result as UTF-8, without an XML declaration, and let the
         processor indent the output. -->
    <xsl:output encoding="UTF-8" omit-xml-declaration="yes" indent="yes" />
    <!-- Strip whitespace-only text nodes from every element of the source document. -->
    <xsl:strip-space elements="*"/>
    
    <!-- Code points of the characters '0' and 'A', in that order. They are the
         bases from which the value of a hexadecimal digit is computed:
         item 1 for the digits 0-9, item 2 for the letters A-F. -->
    <xsl:variable name="cp-base" as="xs:integer+"
      select="string-to-codepoints('0'), string-to-codepoints('A')" />
    
    <!-- Identity rule: copy every attribute and node unchanged unless a more
         specific template matches it. Attributes are processed first, then
         the children, which is the same as document order. -->
    <xsl:template match="@* | node()">
      <xsl:copy>
        <xsl:apply-templates select="@*" />
        <xsl:apply-templates />
      </xsl:copy>
    </xsl:template>
    
    <!--
      so:utf8decode( $bytes as xs:integer* ) as xs:integer*

      Decodes a sequence of UTF-8 byte values into the sequence of Unicode
      code points they encode. The function consumes one character (1 to 4
      bytes) from the front of $bytes and recurses on the remainder.

      Bytes that cannot start a character are skipped rather than decoded:
        * 0         (the null character is not a legal XML character),
        * 128-191   (continuation bytes with no preceding lead byte),
        * 248-255   (not a valid UTF-8 lead byte).

      Limitations: continuation bytes are not checked for the 10xxxxxx form,
      and overlong or surrogate encodings are not rejected. If a multi-byte
      sequence is cut short by the end of input, the arithmetic on the
      missing bytes yields the empty sequence, so nothing is emitted for it.
    -->
    <xsl:function name="so:utf8decode" as="xs:integer*">
      <xsl:param name="bytes" as="xs:integer*" />
      <xsl:choose>
        <xsl:when test="empty($bytes)" />
        <xsl:when test="$bytes[1] eq 0">
          <!-- The null character is not valid for XML: drop it. -->
          <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
        </xsl:when>
        <xsl:when test="$bytes[1] le 127">
          <!-- 0xxxxxxx: single-byte character, the byte is the code point. -->
          <xsl:sequence select="$bytes[1], so:utf8decode( subsequence( $bytes, 2))" />
        </xsl:when>
        <xsl:when test="$bytes[1] lt 192">
          <!-- 10xxxxxx: a continuation byte in lead position. Decoding it as
               a 2-byte lead would give a negative code point, so skip it. -->
          <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
        </xsl:when>
        <xsl:when test="$bytes[1] lt 224">
          <!-- 110xxxxx 10xxxxxx: 2-byte character. -->
          <xsl:sequence select="
          ((($bytes[1] - 192) * 64) +
            ($bytes[2] - 128)        ),
            so:utf8decode( subsequence( $bytes, 3))" />
        </xsl:when>
        <xsl:when test="$bytes[1] lt 240">
          <!-- 1110xxxx 10xxxxxx 10xxxxxx: 3-byte character. -->
          <xsl:sequence select="
          ((($bytes[1] - 224) * 4096) +
           (($bytes[2] - 128) *   64) +
            ($bytes[3] - 128)          ),
            so:utf8decode( subsequence( $bytes, 4))" />
        </xsl:when>
        <xsl:when test="$bytes[1] lt 248">
          <!-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 4-byte character. -->
          <xsl:sequence select="
          ((($bytes[1] - 240) * 262144) +
           (($bytes[2] - 128) *   4096) +
           (($bytes[3] - 128) *     64) +
            ($bytes[4] - 128)            ),
            so:utf8decode( subsequence( $bytes, 5))" />
        </xsl:when>
        <xsl:otherwise>
          <!-- 248-255 is not a valid UTF-8 lead byte: drop it. -->
          <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
        </xsl:otherwise>
      </xsl:choose>
    </xsl:function>
    
    <!-- Percent-decodes the text of every <string> element. Each maximal run
         of %XX escapes is converted to byte values, decoded as UTF-8 by
         so:utf8decode and written out as characters; all other text is
         copied through unchanged. -->
    <xsl:template match="string/text()">
      <!-- The regex attribute is an attribute value template, hence the
           doubled braces around the quantifier. -->
      <xsl:analyze-string select="." regex="(%[0-9A-F]{{2}})+" flags="i">
        <xsl:matching-substring>
          <!-- One integer per %XX escape in the matched run. -->
          <xsl:variable name="octets" as="xs:integer+">
            <xsl:analyze-string select="." regex="%([0-9A-F]{{2}})" flags="i">
              <xsl:matching-substring>
                <!-- Values of the two hex digits: a digit below 'A' is
                     offset from '0', a letter is offset from 'A' plus 10. -->
                <xsl:variable name="digits" as="xs:integer+" select="
                  for $digit in string-to-codepoints( upper-case( regex-group(1))) return
                    $digit - (if ($digit lt $cp-base[2])
                                then $cp-base[1]
                                else $cp-base[2] - 10)" />
                <xsl:sequence select="16 * $digits[1] + $digits[2]" />
              </xsl:matching-substring>
            </xsl:analyze-string>
          </xsl:variable>
          <xsl:value-of select="codepoints-to-string( so:utf8decode( $octets))" />
        </xsl:matching-substring>
        <xsl:non-matching-substring>
          <xsl:value-of select="." />
        </xsl:non-matching-substring>
        <xsl:fallback>
          <!-- An XSLT 1.0 processor in forwards-compatible mode does not
               know xsl:analyze-string: output the text as it is. -->
          <xsl:value-of select="." />
        </xsl:fallback>
      </xsl:analyze-string>
    </xsl:template>
    
    </xsl:stylesheet>
    

    ...applied to this input...

    <doc>
        <string>/iTunes/iTunes%20Music/Droit%20devant/L'odysse%CC%81e.mp3</string>
        <string>A%Cc%80%20la%20Pe%CC%82che</string>
        <string>%D0%97%D0%B0%D0%BF%D0%BE%D0%BC%D0%B8%D0%BD%D0%B0%D0%B8%CC%86</string>
        <string>%CE%9A%CE%BF%CC%81%CF%84%CF%83%CC%8C%CE%B1%CF%81%CE%B9</string>
    </doc>
    

    ...yields...

    <doc>
       <string>/iTunes/iTunes Music/Droit devant/L'odyssée.mp3</string>
       <string>À la Pêche</string>
       <string>Запоминай</string>
       <string>Κότσ̌αρι</string>
    </doc>