Search code examples
phpfileglob

PHP glob directory UTF-8


I am trying to get all files within a variable path. At the moment I am using glob(), but I cannot find a way to open a path which contains ä, ü or ö in it.

I have tried a lot of decoding/encoding, but none of it seems to work. When I replace the ä, ü, ö in both the file name and the variable, I get the right answer, so I guess it has something to do with UTF-8 characters.

PHP script

// Get the variable via an Ajax post.
// NOTE: $country is not set anywhere in this snippet; it is assumed to be
// defined earlier in the script.
// A missing or non-string "town" is treated as an empty (invalid) value.
$town = (isset($_POST['town']) && is_string($_POST['town'])) ? $_POST['town'] : '';       // For example "Bankdrücken"

// "town" comes from the client and is inserted into a filesystem path, so
// refuse anything that could address a different directory.
$town_is_safe = $town !== ''
    && $town !== '.'
    && $town !== '..'
    && strpos($town, '/') === false
    && strpos($town, '\\') === false
    && strpos($town, "\0") === false;

$files = array();
if ($town_is_safe) {
    $pfad = 'maps/'. $country.'/'. $town .'/*.jpg';

    // glob() returns false on error; keep the empty list in that case so
    // that count() below always receives an array.
    $matches = glob($pfad);
    if ($matches !== false) {
        $files = $matches;
    }
}
$n_files = count($files);

    $erg = new stdClass();
    $erg->files = $files;
    $erg->n_files = $n_files;

    // json_encode() escapes non-ASCII characters as \uXXXX by default. That is
    // still valid JSON and decodes back to the original characters on the client.
    echo json_encode($erg);
    return;

JavaScript

.success(function(data){
    console.log(data);
...

This will not return .../Bankdrücken, but .../Bankdr\u00fccken

When I do jQuery.parseJSON(data); it returns ./Bankdrücken, but that happens on the client side, and by then the server side has already failed. I just can't get it right.

Here is a real example: if the $variable within the file path contains ä, ö or ü, glob() will not return any files. // No, I am printing out what I get back from the Ajax call...

Trial One (with ü):

"stdClass Object ( [files] => Array ( )

[n_files] => 0
[filename] => workout_uebungen/Brust/Liegestützen/

) " /******************************************/ Trial Two (I changed the file name from Liegestützen => Liegestuetzen and the $var from Liegestützen => Liegestuetzen):

"stdClass Object ( [files] => Array ( [0] => workout_uebungen/Brust/Liegestuetzen/1.jpg [1] => workout_uebungen/Brust/Liegestuetzen/2.jpg )

[n_files] => 2
[filename] => workout_uebungen/Brust/Liegestuetzen/

) "

So trial 2 gives the right answer, but I don't want to rename all my files, and it would also feel like a dirty solution to me.


Solution

  • In that case you can use:

    <?php
      /**
       * Heuristic converters between Windows-1252 / ISO-8859-1 and UTF-8.
       *
       * All methods are static and work on raw bytes, so no extension
       * (mbstring, iconv) is required. The code uses only syntax that is
       * valid on both old and current PHP versions.
       */
      class Encoding
      {
          /**
           * Windows-1252 byte value (0x80-0x9F) => UTF-8 sequence of the
           * character that Windows-1252 assigns to that byte.
           */
          protected static $win1252ToUtf8 = array(
              128 => "\xe2\x82\xac",

              130 => "\xe2\x80\x9a",
              131 => "\xc6\x92",
              132 => "\xe2\x80\x9e",
              133 => "\xe2\x80\xa6",
              134 => "\xe2\x80\xa0",
              135 => "\xe2\x80\xa1",
              136 => "\xcb\x86",
              137 => "\xe2\x80\xb0",
              138 => "\xc5\xa0",
              139 => "\xe2\x80\xb9",
              140 => "\xc5\x92",

              142 => "\xc5\xbd",


              145 => "\xe2\x80\x98",
              146 => "\xe2\x80\x99",
              147 => "\xe2\x80\x9c",
              148 => "\xe2\x80\x9d",
              149 => "\xe2\x80\xa2",
              150 => "\xe2\x80\x93",
              151 => "\xe2\x80\x94",
              152 => "\xcb\x9c",
              153 => "\xe2\x84\xa2",
              154 => "\xc5\xa1",
              155 => "\xe2\x80\xba",
              156 => "\xc5\x93",

              158 => "\xc5\xbe",
              159 => "\xc5\xb8"
          );

          /**
           * UTF-8 encoding of a C1 control code point (what you get when a
           * Windows-1252 byte is converted as if it were ISO-8859-1)
           * => UTF-8 sequence of the intended Windows-1252 character.
           */
          protected static $brokenUtf8ToUtf8 = array(
              "\xc2\x80" => "\xe2\x82\xac",

              "\xc2\x82" => "\xe2\x80\x9a",
              "\xc2\x83" => "\xc6\x92",
              "\xc2\x84" => "\xe2\x80\x9e",
              "\xc2\x85" => "\xe2\x80\xa6",
              "\xc2\x86" => "\xe2\x80\xa0",
              "\xc2\x87" => "\xe2\x80\xa1",
              "\xc2\x88" => "\xcb\x86",
              "\xc2\x89" => "\xe2\x80\xb0",
              "\xc2\x8a" => "\xc5\xa0",
              "\xc2\x8b" => "\xe2\x80\xb9",
              "\xc2\x8c" => "\xc5\x92",

              "\xc2\x8e" => "\xc5\xbd",


              "\xc2\x91" => "\xe2\x80\x98",
              "\xc2\x92" => "\xe2\x80\x99",
              "\xc2\x93" => "\xe2\x80\x9c",
              "\xc2\x94" => "\xe2\x80\x9d",
              "\xc2\x95" => "\xe2\x80\xa2",
              "\xc2\x96" => "\xe2\x80\x93",
              "\xc2\x97" => "\xe2\x80\x94",
              "\xc2\x98" => "\xcb\x9c",
              "\xc2\x99" => "\xe2\x84\xa2",
              "\xc2\x9a" => "\xc5\xa1",
              "\xc2\x9b" => "\xe2\x80\xba",
              "\xc2\x9c" => "\xc5\x93",

              "\xc2\x9e" => "\xc5\xbe",
              "\xc2\x9f" => "\xc5\xb8"
          );

          /**
           * UTF-8 sequence => single Windows-1252 byte (0x80-0x9F).
           * Inverse of $win1252ToUtf8.
           */
          protected static $utf8ToWin1252 = array(
              "\xe2\x82\xac" => "\x80",

              "\xe2\x80\x9a" => "\x82",
              "\xc6\x92"     => "\x83",
              "\xe2\x80\x9e" => "\x84",
              "\xe2\x80\xa6" => "\x85",
              "\xe2\x80\xa0" => "\x86",
              "\xe2\x80\xa1" => "\x87",
              "\xcb\x86"     => "\x88",
              "\xe2\x80\xb0" => "\x89",
              "\xc5\xa0"     => "\x8a",
              "\xe2\x80\xb9" => "\x8b",
              "\xc5\x92"     => "\x8c",

              "\xc5\xbd"     => "\x8e",


              "\xe2\x80\x98" => "\x91",
              "\xe2\x80\x99" => "\x92",
              "\xe2\x80\x9c" => "\x93",
              "\xe2\x80\x9d" => "\x94",
              "\xe2\x80\xa2" => "\x95",
              "\xe2\x80\x93" => "\x96",
              "\xe2\x80\x94" => "\x97",
              "\xcb\x9c"     => "\x98",
              "\xe2\x84\xa2" => "\x99",
              "\xc5\xa1"     => "\x9a",
              "\xe2\x80\xba" => "\x9b",
              "\xc5\x93"     => "\x9c",

              "\xc5\xbe"     => "\x9e",
              "\xc5\xb8"     => "\x9f"
          );

          /**
           * Convert a string (or every element of an array, recursively) to
           * UTF-8, leaving byte sequences that already look like UTF-8 alone.
           *
           * Bytes that are not part of a UTF-8-shaped sequence are assumed to
           * be Windows-1252 / ISO-8859-1 and are re-encoded.
           *
           * The detection is heuristic. A byte in 0xC0-0xF7 that happens to be
           * followed by the matching number of bytes in 0x80-0xBF cannot be
           * told apart from real UTF-8 and is left unchanged. For example the
           * Latin-1 bytes 0xC9 0xBB ("É" followed by "»") form a valid 2-byte
           * UTF-8 sequence and are therefore not converted.
           *
           * @param mixed $text String, array of strings, or any other value.
           * @return mixed UTF-8 string / array of UTF-8 strings; values that are
           *               neither string nor array are returned unchanged.
           */
          public static function toUTF8($text)
          {
              if (is_array($text)) {
                  foreach ($text as $k => $v) {
                      $text[$k] = self::toUTF8($v);
                  }
                  return $text;
              }

              if (!is_string($text)) {
                  return $text;
              }

              $max = strlen($text);
              $buf = '';
              for ($i = 0; $i < $max; $i++) {
                  $byte = ord($text[$i]);

                  // Plain ASCII never needs conversion.
                  if ($byte < 0x80) {
                      $buf .= $text[$i];
                      continue;
                  }

                  // 0x80-0xBF outside a sequence: a stray single-byte character.
                  // Prefer the Windows-1252 meaning, fall back to ISO-8859-1.
                  if ($byte < 0xC0) {
                      $buf .= isset(self::$win1252ToUtf8[$byte])
                          ? self::$win1252ToUtf8[$byte]
                          : self::byteToUtf8($byte);
                      continue;
                  }

                  // 0xC0-0xFF: possible UTF-8 lead byte.
                  $extra = self::continuationCount($byte);
                  if ($extra > 0 && self::hasContinuationBytes($text, $i + 1, $extra)) {
                      // Looks like UTF-8 already: copy the WHOLE sequence and
                      // skip all of its continuation bytes (this includes the
                      // fourth byte of 4-byte sequences).
                      $buf .= substr($text, $i, $extra + 1);
                      $i += $extra;
                  } else {
                      // Not UTF-8: treat the byte as a single ISO-8859-1 character.
                      $buf .= self::byteToUtf8($byte);
                  }
              }

              return $buf;
          }

          /**
           * Convert a string (or array of strings, recursively) to Windows-1252.
           *
           * The input is first normalised with toUTF8(). Characters that exist
           * neither in ISO-8859-1 nor in the Windows-1252 table become "?".
           *
           * @param mixed $text String, array of strings, or any other value.
           * @return mixed Converted string / array; other values unchanged.
           */
          public static function toWin1252($text)
          {
              if (is_array($text)) {
                  foreach ($text as $k => $v) {
                      $text[$k] = self::toWin1252($v);
                  }
                  return $text;
              }

              if (!is_string($text)) {
                  return $text;
              }

              return self::utf8ToLatin1(
                  str_replace(
                      array_keys(self::$utf8ToWin1252),
                      array_values(self::$utf8ToWin1252),
                      self::toUTF8($text)
                  )
              );
          }

          /**
           * Alias of toWin1252().
           *
           * @param mixed $text
           * @return mixed
           */
          public static function toISO8859($text)
          {
              return self::toWin1252($text);
          }

          /**
           * Alias of toWin1252().
           *
           * @param mixed $text
           * @return mixed
           */
          public static function toLatin1($text)
          {
              return self::toWin1252($text);
          }

          /**
           * Repair UTF-8 text that was encoded to UTF-8 more than once.
           *
           * One encoding layer is removed per pass until the text no longer
           * changes.
           *
           * @param mixed $text String or array of strings (handled recursively).
           *                    Other scalars are converted to string.
           * @return mixed Repaired string, or array of repaired strings.
           */
          public static function fixUTF8($text)
          {
              if (is_array($text)) {
                  foreach ($text as $k => $v) {
                      $text[$k] = self::fixUTF8($v);
                  }
                  return $text;
              }

              // str_replace() would stringify the value anyway; doing it
              // explicitly avoids passing null/int to string functions.
              $text = (string) $text;

              // Strict comparison: a loose one treats numeric-looking strings
              // such as "1e3" and "1000" as equal. Once the loop stops the text
              // is a fixed point, so no further pass is needed.
              $last = null;
              while ($last !== $text) {
                  $last = $text;
                  $text = self::stripOneEncodingLayer($text);
              }

              return $text;
          }

          /**
           * Fix a UTF-8 string that was converted from Windows-1252 as if it
           * were ISO-8859-1 (i.e. bytes 0x80-0x9F became C1 control code points
           * instead of the Windows-1252 characters).
           *
           * See: http://en.wikipedia.org/wiki/Windows-1252
           *
           * @param string|array $text
           * @return string|array
           */
          public static function UTF8FixWin1252Chars($text)
          {
              return str_replace(
                  array_keys(self::$brokenUtf8ToUtf8),
                  array_values(self::$brokenUtf8ToUtf8),
                  $text
              );
          }

          /**
           * Strip a leading UTF-8 byte order mark (EF BB BF), if present.
           *
           * @param string $str
           * @return string
           */
          public static function removeBOM($str = "")
          {
              if (substr($str, 0, 3) === "\xef\xbb\xbf") {
                  $str = substr($str, 3);
              }
              return $str;
          }

          /**
           * Number of continuation bytes a UTF-8 lead byte announces.
           *
           * @param int $byte Byte value in 0xC0-0xFF.
           * @return int 1, 2 or 3; 0 for 0xF8-0xFF, which never start a sequence.
           */
          private static function continuationCount($byte)
          {
              if ($byte <= 0xDF) {
                  return 1;
              }
              if ($byte <= 0xEF) {
                  return 2;
              }
              if ($byte <= 0xF7) {
                  return 3;
              }
              return 0;
          }

          /**
           * Whether $count bytes starting at $start all exist and all are
           * UTF-8 continuation bytes (0x80-0xBF).
           *
           * @param string $text
           * @param int    $start
           * @param int    $count
           * @return bool
           */
          private static function hasContinuationBytes($text, $start, $count)
          {
              if ($start + $count > strlen($text)) {
                  return false;
              }
              for ($j = 0; $j < $count; $j++) {
                  if ((ord($text[$start + $j]) & 0xC0) !== 0x80) {
                      return false;
                  }
              }
              return true;
          }

          /**
           * Encode one ISO-8859-1 byte (0x80-0xFF) as a 2-byte UTF-8 sequence.
           *
           * @param int $byte
           * @return string
           */
          private static function byteToUtf8($byte)
          {
              // Integer shift instead of "/ 64": chr() must receive an int.
              return chr(0xC0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3F));
          }

          /**
           * Decode UTF-8 to single bytes (replacement for utf8_decode()).
           *
           * - U+0000-U+00FF become the byte with the same value.
           * - Stand-alone bytes 0x80-0x9F are kept as they are: callers put
           *   them there via $utf8ToWin1252 before decoding, and they must
           *   survive. utf8_decode() replaces them with "?".
           * - Everything else (code points above U+00FF, malformed input)
           *   becomes "?".
           *
           * @param string $text
           * @return string
           */
          private static function utf8ToLatin1($text)
          {
              $max = strlen($text);
              $out = '';
              for ($i = 0; $i < $max; $i++) {
                  $byte = ord($text[$i]);

                  if ($byte < 0x80) {
                      $out .= $text[$i];
                      continue;
                  }

                  if ($byte < 0xC0) {
                      $out .= $byte <= 0x9F ? $text[$i] : '?';
                      continue;
                  }

                  $extra = self::continuationCount($byte);
                  if ($extra === 0 || !self::hasContinuationBytes($text, $i + 1, $extra)) {
                      $out .= '?';
                      continue;
                  }

                  if ($byte === 0xC2 || $byte === 0xC3) {
                      // Only these two lead bytes encode U+0080-U+00FF.
                      $out .= chr((($byte & 0x03) << 6) | (ord($text[$i + 1]) & 0x3F));
                  } else {
                      $out .= '?';
                  }
                  $i += $extra;
              }

              return $out;
          }

          /**
           * One fixUTF8() pass: map Windows-1252 specials to their single
           * bytes, decode one UTF-8 layer, then normalise back to UTF-8.
           *
           * @param string $text
           * @return string
           */
          private static function stripOneEncodingLayer($text)
          {
              $mapped = str_replace(
                  array_keys(self::$utf8ToWin1252),
                  array_values(self::$utf8ToWin1252),
                  $text
              );

              return self::toUTF8(self::utf8ToLatin1($mapped));
          }
      }
    ?>
    

    To use it, include the script that contains this class and call it like this:

    Encoding::toUtf8('Bankdrücken');