Search code examples
php parsing iconv

Parse a windows-1251 page to UTF-8: can't grab text


I want to grab the names of my products from a table.

But I can't grab names that contain Cyrillic characters.

For parsing I'm using ganon.

When I try to output the array of names, every value that should contain Cyrillic characters is empty. Why?

Please help me to fix it.

    // Fetch the product listing with cURL, extract the text of every
    // '.link_good_tab' element, and print the collected names.
    $url = "http://www.plati.ru/asp/block_goods_s.asp?id_r=0&id_s=252900&sort=name&page=1&rows=10&curr=EUR&lang=ru-RU&rnd=1544554";
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/7.0");
    // Have curl_exec() return the response body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Keep the response headers out of the returned body.
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 50);
    $result = curl_exec($ch);
    // NOTE(review): $redir is never read afterwards in this snippet.
    $redir = curl_getinfo($ch, CURLINFO_HEADER_OUT);
    curl_close($ch);

    // str_get_dom() is not defined in this snippet; presumably the ganon
    // parser's string-to-DOM helper.
    $html = str_get_dom($result);

    foreach ($html('.link_good_tab') as $element) {
        // Replace every 0xA0 byte with a plain space before converting.
        // NOTE(review): presumably a windows-1251 non-breaking space; confirm.
        $temp = str_replace("\xA0", ' ', $element->getPlainText());
        // The source charset is guessed from mb_detect_order() in strict mode.
        // NOTE(review): if that list does not contain the page's encoding,
        // detection returns false and iconv() has no usable source charset,
        // which is likely why the Cyrillic names come out empty.
        $products[] = iconv(mb_detect_encoding($temp, mb_detect_order(), true), "UTF-8", $temp);
    }
    echo "<pre>";
    print_r($products);
    echo "</pre>";

This is the result:

    [0] =>
    [1] =>
    [2] =>
    [3] =>
    [4] =>
    [5] =>
    [6] =>
    [7] =>
    [8] =>
    [9] =>
    [10] => C&C: Red Alert 3 - Uprising (Origin/RegFree/Multilang)

Solution

  • Since you already know the page's encoding (windows-1251), specify it explicitly instead of trying to detect it; you can use mb_convert_encoding() for the conversion. Example:

    // Fetch the listing page, extract the product names, and convert each one
    // from windows-1251 (the page's known encoding) to UTF-8.
    $ch = curl_init('http://www.plati.ru/asp/block_goods_s.asp?id_r=0&id_s=252900&sort=name&page=1&rows=10&curr=EUR&lang=ru-RU&rnd=1544554');
    // Have curl_exec() return the response body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    if ($result === false) {
        // curl_exec() returns false on failure; stop instead of parsing a non-string.
        throw new RuntimeException('Request failed: ' . curl_error($ch));
    }

    $html = str_get_dom($result);
    // Start with an empty list so print_r() below always receives a defined
    // array, even when no element matches the selector.
    $products = [];
    foreach ($html('.link_good_tab') as $element) {
        // Swap 0xA0 (the non-breaking space in windows-1251) for a plain space
        // while the text is still in its single-byte encoding.
        $temp = str_replace("\xA0", ' ', $element->getPlainText());
        // The source encoding is known, so name it explicitly rather than detect it.
        $products[] = mb_convert_encoding($temp, "utf-8", "windows-1251");
    }
    echo "<pre>";
    print_r($products);
    echo "</pre>";