Search code examples
perlmojo-useragent

Decoding result with Mojo::UserAgent for regex match


I'm trying to work out why this won't work:

my $url = 'www880740.com';

use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new->max_redirects(3);
$ua->transactor->name( "Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9; Gecko/2008052906 Firefox/3.0" );

my $tx = $ua->get(
    $url =>
    { 'Accept-Charset' => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7' }
    );

    my $page_title = $tx->result->dom->at( 'title' )->text;

    print "GOT: $page_title \n";

    foreach my $type (qw/Arabic Armenian Bengali Bopomofo Braille Buhid Canadian_Aboriginal Cherokee Cyrillic Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana  Inherited Kannada Katakana Khmer Lao Limbu  Malayalam  Mongolian Myanmar Ogham Oriya  Runic Sinhala Syriac Tagalog Tagbanwa TaiLe Tamil Telugu Thaana Thai Tibetan/) {
      if ($page_title =~ /\p{$type}/) {

          print "$page_title seems to be $type!\n";
          last;

        }
    }

Basically I want to test the title from the URL, and check if it matches any of those charsets. I'm assuming its because I need to decode it into something the regex can find. It works fine when I slurp a "curled" version of the page into memory. Devel::Peek::Dump gives me:

SV = PV(0x55cd8264d650) at 0x55cd824c4b10
  REFCNT = 1
  FLAGS = (POK,IsCOW,pPOK)
  PV = 0x55cd82655d80 "\301\371\272\317\264\253\306\34644181.com/\301\371\272\317\264\253\306\346\313\304\262\273\317\361/\302\355\273\341\277\252\275\261\275\341\271\373/\317\343\270\333\301\371\272\317\264\253\306\346/\302\355\273\341\277\252\275\261\274\307\302\274/\317\343\270\333\271\322\305\306|\310\374\302\355\273\341\327\312\301\317"\0
  CUR = 91
  LEN = 96
  COW_REFCNT = 0

UPDATE: I finally got this working:

my $page_title = $tx->result->dom->at( 'title' )->text;

use Encode;
use Encode::Detect;
use Encode::HanExtra;
my $page_title = decode("Detect", $page_title);
  
print "GOT: $page_title \n";

foreach my $type (qw/Arabic Armenian Bengali Bopomofo Braille Buhid Canadian_Aboriginal Cherokee Cyrillic Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi Han Hangul Hanunoo Hebrew Hiragana  Inherited Kannada Katakana Khmer Lao Limbu  Malayalam  Mongolian Myanmar Ogham Oriya  Runic Sinhala Syriac Tagalog Tagbanwa TaiLe Tamil Telugu Thaana Thai Tibetan/) {

  if ($page_title =~ /\p{Script_Extensions=$type}/) {

      print "$page_title seems to be $type!\n";
      last;

    }
}

This bit:

my $page_title = decode("Detect", $page_title);

detects attempts to detect the encoding, and then convert to Perl's internal representation (ready for my regex to work). I tried to post my example output, but for some reason it triggered a spam message?


Solution

  • The title is in charset=gb2312 which requires to be decoded into perl internal representation.

    Following code demonstrates decoding and output to console the title for this particular website.

    use strict;
    use warnings;
    use feature 'say';
    
    use utf8;
    
    use Mojo::UserAgent;
    use Encode qw/encode decode/;
    
    binmode STDOUT, 'encoding(UTF-8)';
    
    my $url = 'www880740.com';
    my $ua  = Mojo::UserAgent->new->max_redirects(3);
    
    $ua->transactor->name( 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9; Gecko/2008052906 Firefox/3.0' );
    
    my $res = $ua->get( $url )->result;
    
    my $page_title = decode('euc-cn',$res->dom->at('title')->text);
    
    say 'GOT: ' . $page_title;
    
    exit;
    
    my @langs = qw/Arabic Armenian Bengali Bopomofo Braille Buhid
                   Canadian_Aboriginal Cherokee Cyrillic Devanagari
                   Ethiopic Georgian Greek Gujarati Gurmukhi Han
                   Hangul Hanunoo Hebrew Hiragana  Inherited Kannada
                   Katakana Khmer Lao Limbu  Malayalam  Mongolian
                   Myanmar Ogham Oriya Runic Sinhala Syriac Tagalog
                   Tagbanwa TaiLe Tamil Telugu Thaana Thai Tibetan/;
    
    for( @langs ) {
        say "$page_title matches $_!" if $page_title =~ /\p{$_}/;
    }