Search code examples
Tags: html, perl, html-content-extraction, text-extraction, htmlcleaner

Extract text from HTML - Perl using HTML::TreeBuilder


I'm trying to read a set of .html files and extract the text inside their <p> tags. Logically, my code below should work: using HTML::TreeBuilder, I parse the HTML and then extract the text in <p> using find_by_attribute("p"). But my script produces only empty directories. Did I leave anything out?

#!/usr/bin/perl

# For each language directory, parse every matching HTML file with
# HTML::TreeBuilder and append the text of each selected element to
# extract_<lang>/<basename>.txt, echoing the same text to STDOUT.
#
# NOTE(review): this is the script as posted in the question; it is kept
# unchanged because the answer below refers to its behaviour.

use strict;
use HTML::TreeBuilder 3;
use FileHandle;

# Language codes; each one names an input directory ./<lang>/ and an
# output directory ./extract_<lang>/.
my @task = ('ar','cn','en','id','vn');

foreach my $lang (@task) {
# Create the output directory only if it is not already there.
mkdir "./extract_$lang", 0777 unless -d "./extract_$lang";
opendir (my $dir, "./$lang/") or die "$!";
# The pattern is unanchored, so any entry whose name contains ".html"
# is selected (e.g. "a.html.bak" would match too).
my @files = grep (/\.html/,readdir ($dir));
closedir ($dir);

foreach my $file (@files) {
    # $fh is opened here but never read: parse_file below is given the
    # path, not this handle.
    open (my $fh, '<', "./$lang/$file") or die "$!";
    my $root = HTML::TreeBuilder->new;
    $root->parse_file("./$lang/$file");
    # NOTE(review): "p" is a tag name, but find_by_attribute selects by
    # attribute name/value; presumably a tag lookup such as
    # find_by_tag_name('p') was intended -- confirm against the
    # HTML::Element documentation. This is the problem the answer
    # identifies.
    my @all_p = $root->find_by_attribute("p");
    foreach my $p (@all_p) {
        # Serialise the element back to HTML and parse it again as a new
        # tree; as_text is then called on that new tree.
        my $ptag = HTML::TreeBuilder->new_from_content ($p->as_HTML);
        # Drop the last five characters of the file name (the length of
        # ".html") to form the output base name.
        my $filewrite = substr($file, 0, -5); 
        # Append mode, reopened once per selected element.
        open (my $outwrite, '>>', "extract_$lang/$filewrite.txt") or die $!;
        print $outwrite $ptag->as_text . "\n";  
        my $pcontents = $ptag->as_text;
        print $pcontents . "\n";
        # NOTE(review): this names the bareword handle "outwrite", not the
        # lexical $outwrite opened above.
        close (outwrite);
    }
# NOTE(review): this names the bareword handle "FH", not the lexical $fh
# opened at the top of the loop.
close (FH);
}
}

My .html files are plain HTML pages saved from .asp websites, e.g. http://www.singaporemedicine.com/vn/hcp/med_evac_mtas.asp

My .html files are saved in:

./ar/*
./cn/*
./en/*
./id/*
./vn/*

Solution

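  • You are confusing elements with attributes: p is an element (tag) name, whereas find_by_attribute looks for an attribute named "p", so it matches nothing. With HTML::TreeBuilder you would use find_by_tag_name("p") instead. The program can also be written much more concisely: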

    #!/usr/bin/env perl
    use strictures;
    use File::Glob qw(bsd_glob);
    use Path::Class qw(file);
    use URI::file qw();
    use Web::Query qw(wq);
    use autodie qw(:all);
    
    # For every language directory, write the text of each <p> element of
    # each ./<lang>/*.html file to ./extract_<lang>/<name>.txt, one
    # paragraph per line. Failures of mkdir/open/close are raised by autodie.
    for my $lang (qw(ar cn en id vn)) {
        my $out_dir = "./extract_$lang";
        mkdir $out_dir, 0777 unless -d $out_dir;

        for my $html_path (bsd_glob "./$lang/*.html") {
            # Same file name as the input, with the .html suffix swapped
            # for .txt.
            (my $txt_name = file($html_path)->basename) =~ s/[.]html$/.txt/;

            # Web::Query loads the document through a file:// URI; text
            # in list context gives one string per matched element.
            my @paragraphs = wq(URI::file->new_abs($html_path))->find('p')->text;

            # Append mode: output accumulates if the script is run again.
            open my $out, '>>:encoding(UTF-8)', "$out_dir/$txt_name";
            $out->say($_) for @paragraphs;
            close $out;
        }
    }