Search code examples
perlpdfcpan

PDF::FromHTML No Anchors and Early Termination


Further to this question: PDF::FromHTML - Corrupt file and no output

The code in question is 'working' in that it produces a PDF document just fine, just NONE of the HTML anchors are being translated, and on larger documents the processing ceases at Page 11 of the PDF - with no error, it closes the document just fine!

Edit: To save looking at the Question Link:

    # Debug aid: uncomment to dump the assembled HTML instead of converting it.
    # print "<p>".$textblob."</p>";

    # Wrap the accumulated fragment in a minimal HTML document skeleton
    # before it is handed to the converter.
    $textblob='<html><head></head><body>'.$textblob.'</body></html>';

    # $textblob = decode('UTF-8', $textblob);

    # Filled in by write_file() below and then printed.
    my $output;
    # Conversion only happens when the request carries a 'PDF' parameter.
    if(defined($query->param('PDF'))){
        my $pdf = PDF::FromHTML->new( encoding => 'utf-8' );
        # A scalar reference is passed here rather than a file name.
        $pdf->load_file(\$textblob);
        $pdf->convert(
            # With PDF::API2, font names such as 'traditional' also works
             Font        => 'Arial',
             LineHeight  => 10,
             Landscape   => 0,
        );
        # Again a scalar reference: the result is collected in $output.
        $pdf->write_file(\$output);
        print $output;
    }

When the print of $textblob is uncommented and the PDF section is commented out, the full 400-reference adventure is displayed as HTML, with its links, just fine...

Update: In desperation, here is the entire script (it's not TOO long...)

#!/usr/bin/perl
use cPanelUserConfig;
use CGI::Carp qw(fatalsToBrowser);
use CGI;
use List::Util qw(shuffle);
use PDF::FromHTML;
require "authenticate.pl";

# ---------------------------------------------------------------------------
# Request dispatch.
#
# $query is deliberately a package variable: every sub in this file reads it.
# Flow:
#   * emit the HTTP header (application/pdf when the 'PDF' button was used,
#     otherwise an HTML page header),
#   * 'doc' present        -> split the pasted text into numbered references
#                             and show the editing form,
#   * 'references' present -> rebuild the hash from the editing form and run
#                             the action named by the submit button,
#   * neither              -> show the initial paste-your-text form.
# ---------------------------------------------------------------------------
our $query = CGI->new;

# Looked up once: it decides both the response header and whether any HTML
# chrome may be written around the body.
my $wants_pdf = defined($query->param('PDF'));

if($wants_pdf){
    print $query->header(-type=>'application/pdf');
}
else{
    print $query->header(-charset=>'utf-8');
    html_header();
    print "\n\n\n\n<!-- -------------------------- BEGIN: ff.net Script generated text ------------------------------------------- -->";
    print "Randomise working? Let me know if you find a bug.<br />";
}

if(defined($query->param('doc'))){
    # The "\nEOF" sentinel lets the look-ahead in the pattern terminate the
    # last reference. The concatenation also forces param() into scalar context.
    my $doc=$query->param('doc')."\nEOF";
    my %refhash = $doc =~ /^[\n\s\t\.\#]*(\d+)[\s\t\.\#\n]+(?!\n*^[\n\s\t\.\#]*\d+[\s\t\.\#\n]+)(.+?)(?=^[\s\t\.\#\n]*\d+[\s\t\.\#\n]+|EOF)/smcgi; # refhash{key}=content, where key==refnumber and content==well, ref content
    display_refhash(\%refhash);

}

elsif(defined($query->param('references'))){
    my %anchors;
    # scalar(): param() in list context returns every submitted value, which
    # would shift \%anchors out of its argument slot if the field were repeated.
    my $refhashref=recreate_refhash(scalar $query->param('references'),\%anchors);

    if(defined($query->param('Randomise'))){
        $refhashref=randomise($refhashref,\%anchors);
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif(defined($query->param('Save'))){
        save($refhashref);
    }
    elsif(defined($query->param('Auto-HTML Tag'))){
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif(defined($query->param('Auto-ABML Tag'))){
        # NOTE(review): autoABML is not defined in this file - presumably it
        # comes from authenticate.pl; confirm, otherwise this branch dies.
        autoABML($refhashref);
        print "Your adventure looks like this: <br /><br />";
        display_refhash($refhashref);
    }
    elsif($wants_pdf){
        output_pdf($refhashref);
    }
    else{
        print "undefined function call";
    }
}
else{ # output form to input doc content
    print "Please input your document text into the textarea below (copy and paste should do it):";
    print '<form method="post" action="doc_to_refs.cgi" enctype="multipart/form-data" name="doc_to_refs_form">';
    print $query->textarea(-name=>'doc',-rows=>20,-cols=>100, -style=>"font-family:arial;width:98%");
    print $query->submit('Go!');
    print '</form>';
}

# Footer markup must never be appended to a binary PDF body.
html_footer() unless $wants_pdf;

# print "<!-- -------------------------- END: ff.net Script generated text ------------------------------------------- -->";    

# recreate_refhash($references, \%anchors) -> \%refhash
#
# Rebuilds the reference hash from the numbered fields of the editing form.
# Rows 0 .. $references-1 are read from the global $query:
#   reference<N>  the reference number (hash key)
#   reftext<N>    the reference text   (hash value)
#   delete<N>     when present, the whole row is dropped
#   anchor<N>     when present, its value is recorded in %$anchors_ref
#                 together with the row index N
# Returns a reference to the rebuilt hash; %$anchors_ref is filled in place.
sub recreate_refhash {
    my ($references, $anchors_ref) = @_;
    my %rebuilt;

    my $row = 0;
    while ($row < $references) {
        # Rows ticked for deletion contribute neither content nor an anchor.
        unless (defined($query->param("delete$row"))) {
            my $number  = $query->param("reference$row");
            my $content = $query->param("reftext$row");
            $rebuilt{$number} = $content;

            my $anchored = $query->param("anchor$row");
            if (defined($anchored)) {
                # Remember which form row the anchored reference sat in.
                $anchors_ref->{$anchored} = $row;
            }
        }
        $row++;
    }

    return \%rebuilt;
}

# randomise(\%refhash, \%anchors) -> \%randomised
#
# Shuffles which reference number each piece of content lives under and
# rewrites the "go to / turn to / return to N" links inside the content so
# they follow the move.
#
#   \%refhash  reference number => content
#   \%anchors  reference number => index (in ascending-number order) that the
#              reference must keep after shuffling
#
# Progress messages are printed to the currently selected output handle.
# Returns a reference to a new hash; the input hashes are not modified.
sub randomise{
    my $refhashref=shift;
    my $anchor_ref=shift;
    my %refhash=%$refhashref;
    my %anchors=%$anchor_ref;

    # Parenthesised on purpose: "my %a, %b;" only declares %a and silently
    # turns the rest into package globals, which would carry stale
    # cross-references from one call to the next.
    my (%randomisedrefhash, %Xrefhash);

    # randomise the list
    my @refstack=shuffle sort {$a <=> $b} keys %refhash; # inflict an order on the pre-shuffle (therefore we can xref predicatably?) not sure this makes ANY sense i'm melting....

    ## transpose anchors back to their required location
    for(my $x=0;$x<@refstack;$x++){
        my $target=$anchors{$refstack[$x]};
        next unless defined($target);

        # An index past the end (e.g. rows were deleted after anchoring)
        # cannot be honoured; taking a reference to that slot would extend
        # the list with undef entries and lose content.
        next if $target > $#refstack;

        my $anchor=\$refstack[$target];
        my $temp=$refstack[$x];
        $refstack[$x]=$$anchor;

        print "---Swapping $temp with ".$$anchor;
        $$anchor=$temp;

        # The element swapped into slot $x may itself be anchored somewhere
        # else: revisit this slot so it gets moved as well.
        if(defined($anchors{$refstack[$x]})){
            if($refstack[$anchors{$refstack[$x]}] ne $$anchor){
                $x--;
            }
        }
    }

    ## randomise the refs and the content associations, and create the cross-ref hash
    foreach my $ref (sort {$a <=> $b} keys %refhash){
        my $key=shift @refstack;
        $randomisedrefhash{$ref}=$refhash{$key};
        $Xrefhash{$key}=$ref;   # old number => new number
    }

    ## now do the content link substitutions
    foreach my $ref (keys %randomisedrefhash){
        $randomisedrefhash{$ref}=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s)*(\d+)/&substitute_xref($1,$2,$3,$4,$5,\%Xrefhash)/egi;
    }

    print "You asked for the following anchors:";
    foreach my $key (keys %anchors){
        print $anchors{$key};
    }

    return \%randomisedrefhash;
}

# substitute_xref($verb, $gap, $noun, $gap2, $link, \%Xrefhash) -> string
#
# Replacement callback for the link-rewriting substitution: re-assembles the
# matched text with the reference number $link translated through
# %Xrefhash (old number => new number).
#
# A number with no entry in the table is kept as it was, so links to
# references that do not exist are left alone instead of losing their number.
# The middle pieces come from optional capture groups and may be undef.
sub substitute_xref{ ## not sure that this is necessary but the verboseness was easier to work out
    my ($pretext1,$pretext2,$pretext3,$pretext4,$link,$Xrefhashref)=@_;

    # Optional groups that did not take part in the match arrive as undef.
    foreach my $piece ($pretext1,$pretext2,$pretext3,$pretext4){
        $piece='' unless defined($piece);
    }

    # Look the number up through the reference: no need to copy the whole
    # table for every single match.
    my $newlink=$Xrefhashref->{$link};
    $newlink=$link unless defined($newlink);

    return "$pretext1$pretext2$pretext3$pretext4$newlink";

}

# save(\%refhash)
#
# Placeholder for the not-yet-written save feature: ignores its argument and
# just prints a notice. Returns whatever print returns.
sub save {
    my $notice = "Will Save soon";
    return print $notice;
}

# display_refhash(\%refhash)
#
# Prints the editing form for every reference (number field, delete and
# anchor checkboxes, content textarea), the action buttons, and below the
# form a read-only preview of the whole text.
#
# Depending on which submit button the global $query reports, the preview is
# decorated with HTML anchors/links ('Auto-HTML Tag' or 'PDF') and/or with
# escaped <tt ref="..."> tags ('Auto-ABML Tag'). The form fields always show
# the undecorated values.
#
# NOTE(review): the preview prints the reference text without HTML-escaping;
# confirm that this is intended for content coming from the request.
sub display_refhash{
    my $refhashref=shift;
    my %refhash=%$refhashref;

    # The button flags do not change while looping: look them up once.
    my $tag_html=(defined($query->param('Auto-HTML Tag')) || defined($query->param('PDF')));
    my $tag_abml=defined($query->param('Auto-ABML Tag'));

    print '<form method="post" action="doc_to_refs.cgi" enctype="multipart/form-data" name="doc_to_refs_form">';
    my $x=0;
    # Declared on its own line: "my $ref,$textblob;" would only make $ref
    # lexical and leave $textblob a package global shared with output_pdf.
    my $textblob='';
    foreach my $ref (sort {$a <=> $b} keys %refhash){
        my $reference="reference"."$x";
        my $reftext="reftext"."$x";
        my $anchor="anchor"."$x";
        my $delete="delete"."$x";
        my $default=$refhash{$ref};
        my $label=$ref;     # decorated copy of the number, used in the preview only

        print "Reference is: ".$query->textfield(-name=>$reference,-value=>$ref, -override=>1)."<br />";
        print $query->checkbox_group(-name=>$delete,-values=>$ref,-labels=>{$ref=>'Delete Me'})."<br />";
        print $query->checkbox_group(-name=>$anchor,-values=>$ref, -labels=>{$ref=>'Anchor Me (Will NOT get Randomised)'})."<br />";
        print "Content is: ".$query->textarea(-name=>$reftext, -default=>$default, -rows=>5, -override=>1, -cols=>100, -style=>"font-family:arial;width:98%")."<br />";
        print "<br /><br />";

        if($tag_html){
            # The number becomes a link target, each "go to N" a link to it.
            $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\<a href\=\"\#$5\"\>$1 $2 $3 $4 $5\<\/a\>/gi;
        }

        if($tag_abml){
            # $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            # The tags are emitted entity-escaped so they show up as text.
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\&lt\;tt ref\=\"$5\"\&gt\;$1 $2 $3 $4 $5\&lt\;\/tt\&gt\;/gi;
        }

        $textblob.=$label." ".$default."<br /><br />";

        $x++;
    }

    # Row count, needed by recreate_refhash to walk the numbered fields.
    print $query->hidden(-name=>'references',-value=>$x,-override=>1);
#   print $query->submit(-name=>'Save');
    print $query->submit('Randomise');
    print $query->submit('Auto-ABML Tag');
    print $query->submit('Auto-HTML Tag');
    print $query->submit('PDF');
    print "</form><br /><br /><br />";
    $textblob=~s/\n/\<br \/\>/gi;

    print "<p>".$textblob."</p>";

}

# output_pdf(\%refhash)
#
# Assembles all references, in ascending numeric order, into one HTML
# document and - when the request carries a 'PDF' parameter - converts it
# with PDF::FromHTML and prints the resulting PDF to STDOUT.
#
# The same decoration rules as in the on-screen preview apply: HTML
# anchors/links for 'Auto-HTML Tag' or 'PDF', escaped <tt ref="..."> tags for
# 'Auto-ABML Tag'. The caller is expected to have sent the HTTP header.
sub output_pdf{
    my $refhashref=shift;
    my %refhash=%$refhashref;

    # The button flags do not change while looping: look them up once.
    my $wants_pdf=defined($query->param('PDF'));
    my $tag_html=(defined($query->param('Auto-HTML Tag')) || $wants_pdf);
    my $tag_abml=defined($query->param('Auto-ABML Tag'));

    # Declared on its own line: "my $ref,$textblob;" would only make $ref
    # lexical and leave $textblob a package global shared with
    # display_refhash, so text could pile up across calls.
    my $textblob='';
    foreach my $ref (sort {$a <=> $b} keys %refhash){
        my $label=$ref;     # decorated copy of the reference number
        my $default=$refhash{$ref};

        if($tag_html){
            # The number becomes a link target, each "go to N" a link to it.
            $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\<a href\=\"\#$5\"\>$1 $2 $3 $4 $5\<\/a\>/gi;
        }

        if($tag_abml){
            # $label=~s/(\d+)/\<a id\=\"$1\"\>$1\<\/a\>/gi;
            $default=~s/(return\sto|go\sto|turn\sto)(\s+)(page|paragraph|reference|section)*(\s*)(\d+)/\&lt\;tt ref\=\"$5\"\&gt\;$1 $2 $3 $4 $5\&lt\;\/tt\&gt\;/gi;
        }

        $textblob.=$label." ".$default."<br /><br />";
    }

    $textblob=~s/\n/\<br \/\>/gi;

    # print "<p>".$textblob."</p>";

    $textblob='<html><head></head><body>'.$textblob.'</body></html>';

    if($wants_pdf){
        my $output;
        my $pdf = PDF::FromHTML->new( encoding => 'utf-8' );
        # Scalar references: read from / write to memory instead of files.
        $pdf->load_file(\$textblob);
        $pdf->convert(
            # With PDF::API2, font names such as 'traditional' also works
             Font        => 'Arial',
             LineHeight  => 10,
             Landscape   => 0,
        );
        $pdf->write_file(\$output);

        # The PDF is binary data: make sure no newline or encoding layer
        # rewrites it on the way out.
        binmode(STDOUT);
        print $output;
    }
}

# html_header()
#
# Hook for page-header markup; currently emits nothing and returns nothing.
sub html_header {
    return;
}

# html_footer()
#
# Hook for page-footer markup; currently emits nothing and returns nothing.
sub html_footer {
    return;
}

If you want sample data, let me know and I'll upload it somewhere.


Solution

  • "... and on larger documents the processing ceases at Page 11 of the PDF..."

    This seems to be due to a bug in PDF::FromHTML::Template::Container::PageDef. Notice the line:

    last if $::x++ > 10;
    

    It means it will never create more than 11 pages. I have filed a bug report