Search code examples
perl, atom-feed, bibtex

from arXiv ID to BibTeX entry


How can I write a robust Perl script that will generate a BibTeX entry for an arXiv ID?

My guess is that I should use the arXiv API and parse its response with XML::Atom. It should give me the needed pieces of information to build a BibTeX entry.

Here is how I would start:

use LWP::UserAgent;
use Text::BibTeX::Entry;
use XML::Atom;

# Identifier of the paper to look up.
my $arxivid = "hep-ph/9609357";

# Assemble the arXiv API query URL around the identifier.
my $url = join '',
    "http://export.arxiv.org/api/query?search_query=",
    $arxivid,
    "&start=0&max_results=1";

# Issue the HTTP GET and keep the response for later parsing.
my $browser  = LWP::UserAgent->new();
my $response = $browser->get($url);

# Empty BibTeX entry, to be filled in from the response.
my $entry = Text::BibTeX::Entry->new();

Answers not using the arXiv API or XML::Atom are welcome too.


Solution

  • Here is a starting point using XML::Twig to parse the downloaded XML file:

    use feature qw(say);
    use strict;
    use warnings;
    use LWP::UserAgent;
    use Text::BibTeX;
    use Text::BibTeX::Entry;
    use XML::Twig;
    use DateTime::Format::Strptime;
    # Fetch the arXiv Atom record for one identifier and print it on STDOUT
    # as a BibTeX @article entry. Dies with a readable message when the
    # request fails or the feed holds no usable entry.
    {
        my $arxivid = "hep-ph/9609357";

        # id_list requests exactly this article; search_query would run a
        # text search instead of an identifier lookup.
        my $url = "http://export.arxiv.org/api/query?id_list=" . $arxivid . "&start=0&max_results=1";
        my $browser = LWP::UserAgent->new();
        my $response = $browser->get($url);

        # Stop here on HTTP errors rather than feeding an error page to the
        # XML parser.
        die "arXiv request for $arxivid failed: " . $response->status_line . "\n"
            unless $response->is_success;

        # Raw bytes on purpose: the XML parser handles the encoding itself.
        my $xml = $response->content;
        my $twig = XML::Twig->new->parse( $xml );

        # get_xpath returns undef when nothing matches, so check the nodes
        # before calling methods on them.
        my $title_node     = $twig->get_xpath( '//entry/title', 0 );
        my $published_node = $twig->get_xpath( '//entry/published', 0 );
        die "No usable arXiv entry found for $arxivid\n"
            unless defined $title_node && defined $published_node;

        # Collapse line breaks and runs of whitespace so the title is a
        # single clean line.
        my $title = $title_node->text;
        $title =~ s/\s+/ /g;
        $title =~ s/\A | \z//g;

        my @authors = map { $_->text } $twig->findnodes( '//entry/author/name' );

        # The DOI link is optional; it is only set below when present.
        my $doi_link = $twig->get_xpath( '//entry/link[@title="doi"]', 0 );

        my ( $year, $month ) = parse_published( $published_node->text );

        my $entry = Text::BibTeX::Entry->new();
        $entry->set_metatype(BTE_REGULAR);
        $entry->set_type('article');
        $entry->set_key('article1');
        $entry->set( 'title', $title );
        $entry->set( 'author', join ' and ', @authors );
        $entry->set( 'year', $year );
        $entry->set( 'month', $month );
        $entry->set( 'doi', $doi_link->att('href') ) if defined $doi_link;
        $entry->print(\*STDOUT);
    }
    
    # Split a feed <published> timestamp into (year, month name).
    #
    # Takes the timestamp string and returns a two-element list: the year
    # and the month name as reported by the parsed DateTime object. The
    # parser is configured with on_error => 'croak'.
    sub parse_published  {
        my $timestamp = shift;

        my %strptime_args = (
            pattern   => '%FT%T%Z',
            time_zone => 'UTC',
            on_error  => 'croak',
        );
        my $when = DateTime::Format::Strptime->new(%strptime_args)
                                             ->parse_datetime($timestamp);

        return ( $when->year, $when->month_name );
    }
    

    Output:

    @article{article1,
      title = {Mixing-induced CP Asymmetries in Inclusive $B$ Decays},
      author = {Martin Beneke and Gerhard Buchalla and Isard Dunietz},
      year = {1996},
      month = {September},
      doi = {http://dx.doi.org/10.1016/S0370-2693(96)01648-6},
    }