Search code examples
python urllib feedparser

How to query arXiv for a specific year?


I'm using the code shown below to retrieve papers from arXiv. I want to retrieve papers that have the words "machine" and "learning" in the title. The number of papers is large, so I want to slice the results by year (published).

How can I request only the records from 2020 and 2019 in search_query? Please note that I'm not interested in post-filtering.

import urllib.request

import time
import feedparser

# Base URL of the arXiv API query endpoint
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
search_query = urllib.parse.quote("ti:machine learning")
start = 0                     # index of the first result to request
total_results = 5000          # upper bound on the result index to request
results_per_iteration = 1000  # number of results asked for per request
wait_time = 3                 # seconds to pause between two requests

# One dict per retrieved paper (keys: date, title, first_author, summary)
papers = []

print('Searching arXiv for %s' % search_query)

for i in range(start, total_results, results_per_iteration):

    print("Results %i - %i" % (i, i + results_per_iteration))

    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         i,
                                                         results_per_iteration)

    # Perform a GET request using the base_url and query; the with-block
    # closes the HTTP response as soon as its body has been read.
    with urllib.request.urlopen(base_url + query) as connection:
        response = connection.read()

    # Parse the response using feedparser
    feed = feedparser.parse(response)

    # An empty page means there is nothing left to fetch, so stop instead
    # of issuing the remaining requests.
    if not feed.entries:
        break

    # Keep the fields of interest for every entry of this page
    for entry in feed.entries:
        papers.append({
            "date": entry.published,
            "title": entry.title,
            # feedparser v4.1 only grabs the first author
            "first_author": entry.author,
            "summary": entry.summary,
        })

    # Sleep a bit before calling the API again
    print('Bulk: %i' % 1)
    time.sleep(wait_time)

Solution

  • According to the arXiv documentation, there is no published or date field available.

    What you can do is sort the results by date (by adding &sortBy=submittedDate&sortOrder=descending to your query parameters) and stop making requests once you reach 2018.

    Basically your code should be modified like this:

    import urllib.request
    
    import time
    import feedparser
    
    # Base URL of the arXiv API query endpoint
    base_url = 'http://export.arxiv.org/api/query?'

    # Search parameters
    search_query = urllib.parse.quote("ti:machine learning")
    i = 0                         # index of the first result of the next page
    results_per_iteration = 1000  # number of results asked for per request
    wait_time = 3                 # seconds to pause between two requests
    papers = []                   # one dict per retrieved paper
    year = ""                     # publication year of the entry last examined
    # Oldest publication year to keep. Four-digit year strings compare
    # correctly as plain strings, so no integer conversion is needed.
    oldest_year = "2019"
    print('Searching arXiv for %s' % search_query)

    # Results are requested newest first, so paging can stop at the first
    # entry that is older than oldest_year.
    reached_older_papers = False
    while not reached_older_papers:
        print("Results %i - %i" % (i, i + results_per_iteration))

        query = 'search_query=%s&start=%i&max_results=%i&sortBy=submittedDate&sortOrder=descending' % (search_query,
                                                             i,
                                                             results_per_iteration)

        # Perform a GET request using the base_url and query; the with-block
        # closes the HTTP response as soon as its body has been read.
        with urllib.request.urlopen(base_url + query) as connection:
            response = connection.read()

        # Parse the response using feedparser
        feed = feedparser.parse(response)

        # An empty page means there is nothing left to fetch; without this
        # check the loop would keep requesting pages forever.
        if not feed.entries:
            break

        # Keep every entry that is recent enough, stop at the first older one
        for entry in feed.entries:
            year = entry.published[0:4]
            if year < oldest_year:
                # Checked per entry (not per page) so that older papers
                # sharing a page with recent ones are not collected.
                reached_older_papers = True
                break
            papers.append({
                "date": entry.published,
                "title": entry.title,
                # feedparser v4.1 only grabs the first author
                "first_author": entry.author,
                "summary": entry.summary,
            })

        print('Bulk: %i' % 1)
        i += results_per_iteration
        # Sleep a bit before calling the API again (not needed after the
        # last request)
        if not reached_older_papers:
            time.sleep(wait_time)
    

    For the "post-filtering" approach, once enough results are collected, I'd do something like this:

    # Keep only the papers whose publication date starts with the year 2019
    papers2019 = [
        paper
        for paper in papers
        if paper["date"].startswith("2019")
    ]