I have a list of PubMed entries along with their PubMed IDs. I would like to write a Python script that accepts a PubMed ID as input and then fetches the abstract from the PubMed website.
So far I have come across NCBI E-utilities and the importurl library in Python, but I don't know how to go about writing a template.
Any pointers will be appreciated.
Thank you,
Wow, I was working on a similar project myself just a week or so ago!
Edit: I recently updated the code to take advantage of BeautifulSoup. I have my own virtualenv for it, but you can install it with pip.
Basically, my program takes a PubMed ID, a DOI, or a text file of lines of PubMed IDs and/or DOIs, and grabs information about the article. It can easily be tweaked for your own needs to obtain the abstract, but here's my code:
import re
import sys
import traceback
from bs4 import BeautifulSoup
import requests
class PubMedObject(object):
    """Scrape citation details for one article from its PubMed web page.

    The page is downloaded once in the constructor and parsed with
    BeautifulSoup's ``html.parser``; the ``get_*`` methods read individual
    fields out of the parsed document, and ``render`` fills an HTML template
    with them.
    """

    # Parsed page; set by __init__.
    soup = None
    # URL of the article page; set by __init__.
    url = None

    def __init__(self, pmid=None, url='', search_term=''):
        """Download and parse the page for one article.

        Args:
            pmid: a PubMed ID (string); surrounding whitespace is stripped.
            url: the URL of the PubMed web page; used as given when neither
                ``pmid`` nor ``search_term`` is supplied.
            search_term: the string used in the search box on the PubMed
                website. When given, it overrides ``pmid``/``url`` for the
                download.
        """
        if pmid:
            pmid = pmid.strip()
            url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % pmid
        if search_term:
            url = "http://www.ncbi.nlm.nih.gov/pubmed/?term=%s" % search_term
        page = requests.get(url).text
        self.soup = BeautifulSoup(page, "html.parser")

        # Set the url to be the fixed one with the PubMed ID instead of the
        # search_term.
        if search_term:
            try:
                url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % (
                    self.soup.find("dl", class_="rprtid").find("dd").text
                )
            except AttributeError:
                # find() returned None (element not on the page); keep the
                # search URL and report the problem.
                print("Error on search_term=%s" % search_term)
        self.url = url

    def get_title(self):
        """Return the text of the <h1> inside the element of class "abstract".

        Raises:
            AttributeError: if either element is missing from the page.
        """
        return self.soup.find(class_="abstract").find("h1").text

    def get_authors(self):
        """Return the authors as one "Last, F.M., Last, F." string.

        Each link inside the element of class "auths" is split at its last
        space into a surname and a run of initials; a period is put after
        every initial. A name with no space in it is kept unchanged instead
        of raising ValueError.

        Raises:
            AttributeError: if the "auths" element is missing from the page.
        """
        result = []
        author_list = [a.text for a in self.soup.find(class_="auths").findAll("a")]
        for author in author_list:
            lname, _, initials = author.strip().rpartition(' ')
            if not lname:
                # Single-token name: nothing to split into surname/initials.
                result.append(initials)
                continue
            # Add periods after each letter in the first name.
            fname = ".".join(initials) + "."
            result.append(lname + ', ' + fname)
        return ', '.join(result)

    def get_citation(self):
        """Return the text of the element of class "cit".

        Raises:
            AttributeError: if the element is missing from the page.
        """
        return self.soup.find(class_="cit").text

    def get_external_url(self):
        """Return the best available link to the article itself.

        Order of preference: a dx.doi.org URL built from the "doi:" text on
        the page, then the first link inside the element of class "portlet",
        then the PubMed page URL.

        NOTE(review): the nesting of the original branches was lost; this
        ordering assumes the portlet link was meant as a fallback for pages
        without a DOI -- confirm.
        """
        doi_string = self.soup.find(text=re.compile("doi:"))
        if doi_string:
            # Take the token after "doi:" and drop its last character
            # (presumably the period that closes the citation).
            doi = doi_string.split("doi:")[-1].strip().split(" ")[0][:-1]
            if doi:
                return "http://dx.doi.org/%s" % doi

        portlet = self.soup.find(class_="portlet")
        if portlet:
            link = portlet.find("a")
            href = link.get('href') if link else None
            if href:
                return href

        return self.url

    def render(self):
        """Fill template.html with this article's details.

        The placeholders ``{{ external_url }}``, ``{{ citation }}``,
        ``{{ title }}`` and ``{{ authors }}`` are replaced in that order and
        ``{{ error }}`` is blanked. If a getter raises AttributeError (an
        expected element is not on the page), the placeholders still left are
        blanked and ``{{ error }}`` becomes ``<!-- Error -->``.

        Returns:
            The filled template, encoded as UTF-8.
        """
        with open('template.html', 'r') as template_file:
            template_text = template_file.read()

        fields = (
            ("{{ external_url }}", self.get_external_url),
            ("{{ citation }}", self.get_citation),
            ("{{ title }}", self.get_title),
            ("{{ authors }}", self.get_authors),
        )
        try:
            for placeholder, getter in fields:
                template_text = template_text.replace(placeholder, getter())
            template_text = template_text.replace("{{ error }}", '')
        except AttributeError:
            for placeholder, _ in fields:
                template_text = template_text.replace(placeholder, '')
            template_text = template_text.replace("{{ error }}", '<!-- Error -->')

        return template_text.encode('utf8')
def start_table(f):
    """Write the opening <div class="resourcesTable"> and <table> tags to *f*.

    Args:
        f: a writable text file object.
    """
    f.write('\t\t\t\t\t\t\t\t\t<div class="resourcesTable">\n')
    f.write('\t\t\t\t\t\t\t\t\t\t<table border="0" cellspacing="0" cellpadding="0">\n')
def end_table(f):
    """Write the tags that close the markup opened by start_table to *f*.

    NOTE(review): the original body was missing from this listing; the
    closing tags and their tab depth were reconstructed as the mirror image
    of start_table's output -- confirm against the intended markup.

    Args:
        f: a writable text file object.
    """
    f.write('\t\t\t\t\t\t\t\t\t\t</table>\n')
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')
def start_accordion(f):
    """Write the opening <div class="accordion"> tag to *f*.

    Args:
        f: a writable text file object.
    """
    f.write('\t\t\t\t\t\t\t\t\t<div class="accordion">\n')
def end_accordion(f):
    """Write the tag that closes the <div> opened by start_accordion to *f*.

    NOTE(review): the original body was missing from this listing; the
    closing tag and its tab depth were reconstructed to mirror
    start_accordion's output -- confirm against the intended markup.

    Args:
        f: a writable text file object.
    """
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')
# Script entry point: reads pmids.txt line by line and builds result.html.
# Lines starting with "####" become <h4> headings, lines starting with "###"
# become <h3> headings, and any other non-blank line is treated as an article
# reference (URL, numeric id, or something else). Returns 1 after reporting
# an error, otherwise 0.
#
# NOTE(review): this listing has lost its indentation and a number of
# statements, so it does not parse as shown. Visibly missing: the `try:` that
# the `except` below belongs to, the bodies of several `with`/`if` blocks,
# and every call that would write a rendered article or open/close a
# table/accordion. The notes below mark each gap; restore them from the
# original script before running.
# NOTE(review): the `print x` statements and `raw_input` are Python 2 only.
def main(args):
# program's main code here
print("Parsing pmids.txt...")
# Opening in 'w' mode truncates any previous result.html.
# NOTE(review): the body of this `with` is missing.
with open('result.html', 'w') as sum_file:
# First pass over pmids.txt with result.html open for appending.
# NOTE(review): the body of this `for` loop is missing.
with open('pmids.txt','r') as pmid_file:
with open('result.html','a') as sum_file:
for pmid in pmid_file:
# Second pass over pmids.txt: this one generates the HTML.
with open('pmids.txt','r') as pmid_file:
# State flags: whether an <h3>/<h4> heading has been written, and whether a
# table / accordion is currently open.
h3 = False
h4 = False
table_mode = False
accordion_mode = False
with open('result.html', 'a') as sum_file:
for pmid in pmid_file:
# "####" line: sub-heading, written as <h4>.
if pmid[:4] == "####":
# NOTE(review): only the flag is set here; presumably the accordion's opening
# markup was written at this point as well -- confirm.
if h3 and not accordion_mode:
accordion_mode = True
sum_file.write('\t\t\t\t\t\t\t\t\t<h4><a href="#">%s</a></h4>\n' % pmid[4:].strip())
h4 = True
# "###" line: section heading, written as <h3>. The flags for any open
# table/accordion are reset first.
# NOTE(review): the calls that emit the matching closing markup are missing.
elif pmid[:3] == "###":
if h4:
if table_mode:
table_mode = False
h4 = False
accordion_mode = False
elif h3:
table_mode = False
sum_file.write('\t\t\t\t\t\t\t\t<h3><a href="#">%s</a></h3>\n' % pmid[3:].strip())
h3 = True
# Any other non-blank line is an article reference.
elif pmid.strip():
# NOTE(review): only the flag is set here; presumably the table's opening
# markup was written at this point as well -- confirm.
if (h3 or h4) and not table_mode:
table_mode = True
if pmid[:4] == "http":
# NOTE(review): the body of the dx.doi.org branch is missing, and so is the
# `else:` that presumably introduced the two lines below.
if pmid[:18] == "http://dx.doi.org/":
print("url=%s" % pmid)
# NOTE(review): `p` is not written to sum_file anywhere in the visible code.
p = PubMedObject(url=pmid).render()
# NOTE(review): `pmid` still carries the trailing newline from the file
# iteration, so isdigit() is False for every line that ends in "\n"; strip
# before testing. The body of this branch (and any final `else:`) is missing.
elif pmid.isdigit():
# After the loop: checks which headings were written.
# NOTE(review): the bodies of both `if`s are missing; presumably they close
# whatever table/accordion is still open -- confirm.
if h3:
if h4:
# NOTE(review): no matching `try:` is visible for this handler.
except BaseException as e:
print traceback.format_exc()
print "Error: %s %s" % (sys.exc_info()[0], e.args)
return 1
# NOTE(review): the clause header for the next three lines is missing
# (presumably a bare `except:`, which could never run after
# `except BaseException` above).
# error handling code here
print "Error: %s" % sys.exc_info()[0]
return 1 # exit on error
# NOTE(review): presumably the `else:` branch of the try statement.
raw_input("Press enter to exit.")
return 0 # exit errorlessly
if __name__ == '__main__':
    # main() returns 0 on success and 1 on error; hand that to the shell as
    # the process exit status.
    sys.exit(main(sys.argv))
It now returns an HTML file based on the information it downloaded. Here is the template (the code above reads it from template.html):
<tr>{{ error }}
<td valign="top" class="resourcesICO"><a href="{{ external_url }}" target="_blank"><img src="/image/ico_sitelink.gif" width="24" height="24" /></a></td>
<td><a href="{{ external_url }}">{{ title }}</a><br />
{{ authors }}<br />
<em>{{ citation }}</em></td>
</tr>
When you run it, the program will ask you for the DOI or the PubMed ID. If you do not provide one, it will read pmids.txt. Feel free to use the code as you see fit.