Obtaining data from PubMed using Python
PubMed article pages have URLs of the form: http://www.ncbi.nlm.nih.gov/pubmed/<id>
If you know the ID, you can fetch that page and you will have access to the article. The abstract is contained within a structure like:
<div class="abstr"><h3>Abstract</h3><div class=""><p>α-latrotoxin and snake presynaptic phospholipases A2 neurotoxins target the presynaptic membrane of axon terminals of the neuromuscular junction....</p></div></div>
You would then need a tool to extract the text from that markup. I would suggest using: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
You will still need a tool to actually fetch the HTML. For that I would use PhantomJS or the ever-popular requests module.
Your workflow would look something like:
import requests
from bs4 import BeautifulSoup

pubmed_ids = [1, 2, 3]
abstracts = []

for pubmed_id in pubmed_ids:
    html_for_id = requests.get('http://www.ncbi.nlm.nih.gov/pubmed/{0}'.format(pubmed_id)).text
    soup = BeautifulSoup(html_for_id, 'html.parser')
    abstract = soup.find('selector for abstract')  # replace with the actual selector, see below
    abstracts.append(abstract)
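For example, based on the div class="abstr" markup shown above, a sketch of the extraction step might look like this (the abstr class name is taken from the snippet above and is an assumption to verify, since NCBI may change its markup):

import requests
from bs4 import BeautifulSoup

def fetch_pubmed_abstract(pubmed_id):
    # Download the article page and parse it.
    html = requests.get('http://www.ncbi.nlm.nih.gov/pubmed/{0}'.format(pubmed_id)).text
    soup = BeautifulSoup(html, 'html.parser')
    # The abstract sits inside <div class="abstr"> on the article page.
    abstract_div = soup.find('div', class_='abstr')
    if abstract_div is None:
        return None  # not every record has an abstract
    return abstract_div.get_text(' ', strip=True)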
Wow, I was working on a similar project myself just a week or so ago!
Edit: I recently updated the code to take advantage of BeautifulSoup. I have my own virtualenv for it, but you can install it with pip.
Basically, my program takes a PubMed ID, a DOI, or a text file of lines of PubMed IDs and/or DOIs, and grabs information about the article. It can easily be tweaked for your own needs to obtain the abstract, but here's my code:
import re
import sys
import traceback

from bs4 import BeautifulSoup
import requests


class PubMedObject(object):
    soup = None
    url = None

    # pmid is a PubMed ID
    # url is the url of the PubMed web page
    # search_term is the string used in the search box on the PubMed website
    def __init__(self, pmid=None, url='', search_term=''):
        if pmid:
            pmid = pmid.strip()
            url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % pmid
        if search_term:
            url = "http://www.ncbi.nlm.nih.gov/pubmed/?term=%s" % search_term
        page = requests.get(url).text
        self.soup = BeautifulSoup(page, "html.parser")

        # set the url to be the fixed one with the PubMedID instead of the search_term
        if search_term:
            try:
                url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % self.soup.find("dl", class_="rprtid").find("dd").text
            except AttributeError:  # NoneType has no find method
                print("Error on search_term=%s" % search_term)
        self.url = url

    def get_title(self):
        return self.soup.find(class_="abstract").find("h1").text

    # returns the string with the list of authors
    def get_authors(self):
        result = []
        author_list = [a.text for a in self.soup.find(class_="auths").findAll("a")]
        for author in author_list:
            lname, remainder = author.rsplit(' ', 1)
            # add periods after each letter in the first name
            fname = ".".join(remainder) + "."
            result.append(lname + ', ' + fname)
        return ', '.join(result)

    def get_citation(self):
        return self.soup.find(class_="cit").text

    def get_external_url(self):
        url = None
        doi_string = self.soup.find(text=re.compile("doi:"))
        if doi_string:
            doi = doi_string.split("doi:")[-1].strip().split(" ")[0][:-1]
            if doi:
                url = "http://dx.doi.org/%s" % doi
        else:
            doi_string = self.soup.find(class_="portlet")
            if doi_string:
                doi_string = doi_string.find("a")['href']
                if doi_string:
                    return doi_string
        return url or self.url

    def render(self):
        template_text = ''
        with open('template.html', 'r') as template_file:
            template_text = template_file.read()
        try:
            template_text = template_text.replace("{{ external_url }}", self.get_external_url())
            template_text = template_text.replace("{{ citation }}", self.get_citation())
            template_text = template_text.replace("{{ title }}", self.get_title())
            template_text = template_text.replace("{{ authors }}", self.get_authors())
            template_text = template_text.replace("{{ error }}", '')
        except AttributeError:
            template_text = template_text.replace("{{ external_url }}", '')
            template_text = template_text.replace("{{ citation }}", '')
            template_text = template_text.replace("{{ title }}", '')
            template_text = template_text.replace("{{ authors }}", '')
            template_text = template_text.replace("{{ error }}", '<!-- Error -->')
        return template_text  # result.html is opened in text mode, so return str rather than bytes
def start_table(f):
    f.write('\t\t\t\t\t\t\t\t\t<div class="resourcesTable">\n')
    f.write('\t\t\t\t\t\t\t\t\t\t<table border="0" cellspacing="0" cellpadding="0">\n')


def end_table(f):
    f.write('\t\t\t\t\t\t\t\t\t\t</table>\n')
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')


def start_accordion(f):
    f.write('\t\t\t\t\t\t\t\t\t<div class="accordion">\n')


def end_accordion(f):
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')
def main(args):
    try:
        # program's main code here
        print("Parsing pmids.txt...")
        with open('result.html', 'w') as sum_file:
            sum_file.write('<!--\n')
        with open('pmids.txt', 'r') as pmid_file:
            with open('result.html', 'a') as sum_file:
                for pmid in pmid_file:
                    sum_file.write(pmid)
                sum_file.write('\n-->\n')
        with open('pmids.txt', 'r') as pmid_file:
            h3 = False
            h4 = False
            table_mode = False
            accordion_mode = False
            with open('result.html', 'a') as sum_file:
                for pmid in pmid_file:
                    if pmid[:4] == "####":
                        if h3 and not accordion_mode:
                            start_accordion(sum_file)
                            accordion_mode = True
                        sum_file.write('\t\t\t\t\t\t\t\t\t<h4><a href="#">%s</a></h4>\n' % pmid[4:].strip())
                        h4 = True
                    elif pmid[:3] == "###":
                        if h4:
                            if table_mode:
                                end_table(sum_file)
                                table_mode = False
                            end_accordion(sum_file)
                            h4 = False
                            accordion_mode = False
                        elif h3:
                            end_table(sum_file)
                            table_mode = False
                        sum_file.write('\t\t\t\t\t\t\t\t<h3><a href="#">%s</a></h3>\n' % pmid[3:].strip())
                        h3 = True
                    elif pmid.strip():
                        if (h3 or h4) and not table_mode:
                            start_table(sum_file)
                            table_mode = True
                        if pmid[:4] == "http":
                            if pmid[:18] == "http://dx.doi.org/":
                                sum_file.write(PubMedObject(search_term=pmid[18:]).render())
                            else:
                                print("url=%s" % pmid)
                                p = PubMedObject(url=pmid).render()
                                sum_file.write(p)
                                print(p)
                        elif pmid.isdigit():
                            sum_file.write(PubMedObject(pmid).render())
                        else:
                            sum_file.write(PubMedObject(search_term=pmid).render())
                if h3:
                    if h4:
                        end_table(sum_file)
                        end_accordion(sum_file)
                    else:
                        end_table(sum_file)
        print("Done!")
    except BaseException as e:
        print(traceback.format_exc())
        print("Error: %s %s" % (sys.exc_info()[0], e.args))
        return 1
    except:
        # error handling code here (unreachable: BaseException above already catches everything)
        print("Error: %s" % sys.exc_info()[0])
        return 1  # exit on error
    else:
        input("Press enter to exit.")
        return 0  # exit errorlessly


if __name__ == '__main__':
    sys.exit(main(sys.argv))
It now returns an HTML file based on the information it downloaded. Here is template.html, which render() reads:
<tr>{{ error }}
<td valign="top" class="resourcesICO"><a href="{{ external_url }}" target="_blank"><img src="/image/ico_sitelink.gif" width="24" height="24" /></a></td>
<td><a href="{{ external_url }}">{{ title }}</a><br />
{{ authors }}<br />
<em>{{ citation }}</em></td>
</tr>
When you run it, the program will ask you for the DOI or the PubMed ID. If you do not provide one, it will read pmids.txt. Feel free to use the code as you see fit.
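If the abstract is what you're after, one way to tweak the class is to add a hypothetical get_abstract method (not part of the script above; it reuses the same "abstract" container that get_title relies on, which is an assumption about NCBI's markup you should verify):

    def get_abstract(self):
        # Sketch only: reuses the "abstract" container from get_title();
        # check the class name against the actual page markup.
        block = self.soup.find(class_="abstract")
        if block is None:
            return ''
        paragraph = block.find("p")
        return paragraph.text.strip() if paragraph else block.get_text(" ", strip=True)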
The metapub library was built for this. Metapub has been tested on more than one third of the PubMed database (as of 2019).
from metapub import PubMedFetcher

fetch = PubMedFetcher()
pmids = [<your list of ids>]
for pmid in pmids:
    article = fetch.article_by_id(pmid)
    print(article.abstract)
And if you want to get to the full text of each article, you can do this:
from metapub import FindIt
pmids = [<yourlist>]
for pmid in pmids:
    src = FindIt(pmid)
    print(src.doi)
    print(src.url)
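When src.url comes back non-empty, you could then download the full text with requests, along these lines (a sketch; whether the publisher serves the file without a subscription varies, and the filename choice is just illustrative):

import requests
from metapub import FindIt

pmid = '<your id>'
src = FindIt(pmid)
if src.url:
    response = requests.get(src.url)
    with open(pmid + '.pdf', 'wb') as outfile:
        outfile.write(response.content)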
I have tested this library against literally millions of articles, to the point where the Medline XML (i.e. Entrez) parser is about 99% robust. And trust me, this data is messy.
Source: I'm the author.
Using Biopython's Entrez module, you can get the abstract along with all the other metadata quite easily. This will print the abstract:
from Bio import Entrez
from Bio.Entrez import efetch

Entrez.email = 'your.email@example.com'  # NCBI asks you to identify yourself when using Entrez

def print_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    print(handle.read())
And here is a function that will fetch XML and return just the abstract:
from Bio.Entrez import efetch, read

def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        # records without an abstract raise KeyError; an empty AbstractText list raises IndexError
        return None
P.S. I actually needed to do this kind of thing for a real task, so I organized the code into a class -- see this gist.
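The gist isn't reproduced here, but a minimal sketch of what such a class might look like, assuming it simply wraps the fetch_abstract logic above, is:

from Bio import Entrez
from Bio.Entrez import efetch, read


class PubMedClient:
    """Small wrapper around Bio.Entrez for fetching PubMed abstracts."""

    def __init__(self, email):
        # NCBI asks callers to identify themselves with an email address.
        Entrez.email = email

    def fetch_abstract(self, pmid):
        # Same logic as the fetch_abstract function above, just wrapped in a class.
        handle = efetch(db='pubmed', id=pmid, retmode='xml')
        xml_data = read(handle)[0]
        try:
            article = xml_data['MedlineCitation']['Article']
            return article['Abstract']['AbstractText'][0]
        except (IndexError, KeyError):
            return None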