Search code examples
phpxsltcurlxpathpubmed

Parsing curl results from a PubMed query and formatting them into a citation


This is a follow-up question to this question.

Same idea: I'm pulling data from PubMed as XML and using curl to process those results. This allows me to grab the information I need (a list of pub IDs) and use that as a variable for ANOTHER PubMed scrape. The $name will eventually be dynamic.

<?php
/**
 * Looks up an author's publications on PubMed and echoes one citation per record.
 *
 * Step 1 asks ESearch for up to 50 publication IDs matching the author name.
 * Step 2 passes those IDs to ESummary and prints selected fields of every
 * <DocSum> in document order, followed by two line breaks.
 */

/**
 * Performs a GET request against an E-utilities URL and returns the raw body.
 *
 * @param string $url Fully built, already URL-encoded request URL.
 * @return string Response body as received.
 * @throws RuntimeException When the transfer fails or the server answers with an HTTP error status.
 */
function pubmedRequest(string $url): string
{
    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException('Unable to initialise cURL.');
    }

    // All parameters travel in the query string, so a plain GET is used instead
    // of forcing the verb to POST without a body. TLS peer verification is left
    // at its secure default rather than being switched off.
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_MAXREDIRS      => 10,
        CURLOPT_FRESH_CONNECT  => true,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);

    $body   = curl_exec($ch);
    $error  = curl_error($ch);
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($body === false) {
        throw new RuntimeException('PubMed request failed: ' . $error);
    }
    if ($status >= 400) {
        throw new RuntimeException('PubMed request failed with HTTP status ' . $status . '.');
    }

    return $body;
}

$name   = 'white,theodore';
$eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';

// Return xml data from PubMed based on author search name.
// http_build_query() percent-encodes the term, which matters once $name is dynamic.
$output = pubmedRequest($eutils . 'esearch.fcgi?' . http_build_query([
    'db'      => 'pubmed',
    'term'    => $name . '[author]',
    'retmode' => 'xml',
    'retmax'  => 50,
], '', '&'));

$xml = simplexml_load_string($output);
if ($xml === false) {
    throw new RuntimeException('PubMed ESearch returned a response that is not well-formed XML.');
}

// Collect the Publication IDs and join them without a trailing comma
$idNodes  = $xml->xpath('IdList/Id') ?: [];
$idValues = [];
foreach ($idNodes as $id) {
    $idValues[] = trim((string) $id);
}
$ids = implode(',', $idValues);

if ($ids === '') {
    // Nothing matched: skip the summary request instead of sending an empty id list
    echo 'No publications found.';
} else {
    // Plug that string of IDs into another PubMed search, this one returning XML data for Publication Summaries
    $path = $eutils . 'esummary.fcgi?' . http_build_query([
        'db' => 'pubmed',
        'id' => $ids,
    ], '', '&');

    $data = pubmedRequest($path);

    $cxml = simplexml_load_string($data);
    if ($cxml === false) {
        throw new RuntimeException('PubMed ESummary returned a response that is not well-formed XML.');
    }

    // The text comes from a remote service, so escape it before placing it in HTML
    $esc = static function ($value): string {
        return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
    };

    // Parse those results and print only what is needed for Citation format
    foreach ($cxml->DocSum as $docsum) {
        foreach ($docsum->children() as $item) {
            // List-type items: print each author held by this item
            foreach ($item->children() as $details) {
                if ((string) $details['Name'] === 'Author') {
                    echo $esc($details) . "., ";
                }
            }

            // Scalar items: an item has a single Name, so at most one case fires
            switch ((string) $item['Name']) {
                case 'FullJournalName':
                    echo $esc($item) . ". ";
                    break;
                case 'Title':
                    echo "<strong>" . $esc($item) . "</strong> ";
                    break;
                case 'Volume':
                    echo "Vol." . $esc($item) . ", ";
                    break;
                case 'Issue':
                    echo "Issue" . $esc($item) . ". ";
                    break;
                case 'PubDate':
                    echo $esc($item) . ". ";
                    break;
            }

            // List-type items: print each publication type held by this item
            foreach ($item->children() as $details) {
                if ((string) $details['Name'] === 'PubType') {
                    echo $esc($details) . ", ";
                }
            }
        }
        // "</br>" is not a valid tag; "<br>" is the element browsers were substituting for it
        echo "<br><br>";
    }
}

?>

Which returns the following XML data (this is ONE result).

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary v1 20041029//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20041029/esummary-v1.dtd">
<eSummaryResult>
  <DocSum>
    <Id>27431223</Id>
    <Item Name="PubDate" Type="Date">2016 Oct</Item>
    <Item Name="EPubDate" Type="Date">2016 Sep 23</Item>
    <Item Name="Source" Type="String">Antimicrob Agents Chemother</Item>
    <Item Name="AuthorList" Type="List">
      <Item Name="Author" Type="String">Bhattacharya S</Item>
      <Item Name="Author" Type="String">Sobel JD</Item>
      <Item Name="Author" Type="String">White TC</Item>
    </Item>
    <Item Name="LastAuthor" Type="String">White TC</Item>
    <Item Name="Title" Type="String">A Combination Fluorescence Assay Demonstrates Increased Efflux Pump Activity as a Resistance Mechanism in Azole-Resistant Vaginal Candida albicans Isolates.</Item>
    <Item Name="Volume" Type="String">60</Item>
    <Item Name="Issue" Type="String">10</Item>
    <Item Name="Pages" Type="String">5858-66</Item>
    <Item Name="LangList" Type="List">
    <Item Name="Lang" Type="String">English</Item>
    </Item>
    <Item Name="NlmUniqueID" Type="String">0315061</Item>
    <Item Name="ISSN" Type="String">0066-4804</Item>
    <Item Name="ESSN" Type="String">1098-6596</Item>
    <Item Name="PubTypeList" Type="List">
      <Item Name="PubType" Type="String">Journal Article</Item>
    </Item>
    <Item Name="RecordStatus" Type="String">Unknown status</Item>
    <Item Name="PubStatus" Type="String">epublish</Item>
    <Item Name="ArticleIds" Type="List">
      <Item Name="pubmed" Type="String">27431223</Item>
      <Item Name="pii" Type="String">AAC.01252-16</Item>
      <Item Name="doi" Type="String">10.1128/AAC.01252-16</Item>
      <Item Name="pmc" Type="String">PMC5038269</Item>
      <Item Name="rid" Type="String">27431223</Item>
      <Item Name="eid" Type="String">27431223</Item>
      <Item Name="pmcid" Type="String">pmc-id: PMC5038269;embargo-date: 2017/04/01;</Item>
    </Item>
    <Item Name="DOI" Type="String">10.1128/AAC.01252-16</Item>
    <Item Name="History" Type="List">
      <Item Name="received" Type="Date">2016/06/10 00:00</Item>
      <Item Name="accepted" Type="Date">2016/07/12 00:00</Item>
      <Item Name="pmc-release" Type="Date">2017/04/01 00:00</Item>
      <Item Name="entrez" Type="Date">2016/07/20 06:00</Item>
      <Item Name="pubmed" Type="Date">2016/07/20 06:00</Item>
      <Item Name="medline" Type="Date">2016/07/20 06:00</Item>
    </Item>
    <Item Name="References" Type="List"></Item>
    <Item Name="HasAbstract" Type="Integer">1</Item>
    <Item Name="PmcRefCount" Type="Integer">0</Item>
    <Item Name="FullJournalName" Type="String">Antimicrobial agents and chemotherapy</Item>
    <Item Name="ELocationID" Type="String">doi: 10.1128/AAC.01252-16</Item>
    <Item Name="SO" Type="String">2016 Oct;60(10):5858-66</Item>
</DocSum>

</eSummaryResult>
</br></br>

Which echoes thus:

2016 Oct. Bhattacharya S., Sobel JD., White TC., A Combination Fluorescence Assay Demonstrates Increased Efflux Pump Activity as a Resistance Mechanism in Azole-Resistant Vaginal Candida albicans Isolates. Vol.60, Issue10. Journal Article, Antimicrobial agents and chemotherapy.

  1. Zavrel M., White TC., Medically important fungi respond to azole drugs: an update. Vol.10, Issue8. Journal Article, Review, Future microbiology.

  2. Esquivel BD., Smith AR., Zavrel M., White TC., Azole drug import into the pathogenic fungus Aspergillus fumigatus. Vol.59, Issue6. Journal Article, Antimicrobial agents and chemotherapy.

2015 Apr. Achterman RR., Moyes DL., Thavaraj S., Smith AR., Blair KM., White TC., Naglik JR., Dermatophytes activate skin keratinocytes via mitogen-activated protein kinase signaling and induce immune responses. Vol.83, Issue4. Journal Article, Infection and immunity.

2015 Feb 3. Ford CB., Funt JM., Abbey D., Issi L., Guiducci C., Martinez DA., Delorey T., Li BY., White TC., Cuomo C., Rao RP., Berman J., Thompson DA., Regev A., The evolution of drug resistance in clinical isolates of Candida albicans. Vol.4, Issue. Journal Article, eLife.

2014 Aug 1. White TC., Findley K., Dawson TL Jr., Scheynius A., Boekhout T., Cuomo CA., Xu J., Saunders CW., Fungi on the skin: dermatophytes and Malassezia. Vol.4, Issue8. Journal Article, Review, Cold Spring Harbor perspectives in medicine.

2014 Jan. Maguire SL., Wang C., Holland LM., Brunel F., Neuvéglise C., Nicaud JM., Zavrel M., White TC., Wolfe KH., Butler G., Zinc finger transcription factors displaced SREBP proteins as the major Sterol regulators during Saccharomycotina evolution. Vol.10, Issue1. Journal Article, PLoS genetics.

2013 Nov 15. Campoli P., Perlin DS., Kristof AS., White TC., Filler SG., Sheppard DC., Pharmacokinetics of posaconazole within epithelial cells and fungi: insights into potential mechanisms of action during treatment and prophylaxis. Vol.208, Issue10. Journal Article, The Journal of infectious diseases.

2013 Jul 8. Achterman RR., White TC., Dermatophytes. Vol.23, Issue13. Journal Article, Current biology : CB.

ETC.

That all works great and produces citations with only the data I need BUT I cannot reorder the results so Author comes first, PubDate comes last, etc. I have tried a number of options, but I'm not familiar enough with any of them and can't seem to crack it.

I've tried attaching an XSLT style sheet but I think that's not working because I'm not actually outputting an XML file. Maybe?

I've tried using the following XPath instead of the SimpleXML block at the bottom to return the citations but keep getting blank results. Because all the data is tagged as <Item>, I'm trying unsuccessfully to use the Name attributes.

// Prints the title of every record in the ESummary response held in $data.
$content = simplexml_load_string($data);
if ($content === false) {
  throw new RuntimeException('PubMed ESummary returned a response that is not well-formed XML.');
}
// Every <Item> sits inside a <DocSum>, never directly under the root element,
// so select the records first; 'Item[@Name]' evaluated at the root matches nothing.
$results = $content->xpath('/eSummaryResult/DocSum') ?: [];
foreach($results as $result) {
  // A predicate must be attached to a node test: this picks the child Item
  // whose Name attribute equals "Title" within the current record.
  $title = $result->xpath('Item[@Name="Title"]');
  // xpath() returns a list even for a single match; skip records without a title
  if ($title) {
    // The title is the element's text content, not an attribute value
    echo htmlspecialchars((string) $title[0], ENT_QUOTES, 'UTF-8') . "<br>";
  }
}

I've also tried DOMXPath and GetElementsByTagName to no avail.

Basically, I'm stuck. I've tried so many variations of each and gotten so many errors I feel like I'm flying in circles. Anyone with more Xpath or XSLT experience have an idea?


Solution

  • Consider the following pure XSLT solution (no foreach loop needed). It handles the node reordering and even the final output, because XSLT can transform XML content into text (i.e., a PHP string) and, of course, into HTML.

    Leave your entire cURL call as is, but replace the nested foreach loops. Below, the XSLT script is embedded as a string; it requires PHP's php_xsl extension to be enabled in php.ini, which gives access to PHP's built-in XSLT 1.0 processor (i.e., libxslt):

    // ... same CURL call ...

    // Parse those results and print only what is needed for Citation format.
    // $data is the raw ESummary response body produced by the CURL call above.
    $cxml = simplexml_load_string($data);
    if ($cxml === false) {
        throw new RuntimeException('PubMed ESummary returned a response that is not well-formed XML.');
    }

    // The stylesheet emits an HTML fragment. With method="html" the processor
    // escapes the text taken from the records, while <strong> and <br/> are
    // written as real elements instead of hand-escaped strings.
    // The root template limits processing to DocSum records so that the text of
    // any other top-level node is not copied to the output by the built-in rules.
    $xslstr = '<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="html" indent="no" encoding="UTF-8"/>
    <xsl:strip-space elements="*"/>

    <xsl:template match="/">
        <xsl:apply-templates select="eSummaryResult/DocSum"/>
    </xsl:template>

    <xsl:template match="DocSum">
        <xsl:if test="Item[@Name=\'AuthorList\' and .!=\'\']">
            <xsl:for-each select="Item[@Name=\'AuthorList\' and .!=\'\']/*">
                <xsl:value-of select="." />
                <xsl:if test="position() != last()">
                    <xsl:text>, </xsl:text>
                </xsl:if>
            </xsl:for-each><xsl:text>. </xsl:text>
        </xsl:if>

        <xsl:if test="Item[@Name=\'FullJournalName\' and .!=\'\']">
            <strong><xsl:value-of select="Item[@Name=\'FullJournalName\']" /></strong>
            <xsl:text>. </xsl:text>
        </xsl:if>
        <xsl:if test="Item[@Name=\'Title\' and .!=\'\']">
            <xsl:value-of select="Item[@Name=\'Title\']" /><xsl:text>, </xsl:text>
        </xsl:if>
        <xsl:if test="Item[@Name=\'Volume\' and .!=\'\']">
            <xsl:value-of select="Item[@Name=\'Volume\']" /><xsl:text>. </xsl:text>
        </xsl:if>
        <xsl:if test="Item[@Name=\'Issue\' and .!=\'\']">
            <xsl:value-of select="Item[@Name=\'Issue\']" /><xsl:text>. </xsl:text>
        </xsl:if>
        <xsl:if test="Item[@Name=\'PubDate\' and .!=\'\']">
            <xsl:value-of select="Item[@Name=\'PubDate\']" /><xsl:text>. </xsl:text>
        </xsl:if>

        <xsl:if test="Item[@Name=\'PubTypeList\' and .!=\'\']">
            <xsl:for-each select="Item[@Name=\'PubTypeList\']/*">
                <xsl:value-of select="." />
                <xsl:if test="position() != last()">
                    <xsl:text>, </xsl:text>
                </xsl:if>
            </xsl:for-each><xsl:text>. </xsl:text>
        </xsl:if>

        <br/><br/>
    </xsl:template>
    </xsl:stylesheet>';

    $xsl = simplexml_load_string($xslstr);
    if ($xsl === false) {
        throw new RuntimeException('The embedded XSLT stylesheet is not well-formed XML.');
    }

    // XSLT TRANSFORMATION
    $proc = new XSLTProcessor;
    if (!$proc->importStyleSheet($xsl)) {
        throw new RuntimeException('The XSLT stylesheet could not be imported.');
    }

    $newXML = $proc->transformToXML($cxml);
    // transformToXML() signals failure with a non-string result
    if (!is_string($newXML)) {
        throw new RuntimeException('The XSLT transformation failed.');
    }

    echo $newXML;
    

    Output

    Citations Output