Search code examples
python beautifulsoup web-crawler ncbi

beautifulsoup web crawling search id list


I am attempting to crawl the NCBI E-utilities web page. I want to extract the Id list from the response, as shown below:

image

Here's the code for it:

  import requests
  from bs4 import BeautifulSoup  

  def get_html(url, timeout=30):
      """Fetch *url* and return the response body decoded as UTF-8 text.

      Args:
          url: Address to request with HTTP GET.
          timeout: Seconds to wait for the server before giving up.

      Returns:
          The response body as a string.

      Raises:
          requests.HTTPError: If the server answers with a 4xx/5xx status.
          requests.Timeout: If no response arrives within *timeout* seconds.
      """
      # Without a timeout, requests waits indefinitely on a stalled server.
      response = requests.get(url, timeout=timeout)
      # Fail loudly instead of handing an error page to the parser.
      response.raise_for_status()
      # Decode as UTF-8 regardless of the charset the server declares.
      response.encoding = 'utf-8'
      return response.text


  def get_pmid(html):
      """Print the text content of every <body> element found in *html*.

      Args:
          html: Markup to parse, as a string.
      """
      soup = BeautifulSoup(html, 'lxml')
      for body in soup.select('body'):
          # print() with a single argument runs on Python 2 and Python 3;
          # the bare `print text` statement is a SyntaxError on Python 3.
          print(body.get_text())

  # E-utilities esearch request against the pubmed database; the search
  # term and result options are carried in the query string.
  url_ncbi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22D-PANTOTHENIC+ACID%22&retmax=2000&usehistory=y&field=Title/Abstracts"  
  # Download the response, then print the text of its body.
  html = get_html(url_ncbi)
  get_pmid(html)

I want to use the select() function to extract the text, but I cannot work out which selector to pass in: for texts in soup.select(' ').

I'm confused by the multiple layers of classes and ids in the page source, like this:

like this


Solution

  • To get all of the ID tags, you can use the find_all() function:

    import requests
    from bs4 import BeautifulSoup  
    
    
    def get_html(url, timeout=30):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Args:
            url: Address to request with HTTP GET.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as a string.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within *timeout* seconds.
        """
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of returning an error page that would parse
        # to an empty id list further down.
        response.raise_for_status()
        # Decode as UTF-8 regardless of the charset the server declares.
        response.encoding = 'utf-8'
        return response.text
    
    
    def get_pmid(html):
        """Collect the text of every ``id`` tag found in *html*.

        Args:
            html: Markup to parse, as a string.

        Returns:
            A list with one string per matching tag, in the order
            ``find_all`` returns them; empty when there is no match.
        """
        parsed = BeautifulSoup(html, 'lxml')
        return [tag.text for tag in parsed.find_all('id')]
    
    # E-utilities esearch request against the pubmed database; the search
    # term and result options are carried in the query string.
    url_ncbi = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%22D-PANTOTHENIC+ACID%22&retmax=2000&usehistory=y&field=Title/Abstracts"  
    # Download the response, pull the id strings out of it, and show them.
    html = get_html(url_ncbi)
    all_ids = get_pmid(html)
    print(all_ids)
    

    Prints:

    ['29737393', '29209902', '24632028', '23727638', '22536244', '22052867', '15371742', '12204559', '10885798', '16348362', '3096335', '3734807', '6247641', '6997858', '761345', '108510', '355840', '1003285', '4676550', '5804470', '6076800', '6076775', '6012920', '14091285']