Search code examples
pythonseleniumscreen-scraping

Extracting data from p tag within div tags


I am trying to collect some information from https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId= using Python Selenium. The details are inside a div tag that follows each p tag, and that div tag is only shown when the p tag is clicked. I can get the information for the first p tag, but I cannot iterate through the remaining p tags: the code only ever selects the first p tag and never collects data from the others. Also, is it possible to find the number of pages so that I can iterate through to the end?

import time

import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from seleniumwire import webdriver

# Scrape artist-management details from the classicalmusicartists.com listing.
#
# Flow: open the search page, choose a category, submit the search, then click
# each artist heading and print the fields of the panel that belongs to it.

url = 'https://www.classicalmusicartists.com/cma/artists.aspx'
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path = '/home/ubuntu/selenium_drivers/chromedriver', options = options)
driver.get(url)
driver.implicitly_wait(2)


def _field_text(panel, css_class):
    """Return the text of the first div with class *css_class* inside *panel*.

    Returns an empty string when the panel has no such div, so that one
    missing field does not abort the whole run with NoSuchElementException.
    """
    # The leading "." keeps the search inside *panel*; a bare "//" would
    # search the whole document and always return the first artist's data.
    matches = panel.find_elements("xpath", f".//div[@class='{css_class}']")
    return matches[0].text if matches else ""


# Select the category by its position in the drop-down and run the search.
category_select = Select(driver.find_element(By.ID, "ctl00_cphMainContent_lstCategory"))
category_select.select_by_index(6)
driver.find_element(By.ID, "ctl00_cphMainContent_btnSearch").click()
# Give the results page a moment to load before collecting the headings.
time.sleep(1)

list_span_elements = driver.find_elements("xpath","//div[@class='artists-by-category']/div/p[@class='expand-heading']")
for x in list_span_elements:
    # Click THIS heading (not the first one on the page) to open its panel.
    x.click()
    # Short pause so the panel is open before its text is read.
    time.sleep(0.5)
    name = x.text
    # NOTE(review): assumes the details panel is the first div.expand sibling
    # that follows the heading -- confirm against the live page markup.
    panel = x.find_element("xpath", "./following-sibling::div[@class='expand'][1]")
    title = panel.text
    manager_name = _field_text(panel, "artist-management-manager")
    country = _field_text(panel, "artist-management-countries")
    artist_category = _field_text(panel, "artist-management-categories")
    contact_num = _field_text(panel, "artist-management-telephone")
    email = _field_text(panel, "artist-management-email")
    website = _field_text(panel, "artist-management-website")
    # The slices drop a fixed number of leading characters from each field.
    print(name, "\n",title,"\n", manager_name,"\n", country[9:],"\n", artist_category[10:],"\n", 
          contact_num[3:],"\n", email[3:],"\n", website[3:])

# Move on to the next results page (only a single step, as in the original).
driver.find_element(By.LINK_TEXT, "Next").click()

Solution

  • A more elegant solution using Scrapy

    1. The webpage isn't dynamic, meaning all the required data is already in the static HTML DOM

    2. I've handled pagination in the start URLs by generating one URL per page with the range function in a for loop

    Working code as an example:

    import scrapy
    
    class MusicSpider(scrapy.Spider):
        """Spider that reads the artist listing pages and yields manager records.

        ``start_urls`` holds one listing URL per ``page_num`` value from 1 to 23.
        """

        name = 'music'
        start_urls = [
            'https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=' + str(page_num)
            for page_num in range(1, 24)
        ]

        def parse(self, response):
            """Yield one dict for each management card found on the page.

            For every element with class ``expand-heading``, the elements
            selected by the following-sibling XPath are visited until a ``p``
            tag is reached. A sibling is emitted only when at least one of
            title, phone, email or website was extracted from it.
            """
            for heading in response.xpath('//*[@class="expand-heading"]'):
                # Joined heading text with surrounding whitespace and
                # non-breaking spaces removed; it is the same for every card
                # under this heading, so it is built once here.
                artist_name = ''.join(
                    heading.xpath('.//text()').getall()
                ).strip().replace('\xa0', '')

                for sibling in heading.xpath('.//following-sibling::*'):
                    # Stop at the first <p> among the selected siblings.
                    if sibling.root.tag == "p":
                        break

                    title = sibling.xpath('.//*[@class="artist-management"]/div[1]/text()').get()
                    phone = sibling.xpath('.//*[@class="artist-label" and contains(text(),"t:")]/../text()').get()
                    email = sibling.xpath('.//*[@class="artist-label" and contains(text(),"e:")]/../a/text()').get()
                    website = sibling.xpath('.//*[@class="artist-label" and contains(text(),"w:")]/../a/text()').get()

                    # Skip siblings that produced none of the four fields.
                    if not (title or phone or email or website):
                        continue

                    yield {
                        'Name': artist_name,
                        'title': title,
                        'phonr': phone,
                        'email': email,
                        'website': website,
                    }
    

    Output:

    {'Name': 'STOUT,David(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' +44 20 3176 5500', 'email': '[email protected]', 'website': 'www.rayfieldallied.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'STOYANOV,Vladimir(Baritone)', 'title': 'General Manager', 'phonr': ' +39 051 455 395', 'email': None, 'website': 'http://www.melosopera.com/en/'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'STRUCKMANN,Falk(Baritone)', 'title': 'General Manager', 'phonr': None, 'email': '[email protected]', 'website': 'www.arsis-artists.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'SUART,Richard(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' +44 1825 840437', 'email': '[email protected]', 'website': 'www.musichall.uk.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'SULIMSKY,Vladislav(Baritone)', 'title': 'General Manager', 'phonr': ' +33 1 4431 0010', 'email': '[email protected]', 'website': 'www.imgartists.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'SUMEGI,Daniel(Baritone, Bass, Bass-baritone)', 'title': 'Local Manager', 'phonr': ' +61 411 129 690', 'email': '[email protected]', 'website': 'www.patricktogher.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'SUMUEL,Michael(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' +1 212 994 3500', 'email': '[email protected]', 'website': 'www.imgartists.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'SZOT,Paulo(Baritone)', 'title': 'Local Manager', 'phonr': '  +33 (0) 9 77 80 22 43', 'email': None, 'website': 'https://backstage-opera.eu/'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'TANOVITSKI,Alexeï(Baritone, Bass, Bass-baritone)', 'title': 'Regional Manager', 'phonr': ' +33 1 4234 5347', 'email': '[email protected]', 'website': 'www.musicaglotz.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'TERFEL,Bryn(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': '  +44 29 2075 0821', 'email': '[email protected]', 'website': 'www.harlequin-agency.co.uk'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'TÉZIER,Ludovic (Baritone)', 'title': 'General Manager', 'phonr': ' +49 89 290 7470', 'email': '[email protected]', 'website': 'www.hilbert.de'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'TÉZIER,Ludovic(Baritone, Bass-baritone)', 'title': 'General Manager', 'phonr': ' +49 89 290 7470', 'email': '[email protected]', 'website': 'www.hilbert.de'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'THATCHER,Harry(Baritone)', 'title': 'General Manager', 'phonr': ' 07720773910', 'email': None, 'website': 
    'www.stevephillipsmanagement.co.uk'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'THIRION,Ivan(Baritone)', 'title': 'General Manager', 'phonr': ' +32 9 330 3990', 'email': '[email protected]', 'website': 'www.arien-artists.com'}
    2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
    {'Name': 'TIBBETTS,John(Baritone)', 'title': 'General Manager', 'phonr': ' +1 617 651 4600', 'email': None, 'website': 'www.athloneartists.com'}
    2022-09-24 17:43:53 [scrapy.core.engine] INFO: Closing spider (finished)
    2022-09-24 17:43:53 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 7232,
     'downloader/request_count': 23,
     'downloader/request_method_count/GET': 23,
     'downloader/response_bytes': 1831629,
     'downloader/response_count': 23,
     'downloader/response_status_count/200': 23,
     'elapsed_time_seconds': 13.257796,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2022, 9, 24, 11, 43, 53, 287520),
     'item_scraped_count': 457,