Search code examples
python · web-scraping · beautifulsoup · tags

Beautiful Soup: need help finding multiple tags


from bs4 import BeautifulSoup
from lxml import etree
import requests
import re

# Page to scrape.
URL = "https://csimarket.com/stocks/at_glance.php?code=AA"

# Browser-like request headers. The User-Agent is assembled from adjacent
# string literals: a backslash continuation *inside* a string literal would
# keep the next line's leading spaces as part of the header value.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}

# Without a timeout, requests can wait indefinitely on an unresponsive server.
webpage = requests.get(URL, headers=HEADERS, timeout=30)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))  # lxml tree of the same markup; not used below

# First element whose href is exactly this value (note the fixed "?s=100").
raw_html = soup.find(href="../Industry/Industry_Data.php?s=100")
span = raw_html.find("span")
# Remove the nested <span>, when there is one, so its text is not printed.
if span is not None:
    span.decompose()
print(raw_html.text.strip())

The code works fine with raw_html = soup.find(href="../Industry/Industry_Data.php?s=100"), but as I go through other pages this part will be different, e.g. ../Industry/Industry_Data.php?s=1000

How do I search for just "../Industry/Industry_Data.php"?


Solution

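  • Select your elements with a CSS attribute selector (href*= matches any href that contains the given substring, whatever the ?s= value is) and check whether a <span> exists before removing it: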

    # href*= selects every <a> whose href contains the given substring.
    for link in soup.select('a[href*="../Industry/Industry_Data.php"]'):
        inner_span = link.span
        # Only strip the nested <span> when the anchor actually has one.
        if inner_span:
            inner_span.decompose()
        print(link.text.strip())
    

    Example

    from bs4 import BeautifulSoup
    import requests
    
    # Page to scrape.
    URL = "https://csimarket.com/stocks/at_glance.php?code=AA"
    
    # Browser-like request headers. Adjacent string literals keep the
    # User-Agent free of source indentation; a backslash continuation inside
    # the literal would embed the next line's leading spaces in the value.
    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    webpage = requests.get(URL, headers=HEADERS, timeout=30)
    # Stop on an HTTP error status instead of silently parsing an error page.
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.content, "html.parser")
    
    # href*= matches every <a> whose href contains the path, so the varying
    # "?s=..." query string no longer matters.
    for a in soup.select('a[href*="../Industry/Industry_Data.php"]'):
        # Some matching anchors have no nested <span>; only remove it if present.
        if a.span:
            a.span.decompose()
        print(a.text.strip())
    

    Output

    Industries At a Glance
    Basic Materials
    Aluminum
    Aluminum
    Basic Materials