python selenium web-scraping beautifulsoup google-colaboratory

Getting AttributeError: 'NoneType' object has no attribute 'text' (web-scraping)

This is my case study about web scraping. I got a problem in the final code 'NoneType' object has no attribute 'text' so I tried to fix it with 'getattr' function but it didn't work.

'''

import requests
from bs4 import BeautifulSoup

url = 'https://www.birdsnest.com.au/womens/dresses'

source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')

'''

productlist= soup.find_all('div', id='items')

'''

productlinks = []
for item in productlist:
  for link in item.find_all('a',href=True):
      productlinks.append(url + link['href'])
print(len(productlinks))

'''

productlinks = []
for x in range(1,28):
  source = requests.get(f'https://www.birdsnest.com.au/womens/dresses?_lh=1&page={x}')
  soup = BeautifulSoup(source.content, 'lxml')
  for item in productlist:
      for link in item.find_all('a',href=True):
        productlinks.append(url + link['href'])
print(productlinks)

'''

for link in productlinks:
    source = requests.get(link)
    soup = BeautifulSoup(source.content, 'lxml')

    name = soup.find('h1',class_='item-heading__name').text.strip()
    price = soup.find('p',class_='item-heading__price').text.strip()
    feature = soup.find('div',class_='tab-accordion__content active').text.strip()

    sum = {
      'name':name,
      'price':price,
      'feature':feature
          }
    print(sum)

'''

  ---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-7-d4d46558690d> in <module>()
      3     soup = BeautifulSoup(source.content, 'lxml')
      4 
----> 5     name = soup.find('h1',class_='item-heading__name').text.strip()
      6     price = soup.find('p',class_='item-heading__price').text.strip()
      7     feature = soup.find('div',class_='tab-accordion__content active').text.strip()

AttributeError: 'NoneType' object has no attribute 'text'

---------------------------------------------------------------------------

So I tried to fix with this method, but it didn't work.

 for link in productlinks:
    source = requests.get(link)
    soup = BeautifulSoup(source.content, 'lxml')

    name = getattr(soup.find('h1',class_='item-heading__name'),'text',None)
    price = getattr(soup.find('p',class_='item-heading__price'),'text',None)
    feature = getattr(soup.find('div',class_='tab-accordion__content active'),'text',None)

    sum = {
      'name':name,
      'price':price,
      'feature':feature
          }
    print(sum)

This is the output. It show only 'Nonetype'

{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}

Solution

First of all, always turn JS off for the page you're scraping. Then you'll realize that tag classes change and these are the ones you want to target.

Also, when looping through the pages, don't forget that Python's range() stop value is not inclusive. Meaning, this range(1, 28) will stop on page 27.

Here's how I would go about it:

import json

import requests
from bs4 import BeautifulSoup


cookies = {
    "ServerID": "1033",
    "__zlcmid": "10tjXhWpDJVkUQL",
}

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}


def extract_info(bs: BeautifulSoup, tag: str, attr_value: str) -> list:
    return [i.text.strip() for i in bs.find_all(tag, {"itemprop": attr_value})]


all_pages = []
for page in range(1, 29):
    print(f"Scraping data from page {page}...")

    current_page = f"https://www.birdsnest.com.au/womens/dresses?page={page}"
    source = requests.get(current_page, headers=headers, cookies=cookies)
    soup = BeautifulSoup(source.content, 'html.parser')

    brand = extract_info(soup, tag="strong", attr_value="brand")
    name = extract_info(soup, tag="h2", attr_value="name")
    price = extract_info(soup, tag="span", attr_value="price")

    all_pages.extend(
        [
            {
                "brand": b,
                "name": n,
                "price": p,
            } for b, n, p in zip(brand, name, price)
        ]
    )

print(f"{all_pages}\nFound: {len(all_pages)} dresses.")

with open("all_the_dresses2.json", "w") as jf:
    json.dump(all_pages, jf, indent=4)

This gets you a JSON with all the dresses.

    {
        "brand": "boho bird",
        "name": "Prissy Dress",
        "price": "$189.95"
    },
    {
        "brand": "boho bird",
        "name": "Dandelion Dress",
        "price": "$139.95"
    },
    {
        "brand": "Lula Soul",
        "name": "Dandelion Dress",
        "price": "$179.95"
    },
    {
        "brand": "Honeysuckle Beach",
        "name": "Cotton V-Neck A-Line Splice Dress",
        "price": "$149.95"
    },
    {
        "brand": "Honeysuckle Beach",
        "name": "Lenny Pinafore",
        "price": "$139.95"
    },
and so on for the next 28 pages ...