Search code examples
python web-scraping beautifulsoup nonetype

Webscraping Error: 'NoneType' object has no attribute 'text'


Hi I'm trying to scrape data from a website and it worked fine for quite some time and now I get this error message: "'NoneType' object has no attribute 'text'" and it's not scraping any data anymore. They must have changed something on the website but I can't figure out what.

The error occurs in the line: "data_page = soup_page.find('script', text=r_page).text"

That's my code:

from bs4 import BeautifulSoup as bs
import urllib.request
from urllib.request import urlopen
import requests
import time
import re
from datetime import datetime
import pandas as pd
import json

# Download one search-result page and collect the listing ids from the JSON
# that follows "resultListModel:" inside one of the page's <script> tags.
for seite in range(1):
    
    print("Loop " + str(seite) + " startet.")
    df = pd.DataFrame()
    l=[]  # listing ids gathered from this page

    # NOTE(review): the except/finally clause belonging to this try is not
    # part of the excerpt shown here.
    try:
        
        page = ("https://www.immobilienscout24.de/Suche/radius/neubauwohnung-kaufen?centerofsearchaddress=Krefeld;47799;Grenzstra%C3%9Fe;;;Bockum&geocoordinates=51.33798;6.58608;1.0&enteredFrom=result_list")
        print(page)

        # The request is sent with the library's default headers only.
        res_page = requests.get(page)
        soup_page = bs(res_page.content, 'lxml')
        # Captures everything after "resultListModel:" up to the end of that line.
        r_page = re.compile(r'resultListModel:(.*)')
        # find() returns None when no <script> text matches the pattern; the
        # attribute access on that None is what raises
        # "'NoneType' object has no attribute 'text'".
        data_page = soup_page.find('script', text=r_page).text
        # Strip the trailing comma so the captured text can be parsed as JSON.
        script_page = r_page.findall(data_page)[0].rstrip(',')
        results_page = json.loads(script_page)

        for item in results_page['searchResponseModel']['resultlist.resultlist']['resultlistEntries'][0]['resultlistEntry']:
            l.append(item['@id'])
            if 'similarObjects' in item:
                # Apparently 'similarObject' is either a list of dicts or a
                # single dict; iterating a dict yields its keys, hence the
                # '@id' key comparison in the second branch.
                for i in item['similarObjects'][0]['similarObject']:
                    if isinstance(i,dict):
                        l.append(i['@id'])
                    elif i == '@id':
                        l.append(item['similarObjects'][0]['similarObject'][i])
        l = list(set(l))  # drop duplicate ids (order is not preserved)

Solution

  • The server returns a CAPTCHA page if you don't specify the User-Agent and Accept-Language HTTP headers:

    import json
    import re  # required by re.compile() below; missing from the original snippet

    import requests
    import pandas as pd
    from bs4 import BeautifulSoup

    # Browser-like headers: without User-Agent and Accept-Language the server
    # answers with a CAPTCHA page that does not contain the result JSON.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0',
        'Accept-Language': 'en-US,en;q=0.5'
    }

    # Download one search-result page and collect the listing ids from the
    # JSON that follows "resultListModel:" inside one of its <script> tags.
    for seite in range(1):
        print("Loop " + str(seite) + " startet.")
        df = pd.DataFrame()
        l=[]  # listing ids gathered from this page

        page = ("https://www.immobilienscout24.de/Suche/radius/neubauwohnung-kaufen?centerofsearchaddress=Krefeld;47799;Grenzstra%C3%9Fe;;;Bockum&geocoordinates=51.33798;6.58608;1.0&enteredFrom=result_list")
        res_page = requests.get(page, headers=headers)
        soup_page = BeautifulSoup(res_page.content, 'lxml')
        # Captures everything after "resultListModel:" up to the end of that line.
        r_page = re.compile(r'resultListModel:(.*)')
        script_tag = soup_page.find('script', text=r_page)
        if script_tag is None:
            # find() returns None when no <script> matches (e.g. a CAPTCHA page
            # was served); fail with a clear message instead of an opaque
            # "'NoneType' object has no attribute ..." error.
            raise RuntimeError(
                "No <script> containing 'resultListModel:' found in the response "
                "(HTTP status " + str(res_page.status_code) + "); "
                "the server may have returned a CAPTCHA page."
            )
        data_page = script_tag.string
        # Strip the trailing comma so the captured text can be parsed as JSON.
        script_page = r_page.findall(data_page)[0].rstrip(',')
        results_page = json.loads(script_page)

        for item in results_page['searchResponseModel']['resultlist.resultlist']['resultlistEntries']:
            item = item['resultlistEntry']
            l.append(item['@id'])
            if 'similarObjects' in item:
                # Apparently 'similarObject' is either a list of dicts or a
                # single dict; iterating a dict yields its keys, hence the
                # '@id' key comparison in the second branch.
                for i in item['similarObjects'][0]['similarObject']:
                    if isinstance(i,dict):
                        l.append(i['@id'])
                    elif i == '@id':
                        l.append(item['similarObjects'][0]['similarObject'][i])
        l = list(set(l))  # drop duplicate ids (order is not preserved)

    print(l)
    

    Prints:

    ['119256589', '119215242', '119254488', '119256425', '119254296', '119256175', '119240835']