Search code examples
pythonweb-scrapingscrapyscrapy-pipeline

I keep getting the error TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'


I have made a scrapy spider that scraped yellow pages for plumbers. The adress is broked up into two parts. So after I have gotten these parts I put them together in the pipelines file. Bellow is the out put for one instance of my spider.

 'locality_': 'Manassas, VA 20109',
 'logo': None,
 'name': 'F H Furr Plumbing-Heating & Air Conditioning',
 'number_of_riews': None,
 'payment_mentod': 'check',
 'phone_number': '(571) 234-6893',
 'stars_out_of_five': None,
 'street_adress': '9040 Mike Garcia Dr',
 'website': 'http://www.fhfurr.com'}

But my problome is I get the error full_address += f'{locailty_} {street_adress}' TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'. Why would it do this beacuse both the street address and the locality have values.

spider.py

import scrapy
from yp_scraper.items import PlubmerInfo

class PlumberScraperSpider(scrapy.Spider):
    name = 'plumber_scraper'
    allowed_domains = ['yellowpages.com']
    start_urls = ['https://www.yellowpages.com/']

    def parse(self, response):
        #bellow are the area name and state these values will be plued into the start url to find the area in wich we are serchign for plumbers
        area = "fairfax"
        state = "va"#must be an abreation
        #the value bellow is the amount of pages you want to scrape
        numer_of_pages = 10 
        page_numebr = 1

        #while numer_of_pages > 0:
        url = f'https://www.yellowpages.com/{area}-{state}/plumbers?page={page_numebr}'
            #page_numebr += 1
            #numer_of_pages -= 1
        print('tesssssssssssssssst')
        yield response.follow(url, callback=self.parse_plumbers)

    def parse_plumbers(self, response):
        print('text2')
        plumber_item = PlubmerInfo()
        print('yes_man')
        plumbers = response.css('div.result')

        for plumber in plumbers:
            starter_indidual_url = plumber.css('a.business-name ::attr(href)').get()
            indidual_url = f'https://www.yellowpages.com{starter_indidual_url}'
            yield response.follow(indidual_url, callback=self.parse_indvidual_plumbers)

    def parse_indvidual_plumbers(self, response):
        plumber_item = PlubmerInfo()
        print(response.xpath('//*[@id="default-ctas"]/a[3]/span/text()').get())
        plumber_item['name'] = response.css('h1.business-name ::text').get()
        plumber_item['phone_number'] = response.css('a.phone ::text').get() 
        plumber_item['website'] = response.css('a.website-link ::attr(href)').get()
        plumber_item['genral_info'] = response.css('dd.general-info ::text').get()
        plumber_item['payment_mentod'] = response.css('dd.payment ::text').get()
        plumber_item['stars_out_of_five'] = response.css('div.rating-stars ::attr(class)').get()
        plumber_item['number_of_riews'] = response.css('span.count ::text').get()
        plumber_item['locality_'] = response.xpath('//*[@id="default-ctas"]/a[3]/span/text()').get()
        plumber_item['street_adress'] = response.css('span.address ::text').get()
        #plumber_item['services'] = response.css('div.locality ::text').get()
        plumber_item['email'] = response.css('a.email-business ::attr(href)').get()
        plumber_item['logo'] = response.css('dd.logo ::attr(href)').get()

        yield plumber_item

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class YpScraperPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        #combindes locality and street adress
        locailty_ = adapter.get('locality_')
        street_adress = adapter.get('street_adress')
        full_address = adapter.get('full_address')
        if locailty_ is not None:
            for i in street_adress:
                full_address += f'{locailty_} {street_adress}'


        return item


Solution

  • This will return None if "full_address" is missing, which causes the problem:

    full_address = adapter.get('full_address')
    

    One way to fix it is to tell .get() to return a blank string instead of None as a default value if "full_address" is missing:

    full_address = adapter.get('full_address', '')