I have made a scrapy spider that scraped yellow pages for plumbers. The adress is broked up into two parts. So after I have gotten these parts I put them together in the pipelines file. Bellow is the out put for one instance of my spider.
'locality_': 'Manassas, VA 20109',
'logo': None,
'name': 'F H Furr Plumbing-Heating & Air Conditioning',
'number_of_riews': None,
'payment_mentod': 'check',
'phone_number': '(571) 234-6893',
'stars_out_of_five': None,
'street_adress': '9040 Mike Garcia Dr',
'website': 'http://www.fhfurr.com'}
But my problome is I get the error full_address += f'{locailty_} {street_adress}' TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'
. Why would it do this beacuse both the street address and the locality have values.
spider.py
import scrapy
from yp_scraper.items import PlubmerInfo
class PlumberScraperSpider(scrapy.Spider):
name = 'plumber_scraper'
allowed_domains = ['yellowpages.com']
start_urls = ['https://www.yellowpages.com/']
def parse(self, response):
#bellow are the area name and state these values will be plued into the start url to find the area in wich we are serchign for plumbers
area = "fairfax"
state = "va"#must be an abreation
#the value bellow is the amount of pages you want to scrape
numer_of_pages = 10
page_numebr = 1
#while numer_of_pages > 0:
url = f'https://www.yellowpages.com/{area}-{state}/plumbers?page={page_numebr}'
#page_numebr += 1
#numer_of_pages -= 1
print('tesssssssssssssssst')
yield response.follow(url, callback=self.parse_plumbers)
def parse_plumbers(self, response):
print('text2')
plumber_item = PlubmerInfo()
print('yes_man')
plumbers = response.css('div.result')
for plumber in plumbers:
starter_indidual_url = plumber.css('a.business-name ::attr(href)').get()
indidual_url = f'https://www.yellowpages.com{starter_indidual_url}'
yield response.follow(indidual_url, callback=self.parse_indvidual_plumbers)
def parse_indvidual_plumbers(self, response):
plumber_item = PlubmerInfo()
print(response.xpath('//*[@id="default-ctas"]/a[3]/span/text()').get())
plumber_item['name'] = response.css('h1.business-name ::text').get()
plumber_item['phone_number'] = response.css('a.phone ::text').get()
plumber_item['website'] = response.css('a.website-link ::attr(href)').get()
plumber_item['genral_info'] = response.css('dd.general-info ::text').get()
plumber_item['payment_mentod'] = response.css('dd.payment ::text').get()
plumber_item['stars_out_of_five'] = response.css('div.rating-stars ::attr(class)').get()
plumber_item['number_of_riews'] = response.css('span.count ::text').get()
plumber_item['locality_'] = response.xpath('//*[@id="default-ctas"]/a[3]/span/text()').get()
plumber_item['street_adress'] = response.css('span.address ::text').get()
#plumber_item['services'] = response.css('div.locality ::text').get()
plumber_item['email'] = response.css('a.email-business ::attr(href)').get()
plumber_item['logo'] = response.css('dd.logo ::attr(href)').get()
yield plumber_item
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class YpScraperPipeline:
def process_item(self, item, spider):
adapter = ItemAdapter(item)
#combindes locality and street adress
locailty_ = adapter.get('locality_')
street_adress = adapter.get('street_adress')
full_address = adapter.get('full_address')
if locailty_ is not None:
for i in street_adress:
full_address += f'{locailty_} {street_adress}'
return item
This will return None
if "full_address" is missing, which causes the problem:
full_address = adapter.get('full_address')
One way to fix it is to tell .get()
to return a blank string instead of None
as a default value if "full_address" is missing:
full_address = adapter.get('full_address', '')