Search code examples

How can I make Scrapy check a field and skip following the link?

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

# Scrapy spider; its callbacks below follow listing pages and scrape detail pages.
class SunBizSpider(scrapy.Spider):
# Identifier assigned to the spider's `name` attribute.
name = 'sunbiz'
# Seed URL list; the single entry is an empty placeholder string in this snippet.
start_urls = ['']

def parse(self, response):
    """Crawl one listing page: queue the next page and every detail link.

    Args:
        response: The Scrapy response for a listing page.

    Yields:
        scrapy.Request: one request for the "Next List" page (handled by
        this method again) when such a link exists, followed by one request
        per detail link (handled by ``parse_biz``).
    """
    # Take only the first "Next List" href. On the last page there is none;
    # building a Request from an empty URL raises ValueError, which would
    # abort this generator before any detail link is yielded.
    next_href = response.xpath(
        "//div[@class='navigationBar'][1]//a[@title='Next List']/@href"
    ).extract_first()
    if next_href:
        # urljoin resolves a relative href against the current page URL.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    for href in response.css('.large-width a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.parse_biz)

# Callback for a detail page: yields one item with the name, a date and the page URL.
# NOTE(review): the dict literal below is not closed in this snippet; the rest
# of the method is cut off.
def parse_biz(self, response):
    # Date pattern: month (1-12, optional leading zero), a separator from
    # "-", ":", "/", ".", day (0-31), the same separator set again, then a
    # four-digit year starting with 1 or 2, not followed by another digit.
    re1='((?:[0]?[1-9]|[1][012])[-:\\/.](?:(?:[0-2]?\\d{1})|(?:[3][01]{1}))[-:\\/.](?:(?:[1]{1}\\d{1}\\d{1}\\d{1})|(?:[2]{1}\\d{3})))(?![\\d])' #        MMDDYYYY 1
    # First match of the pattern across all <span> elements on the page.
    date = response.xpath('//span').re_first(re1)
    yield {
        # Second text node under `.corporationName span`; indexing with [1]
        # raises IndexError when fewer than two text nodes are extracted.
        'Name': response.css('.corporationName span::text').extract()[1],
        'Date': date,
        # URL of the page this item was scraped from.
        'Link': response.url,

A regular expression would most likely find the words "inact" and "cross rf". The entries I want the crawler not to follow are highlighted.

As you can see above, I highlighted words such as "inact", "name hs" and "cross rf". These are the values I want the crawler to check for; if an entry has one of them, the crawler should do nothing with it.


  • You could use an XPath selector to check the inner text. For example, to get every td whose inner text is exactly "Active", use something like:

    response.xpath('//td[text()="Active"]')

    The same works for other strings. You could also use:

    response.xpath('//td[contains(text(), "Activ")]')

    if you only want to match part of the string.