import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
class SunBizSpider(scrapy.Spider):
    """Crawl the Florida Division of Corporations search (search.sunbiz.org).

    Starts from an alphabetical entity-name search, follows the "Next List"
    pagination link, and visits each corporation detail page to yield an item
    with the entity name, a date found on the page, and the page URL.
    """

    name = 'sunbiz'
    start_urls = ['http://search.sunbiz.org/Inquiry/CorporationSearch/SearchResults?inquiryType=EntityName&searchNameOrder=A&searchTerm=a']

    def parse(self, response):
        """Parse a results page: queue the next page and every detail link."""
        # Follow the "Next List" pagination link, if one exists.  The original
        # code joined *all* matched hrefs with ', ' and yielded a request even
        # when no link was found (requesting the bare site root) -- both bugs.
        next_hrefs = response.xpath(
            "//div[@class='navigationBar'][1]//a[@title='Next List']/@href"
        ).extract()
        if next_hrefs:
            yield scrapy.Request(response.urljoin(next_hrefs[0]),
                                 callback=self.parse)

        # Queue every corporation detail page listed in the results table.
        for href in response.css('.large-width a::attr(href)'):
            yield scrapy.Request(response.urljoin(href.extract()),
                                 callback=self.parse_biz)

    def parse_biz(self, response):
        """Parse a corporation detail page into a Name/Date/Link item."""
        # Matches MM-DD-YYYY style dates (separators - : / .), years 1xxx-2xxx.
        re1 = '((?:[0]?[1-9]|[1][012])[-:\\/.](?:(?:[0-2]?\\d{1})|(?:[3][01]{1}))[-:\\/.](?:(?:[1]{1}\\d{1}\\d{1}\\d{1})|(?:[2]{1}\\d{3})))(?![\\d])'  # MMDDYYYY 1
        date = response.xpath('//span').re_first(re1)

        # The original indexed extract()[1] unconditionally, which raises
        # IndexError on a page with fewer than two text nodes; guard so a
        # malformed page yields None for 'Name' instead of crashing the spider.
        names = response.css('.corporationName span::text').extract()
        yield {
            'Name': names[1] if len(names) > 1 else None,
            'Date': date,
            'Link': response.url,
        }
The regular expression would most likely also match records containing the words "INACT" and "CROSS REF". As highlighted above, words such as "INACT", "NAME HS", and "CROSS RF" are the items I want the crawler to check for, and it should do nothing with a record that contains those words.
You could use an XPath selector to check the inner text. For example, to get all the `td` elements whose inner text is exactly "Active", use something like:

response.xpath('//td[text()="Active"]')

The same applies to the other strings. If you only want to match part of the string, you could instead use:

response.xpath('//td[contains(text(), "Activ")]')