Search code examples
python web-scraping scrapy craigslist

Scrapy JSON export issues


I have been following some tutorials online to use Scrapy to scrape Craigslist for emails. I have this code, but when I run the command and export to a JSON file, it creates the file, yet the only thing in it is a single '['.

Any help would be greatly appreciated. Below is my code:

from scrapy.spider import BaseSpider 
from scrapy.selector import HtmlXPathSelector 
from scrapy_demo.items import ScrapyDemoItem
import urlparse 
from scrapy.http.request import Request

class ScrapyDemoSpider(BaseSpider): 
    """Spider "scrapy_demo": start from the Buffalo Craigslist ``cps`` search
    page, request every listing found there, and emit one item per listing.

    NOTE(review): the XPath expressions in this snippet are elided
    placeholders (``//....``); as written they cannot match real page
    elements -- confirm the real expressions before relying on any of this.
    """
    name = "scrapy_demo"
    allowed_domains = ["buffalo.craigslist.org"]
    start_urls = ['http://buffalo.craigslist.org/search/cps/']

    def parse(self, response):
        """Handle a search-results page.

        Yields a Request per listing link (callback: parse_listing_page,
        with a partially filled item passed in ``meta``) and a Request for
        the next results page (callback: parse).
        """
        hxs = HtmlXPathSelector(response) 
        listings = hxs.select('//....') 
        links = []

        # Collect the href of every listing node on the results page.
        # NOTE(review): [0] presumably raises IndexError if extract()
        # returns an empty list for a listing -- verify.
        for listing in listings:
            link = listing.select('..../@href').extract()[0]
            links.append(link)

        # Request each listing page; the item carrying the (relative) link is
        # handed to the callback through the request's meta dict.
        for link in links:
            item = ScrapyDemoItem()
            item['link'] = link
            yield Request(urlparse.urljoin(response.url, link), meta={'item': item}, callback=self.parse_listing_page)

            # Follow the "next" button link.
            # NOTE(review): this section is indented inside the ``for link``
            # loop, so it runs once per listing rather than once per page,
            # and the [0] index happens before the ``if next_page`` check --
            # confirm whether that is intended.
            next_page = hxs.select("//..../@href").extract()[0]
            if next_page:
                yield Request(urlparse.urljoin(response.url, next_page), self.parse)

    def parse_listing_page(self, response):
        """Handle a single listing page.

        Retrieves the item stored in the originating request's meta, fills
        in its 'title' and 'content' fields with the first text match of the
        respective (elided) XPath expressions, and yields it.
        """
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        item['title'] = hxs.select('//..../text()').extract()[0]
        item['content'] = hxs.select('//..../text()').extract()[0]
        yield item

Solution

  • Multiple issues here.

    The main problem is the invalid XPath expressions inside the select() calls.

    Aside from that:

    • use response.xpath() or response.css(); there is no need for HtmlXPathSelector anymore
    • there is no need to instantiate an Item in the parse() callback and pass it in meta; get the URL from response.url in the parse_listing_page() callback instead

    Improved working code:

    import urlparse
    
    from scrapy.spider import BaseSpider
    from scrapy.http.request import Request
    
    from scrapy_demo.items import ScrapyDemoItem
    
    
    class ScrapyDemoSpider(BaseSpider):
        """Crawl the Buffalo Craigslist ``cps`` search results.

        Every listing linked from a results page is requested and turned into
        a ScrapyDemoItem with 'link', 'title' and 'content' fields; the "next"
        link is followed so that subsequent result pages are crawled as well.
        """
        name = "scrapy_demo"
        allowed_domains = ["buffalo.craigslist.org"]
        start_urls = ['http://buffalo.craigslist.org/search/cps/']
    
        def parse(self, response):
            """Handle a search-results page.

            Yields one Request per listing (callback: parse_listing_page) and,
            when a "next" link is present, one Request for the following
            results page (callback: parse).
            """
            # Selecting the @href attribute nodes directly means an anchor
            # without an href is simply skipped instead of raising IndexError.
            for link in response.css('p.row > a[data-id]').xpath('@href').extract():
                yield Request(urlparse.urljoin(response.url, link), callback=self.parse_listing_page)
    
            # Follow pagination only when a "next" link exists on this page.
            next_page = response.xpath('//a[contains(@class, "next")]/@href').extract()
            if next_page:
                yield Request(urlparse.urljoin(response.url, next_page[0]), callback=self.parse)
    
        def parse_listing_page(self, response):
            """Handle a single listing page and yield the scraped item.

            'link' is the URL of the response itself; 'title' and 'content'
            are the stripped first text match of the page title and of the
            posting body section, or '' when the page has no such text.
            """
            item = ScrapyDemoItem()
            item['link'] = response.url
            item['title'] = self._first_text(response, '//title/text()')
            item['content'] = self._first_text(response, '//section[@id="postingbody"]/text()')
            yield item
    
        @staticmethod
        def _first_text(response, query):
            """Return the stripped first result of XPath ``query``, or ''.

            Guards against pages where the expression matches nothing, which
            would otherwise raise IndexError and lose the whole item.
            """
            matches = response.xpath(query).extract()
            return matches[0].strip() if matches else ''
    

    If you run the spider, then in the output JSON file, you would see:

    [
        {"content": "Using a web cam with your computer to video communicate with your loved ones has never been made easier and it's free (providing you have an Internet connection).  With the click of a few buttons, you are sharing your live video and audio with the person you are communicating with. It's that simple.  When you are seeing and hearing your grand kids live across the country or halfway around the world, web camming is the next best thing to being there!", "link": "http://buffalo.craigslist.org/cps/4784390462.html", "title": "Web Cam With Your Computer With Family And Friends"},
        {"content": "Looking to supplement or increase your earnings?", "link": "http://buffalo.craigslist.org/cps/4782757517.html", "title": "1k in 30 Day's"},
        {"content": "Like us on Facebook: https://www.facebook.com/pages/NFB-Systems/514380315268768", "link": "http://buffalo.craigslist.org/cps/4813039886.html", "title": "NFB SYSTEMS COMPUTER SERVICES + WEB DESIGNING"},
        {"content": "Like us on Facebook: https://www.facebook.com/pages/NFB-Systems/514380315268768", "link": "http://buffalo.craigslist.org/cps/4810219714.html", "title": "NFB Systems Computer Repair + Web Designing"},
        {"content": "I can work with you personally and we design your site together (no outsourcing or anything like that!) I'll even train you how to use your brand new site. (Wordpress is really easy to use once it is setup!)", "link": "http://buffalo.craigslist.org/cps/4792628034.html", "title": "I Make First-Class Wordpress Sites with Training"},
        ...
    ]