python, mysql, scrapy, screen-scraping

Saving Scrapy data to corresponding URL in MySQL


I'm currently working with Scrapy.

I have a list of URLs stored in a MySQL database. A spider visits these URLs and captures two pieces of target information (a score and a count). My goal is that when Scrapy finishes scraping a URL, it automatically populates the corresponding columns of that URL's row before moving on to the next one.

I'm a newbie and I can't seem to get the saving part to work correctly. The score and count reach the database successfully, but they are saved as new rows instead of being associated with the source URL.
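For reference, here is a minimal sketch of the scraped table implied by the queries below; the column types are assumptions, only the url, score, and count column names come from the code:

import MySQLdb

# one-off setup sketch; the real table's column types are unknown
conn = MySQLdb.connect(user='root', passwd='', db='scraper', host='127.0.0.1')
conn.cursor().execute("""
    CREATE TABLE IF NOT EXISTS scraped (
        url   VARCHAR(255),
        score VARCHAR(16),
        count VARCHAR(32)
    )
""")
conn.close()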

Here's my code: amazon_spider.py

import scrapy
from whatoplaybot.items import crawledScore
import MySQLdb

class amazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = []  # unused; start_requests() below builds the requests from MySQL

    def start_requests(self):
        conn = MySQLdb.connect(
                user='root',
                passwd='',
                db='scraper',
                host='127.0.0.1',
                charset="utf8",
                use_unicode=True
                )
        cursor = conn.cursor()
        # fetch the list of URLs to crawl
        cursor.execute('SELECT url FROM scraped;')

        rows = cursor.fetchall()
        conn.close()  # all rows are in memory, so close before yielding any requests

        for row in rows:
            yield self.make_requests_from_url(row[0])

    def parse(self, response):
        item = crawledScore()
        # first regex match for the score; .re() returns a list of matches for the count
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
        yield item
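A compatibility note: make_requests_from_url is deprecated in newer Scrapy releases, so the loop in start_requests can build each request explicitly instead (a sketch, assuming the same parse callback):

for row in rows:
    yield scrapy.Request(url=row[0], callback=self.parse)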

pipelines.py

import MySQLdb

class storeScore(object):
    def __init__(self):
        self.conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped(score, count) VALUES (%s, %s)""", (item['reviewScore'], item['reviewCount']))
            self.conn.commit()

        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])

        # return outside the try/except so the item is passed on even after a DB error
        return item
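As an aside, a pipeline can also manage the connection with Scrapy's open_spider/close_spider hooks rather than __init__, which guarantees the connection is closed when the spider finishes (a sketch reusing the same connection settings):

class storeScore(object):
    def open_spider(self, spider):
        # runs once when the spider starts
        self.conn = MySQLdb.connect(
            user='root', passwd='', db='scraper',
            host='127.0.0.1', charset="utf8", use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # runs once when the spider finishes
        self.conn.close()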

Any help and guidance will be very much appreciated.

Thank you, guys.


Solution

  • Follow these steps:

    Add a reviewURL field to your crawledScore Item:

    import scrapy

    class crawledScore(scrapy.Item):
        reviewScore = scrapy.Field()
        reviewCount = scrapy.Field()
        reviewURL = scrapy.Field()
    

    Save the response URL into item['reviewURL']:

    def parse(self, response):
        item = crawledScore()
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
        item['reviewURL'] = response.url  # keep the source URL alongside the scraped values
        yield item
    

    Finally, in your pipelines file, insert or update depending on your logic:

    INSERT:

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""INSERT INTO scraped(score, count, url) VALUES (%s, %s, %s)""", (item['reviewScore'], item['reviewCount'], item['reviewURL']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])

        return item
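    Note that this plain INSERT appends a new row for every item. If the url column carries a UNIQUE index (an assumption; the table schema isn't shown), MySQL can insert-or-update in a single statement with INSERT ... ON DUPLICATE KEY UPDATE. A sketch:

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                """INSERT INTO scraped(url, score, count) VALUES (%s, %s, %s)
                   ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count)""",
                (item['reviewURL'], item['reviewScore'], item['reviewCount'])
            )
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])

        return item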
    

    UPDATE:

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""UPDATE scraped SET score=%s, count=%s WHERE url=%s""", (item['reviewScore'], item['reviewCount'], item['reviewURL']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])

        return item
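    Whichever variant you choose, make sure the pipeline is enabled in the project's settings.py; the module path below assumes the whatoplaybot project layout implied by the imports:

    # settings.py
    ITEM_PIPELINES = {
        'whatoplaybot.pipelines.storeScore': 300,
    }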