Currently working with Scrapy.
I have a list of URLs stored in a MySQL database. A spider visits these URLs and captures two target pieces of information (score and count). My goal is that when Scrapy finishes scraping a page, it automatically populates the corresponding columns before it moves on to the next URL.
I'm a newbie and I can't seem to get the saving part to work correctly. The score and count are passed on to the database successfully, but they are saved as new rows instead of being associated with the source URL.
Here's my code: amazon_spider.py
import scrapy
from whatoplaybot.items import crawledScore
import MySQLdb
class amazonSpider(scrapy.Spider):
    """Spider that visits URLs stored in the MySQL `scraped` table and
    extracts the review score and review count from each page.

    Fixes vs. the original:
    - `parse` was defined twice; the first definition (which only printed
      `self.start_urls`) was dead code, silently overridden by the second.
      It has been removed.
    - The DB connection was closed only after the request loop finished, so
      it leaked if the query raised or the generator was never exhausted.
      Rows are now fetched eagerly and the connection is closed in
      `finally` before any request is yielded.
    """

    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = []

    def start_requests(self):
        """Read every URL from the `scraped` table and request each one."""
        conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True,
        )
        try:
            cursor = conn.cursor()
            cursor.execute('SELECT url FROM scraped;')
            # fetchall() materializes the rows, so the connection can be
            # closed before the (lazy) request generation below.
            rows = cursor.fetchall()
        finally:
            conn.close()
        for row in rows:
            yield self.make_requests_from_url(row[0])

    def parse(self, response):
        """Extract the review score and review count from a product page."""
        item = crawledScore()
        # NOTE(review): indexing [0] raises IndexError if the XPath/regex
        # matches nothing — acceptable here only if every page has a rating.
        item['reviewScore'] = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")[0]
        item['reviewCount'] = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
        yield item
pipelines.py
import sys
import MySQLdb
class storeScore(object):
    """Item pipeline that persists scraped review data into MySQL.

    A single connection and cursor are opened once and reused for every
    item processed during the crawl.
    """

    def __init__(self):
        # One long-lived connection for the whole pipeline lifetime.
        self.conn = MySQLdb.connect(
            user='root',
            passwd='',
            db='scraper',
            host='127.0.0.1',
            charset="utf8",
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item's score/count; log MySQL errors and continue."""
        statement = """INSERT INTO scraped(score, count) VALUES (%s, %s)"""
        values = (item['reviewScore'], item['reviewCount'])
        try:
            self.cursor.execute(statement, values)
            self.conn.commit()
        except MySQLdb.Error as e:
            # Log and keep going so one bad row does not abort the crawl.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
Any help and guidance will be very much appreciated.
Thank you, guys.
Follow these steps:
Add a reviewURL field into your crawledScore Item:
class crawledScore(scrapy.Item):
    """Item holding one scraped review observation plus its source URL."""

    # URL of the page the values were scraped from; lets the pipeline
    # associate the row with the original database record.
    reviewURL = scrapy.Field()
    # Average review score extracted from the page (digits, commas, dots).
    reviewScore = scrapy.Field()
    # Review count extracted from the page.
    reviewCount = scrapy.Field()
Save response url into item['reviewURL']:
def parse(self, response):
    """Extract review score, review count and the source URL into an item.

    Fixes vs. the original:
    - `[0]` was applied unconditionally to the score matches, raising
      IndexError on pages where the XPath/regex finds nothing; both fields
      now fall back to None.
    - `reviewCount` was stored as the whole match list; it is now the first
      match (a scalar), so it binds cleanly as a single SQL parameter in
      the pipeline.
    """
    item = crawledScore()
    scores = response.xpath('//*[@id="avgRating"]/span/a/span/text()').re("[0-9,.]+")
    item['reviewScore'] = scores[0] if scores else None
    counts = response.xpath('//*[@id="summaryStars"]/a/text()').re("[0-9,]+")
    item['reviewCount'] = counts[0] if counts else None
    # Carry the page URL so the pipeline can key the UPDATE on it.
    item['reviewURL'] = response.url
    yield item
Finally, on your pipelines file, insert or update depending on your logic:
INSERT:
def process_item(self, item, spider):
    """Insert score, count and source URL for one scraped item."""
    values = (item['reviewScore'], item['reviewCount'], item['reviewURL'])
    try:
        self.cursor.execute(
            """INSERT INTO scraped(score, count, url) VALUES (%s, %s, %s)""",
            values,
        )
        self.conn.commit()
    except MySQLdb.Error as e:
        # Log and keep going so one bad row does not abort the crawl.
        print("Error %d: %s" % (e.args[0], e.args[1]))
    return item
UPDATE:
def process_item(self, item, spider):
    """Update the row keyed by the item's URL with freshly scraped values."""
    try:
        self.cursor.execute(
            """UPDATE scraped SET score=%s, count=%s WHERE url=%s""",
            (item['reviewScore'], item['reviewCount'], item['reviewURL']),
        )
        self.conn.commit()
    except MySQLdb.Error as e:
        # Report the error but still return the item to downstream pipelines.
        print("Error %d: %s" % (e.args[0], e.args[1]))
    return item