items.py file. I was already aware of the image_urls and images fields, so they didn't cause any problem.
import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_name = scrapy.Field(
        output_processor=TakeFirst()
    )
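For anyone unfamiliar with TakeFirst: it is an output processor that returns the first non-empty value the loader collected, so book_name ends up as a single string instead of a list. A quick standalone illustration (the values are made up):

from scrapy.loader.processors import TakeFirst

proc = TakeFirst()
print(proc(['A Light in the Attic', 'ignored']))  # 'A Light in the Attic'
print(proc([None, '', 'fallback']))               # skips None and '' -> 'fallback'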
pipelines.py file. I think there must be a problem in the get_media_requests method, as it is not fetching the book_name from the items file.
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request

class BooksToScrapeImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # I think the problem is in this line
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        return 'full/%s.jpg' % request.meta['bookname']
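One way to verify that hypothesis is to log what the pipeline actually receives. This is a debugging sketch with a hypothetical class name, not part of the project; if the log shows a real title, get_media_requests is fetching book_name fine and the problem lies elsewhere:

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class DebugBooksImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # info.spider gives access to the running spider and its logger
        info.spider.logger.info('book_name seen by pipeline: %r', item.get('book_name'))
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]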
The spider file which I used for scraping. It worked when I didn't customize the pipeline file.
import scrapy
from scrapy.loader import ItemLoader
from books_to_scrape.items import BooksToScrapeItem

class ImgscrapeSpider(scrapy.Spider):
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls', abs_url)
            loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")
            yield loader.load_item()
Your problem is in the relative XPath:

loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")

The loader uses xpath("//article[@class='product_pod']") as its selector:

for article in response.xpath("//article[@class='product_pod']"):
    loader = ItemLoader(item=BooksToScrapeItem(), selector=article)

so all relative XPaths are resolved relative to "//article[@class='product_pod']" and must not repeat "//article[@class='product_pod']".
Because the relative XPath ".//article[@class='product_pod']/h3/a/text()" looks for an <article> nested inside the <article>, it couldn't find the title, so book_name was empty for all items. Every item therefore got None as its title, and all images were saved under the same name, None.jpg.
It has to be:

loader.add_xpath('book_name', ".//h3/a/text()")  # title, truncated with `...`
BTW: text() doesn't contain the full title, only a version truncated with .... To get the full title you have to read the title= attribute:

loader.add_xpath('book_name', ".//h3/a/@title")  # full title
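You can see both effects without running the spider. A small standalone check; the HTML snippet is a simplified stand-in for the real page markup:

from scrapy.selector import Selector

html = '''
<article class="product_pod">
  <h3><a title="A Light in the Attic">A Light in the ...</a></h3>
</article>
'''

# position the selector at <article>, exactly like the loader does in the loop
article = Selector(text=html).xpath("//article[@class='product_pod']")[0]

# repeats the <article> step -> nothing found, extract_first() gives None
print(article.xpath(".//article[@class='product_pod']/h3/a/text()").extract_first())
# correct relative XPath, but text() is the truncated title
print(article.xpath(".//h3/a/text()").extract_first())   # 'A Light in the ...'
# the title= attribute holds the full title
print(article.xpath(".//h3/a/@title").extract_first())   # 'A Light in the Attic'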
I created a version with all the code in one file so it runs without creating a project. Anyone can copy it into a single file and run it to test.
import scrapy
from scrapy.loader.processors import TakeFirst

class BooksToScrapeItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    book_name = scrapy.Field(
        output_processor=TakeFirst()
    )

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class BooksToScrapeImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # I think the problem is in this line
        return [Request(x, meta={'bookname': item.get('book_name')})
                for x in item.get(self.images_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        return 'full/%s.jpg' % request.meta['bookname']

from scrapy.loader import ItemLoader

class ImgscrapeSpider(scrapy.Spider):
    name = 'imgscrape'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com']

    def parse(self, response):
        for article in response.xpath("//article[@class='product_pod']"):
            loader = ItemLoader(item=BooksToScrapeItem(), selector=article)
            relative_url = article.xpath(".//div/a/img[@class='thumbnail']/@src").extract_first()
            abs_url = response.urljoin(relative_url)
            loader.add_value('image_urls', abs_url)
            #loader.add_xpath('book_name', ".//article[@class='product_pod']/h3/a/text()")  # wrong relative XPath
            #loader.add_xpath('book_name', ".//h3/a/text()")  # only partial title
            loader.add_xpath('book_name', ".//h3/a/@title")  # full title
            yield loader.load_item()

# -----------------------------------------------------------------------------

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save results to a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',
    # download images to `IMAGES_STORE/full` (the standard folder) and convert them to JPG (even if they already are JPG);
    # this needs `image_urls` yielded in `parse()` and both ITEM_PIPELINES and IMAGES_STORE to work
    'ITEM_PIPELINES': {'__main__.BooksToScrapeImagePipeline': 1},  # use the pipeline defined in the current file (needs `__main__`)
    'IMAGES_STORE': '.',  # this folder has to exist before downloading
})
c.crawl(ImgscrapeSpider)
c.start()
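To test it, copy the whole listing into a single file (any name works, e.g. imgscrape.py) and run it with a plain python imgscrape.py. Scrapy writes the scraped rows to output.csv and, with the fixed XPath, saves each cover as full/<book title>.jpg in the current directory instead of one overwritten None.jpg.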