I am trying to scrape data using Scrapy. I get the textual data as needed, but when I scrape the image `src` attribute, it returns the exact URL for the first records, and after some records it returns:
"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
Here is my Spider Code
import scrapy
class CoinmarketcapSpider(scrapy.Spider):
    """Spider that reads the coin table on coinmarketcap.com.

    For every row of ``table#currencies`` it yields the coin name, the logo
    image URL list, the ticker symbol and the market cap.
    """

    name = 'coinmarketcap'
    allowed_domains = ['coinmarketcap.com']
    start_urls = ['https://coinmarketcap.com/2']

    def parse(self, response):
        """Yield one dict per row of the currencies table.

        Each dict has the keys ``Name``, ``image_urls``, ``symbol`` and
        ``market_cap``. Text fields fall back to an empty string when the
        corresponding element is missing from a row.
        """
        # The leading "." keeps the XPath relative to the selected table;
        # a bare "//tbody/tr" would match rows anywhere in the document.
        rows = response.css('table#currencies').xpath('.//tbody/tr')
        for row in rows:
            # default='' avoids calling .strip() on None for incomplete rows.
            name = row.css(
                'a.currency-name-container::text'
            ).extract_first(default='').strip()
            # NOTE: this reads the raw ``src`` attribute. Per the report that
            # accompanies this spider, rows after the first few return a
            # base64 "data:image/gif" URI here instead of an image URL.
            logo = row.css('img.logo-sprite::attr(src)').extract()
            symbol = row.css('span.currency-symbol').xpath(
                './/a/text()'
            ).extract_first(default='').strip()
            market_cap = row.css('.market-cap').xpath(
                'text()'
            ).extract_first(default='').strip()
            yield {
                'Name': name,
                'image_urls': logo,
                'symbol': symbol,
                'market_cap': market_cap,
            }
        # Function-call form works on both Python 2 and Python 3.
        print(response)
Here is the output JSON file, which contains the imgLogo URLs:
[ {"coinName": ["Bitcoin"], "symbol": ["BTC"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/1.png"]}, {"coinName": ["Ethereum"], "symbol": ["ETH"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/1027.png"]}, {"coinName": ["XRP"], "symbol": ["XRP"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/52.png"]}, {"coinName": ["Bitcoin Cash"], "symbol": ["BCH"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/1831.png"]}, {"coinName": ["EOS"], "symbol": ["EOS"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/1765.png"]}, {"coinName": ["Stellar"], "symbol": ["XLM"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/512.png"]}, {"coinName": ["Litecoin"], "symbol": ["LTC"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/2.png"]}, {"coinName": ["Tether"], "symbol": ["USDT"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/825.png"]}, {"coinName": ["Cardano"], "symbol": ["ADA"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/2010.png"]}, {"coinName": ["Monero"], "symbol": ["XMR"], "imgLogo": ["https://s2.coinmarketcap.com/static/img/coins/16x16/328.png"]}, {"coinName": ["IOTA"], "symbol": ["MIOTA"], "imgLogo": ["data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="]}, {"coinName": ["TRON"], "symbol": ["TRX"], "imgLogo": ["data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="]}, {"coinName": ["Dash"], "symbol": ["DASH"], "imgLogo": ["data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="]}]
Here is my Items.py code
import scrapy
class CmindexItem(scrapy.Item):
    """Item holding the image fields used by the images pipeline."""

    # define the fields for your item here like:
    # URLs of the images to download for this item.
    image_urls = scrapy.Field()
    # Paths of the downloaded images; CmindexPipeline.item_completed assigns
    # this key, which would raise KeyError if the field were not declared.
    image_paths = scrapy.Field()
Here is my pipelines.py code, which downloads the images and saves them in my images directory:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class CmindexPipeline(ImagesPipeline):
    """Images pipeline that downloads each URL in ``image_urls``.

    The paths of the successfully downloaded images are stored on the item
    under ``image_paths``; items with no successful download are dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one request per URL listed in the item's ``image_urls``.

        An item without ``image_urls`` (or with an empty/None value) yields
        no requests instead of raising ``KeyError``; ``item_completed`` then
        drops it like any other item that ends up with no images.
        """
        for image_url in item.get('image_urls') or []:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Record the downloaded image paths on the item and return it.

        ``results`` is iterated as ``(ok, value)`` pairs; only the ``path``
        of pairs whose ``ok`` flag is truthy is kept.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
Here is my settings.py code
# Scrapy settings for the "cmindex" project.
BOT_NAME = 'cmindex'

# Modules that hold the project's spiders.
SPIDER_MODULES = ['cmindex.spiders']
NEWSPIDER_MODULE = 'cmindex.spiders'

USER_AGENT = 'cmindex (+http://www.cmindex.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Route every item through the image-downloading pipeline.
ITEM_PIPELINES = {'cmindex.pipelines.CmindexPipeline': 1}

# Raw string: the Windows path contains backslashes ("\W", "\c", "\i") that
# are invalid escape sequences in a normal literal. The value is unchanged.
IMAGES_STORE = r'E:\WorkPlace\cmindex\cmindex\img'

# Swap the built-in user-agent middleware (mapped to None) for the
# random user-agent middleware from scrapy_fake_useragent.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
I have added fake user agents and also added a delay between requests, but neither affected the end result. If anyone has an idea, please share. Thanks.
Please remove the following from your settings.py:
# Custom user-agent string sent by the bot.
USER_AGENT = 'cmindex (+http://www.cmindex.com)'

# Middleware overrides: the built-in user-agent middleware is mapped to None
# and the random user-agent middleware is registered at priority 400.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
}
Also, stop obeying robots.txt:
# Do not obey robots.txt rules.
ROBOTSTXT_OBEY = False
Please also make the following changes in your spider file. After viewing the page source of coinmarketcap, I found that after the first 10 rows the `src` of each image is a base64 data URI placeholder rather than a real URL. So simply read the image's `data-src` attribute instead of `src`.
import scrapy
class CoinmarketcapSpider(scrapy.Spider):
    """Spider that reads the coin table on coinmarketcap.com.

    For every row of ``table#currencies`` it yields the coin name, the logo
    image URL list, the ticker symbol and the market cap.
    """

    name = 'coinmarketcap'
    allowed_domains = ['coinmarketcap.com']
    start_urls = ['https://coinmarketcap.com/']

    def parse(self, response):
        """Yield one dict per row of the currencies table.

        Each dict has the keys ``Name``, ``image_urls``, ``symbol`` and
        ``market_cap``. Text fields fall back to an empty string when the
        corresponding element is missing from a row.
        """
        # The leading "." keeps the XPath relative to the selected table;
        # a bare "//tbody/tr" would match rows anywhere in the document.
        rows = response.css('table#currencies').xpath('.//tbody/tr')
        for row in rows:
            # default='' avoids calling .strip() on None for incomplete rows.
            name = row.css(
                'a.currency-name-container::text'
            ).extract_first(default='').strip()
            # Prefer ``data-src``: on later rows ``src`` only holds a base64
            # placeholder. Fall back to ``src`` when ``data-src`` is absent.
            logo_data = row.css('img.logo-sprite::attr(data-src)').extract()
            logo_src = row.css('img.logo-sprite::attr(src)').extract()
            logo = logo_data or logo_src
            symbol = row.css('span.currency-symbol').xpath(
                './/a/text()'
            ).extract_first(default='').strip()
            market_cap = row.css('.market-cap').xpath(
                'text()'
            ).extract_first(default='').strip()
            yield {
                'Name': name,
                'image_urls': logo,
                'symbol': symbol,
                'market_cap': market_cap,
            }
        print(response)