I'm trying to create a custom pipeline for a Scrapy
project that outputs the collected items to CSV files. In order to keep each file's size down I want to set a maximum number of rows that each file can have. Once the line limit has been reached in the current file a new file is created to continue outputting the items.
Luckily, I found a question where someone was looking to do the same thing. And there's an answer to that question that shows an example implementation.
I implemented the example, but tweaked the way the stats are accessed to align with the current version of Scrapy.
from scrapy.exporters import CsvItemExporter
import datetime
class PartitionedCsvPipeline(object):
    """Export items to CSV files, rotating to a new file every 100 items.

    NOTE(review): the row count is tracked via Scrapy's
    'item_scraped_count' stat, which Scrapy's own stats machinery may
    also increment — confirm the stat is not updated elsewhere.
    """

    def __init__(self, stats):
        self.stats = stats
        self.stats.set_value('item_scraped_count', 0)
        self.base_filename = "site_{}.csv"
        # Split after every 100 items; next_split is the next threshold.
        self.next_split = self.split_limit = 100
        self.create_exporter()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the pipeline with access to the stats collector.
        return cls(crawler.stats)

    def create_exporter(self):
        # Timestamp the file name so each rotation gets its own file.
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M")
        self.file = open(self.base_filename.format(stamp), 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # When the threshold is reached, close the current file and start a new one.
        if self.stats.get_value('item_scraped_count') >= self.next_split:
            self.next_split += self.split_limit
            self.exporter.finish_exporting()
            self.file.close()
            self.create_exporter()
        self.exporter.export_item(item)
        self.stats.inc_value('item_scraped_count')
        return item
The pipeline does result in multiple files being output, but the files all have only 50 items instead of the 100 that's expected.
What am I doing wrong that's making the files half the size that's expected?
When in process_item()
I add
print('>>> stat count:', self.stats.get_value('item_scraped_count'))
and remove
self.stats.inc_value('item_scraped_count')
then I can see that this value still increases.
That means Scrapy's own machinery already counts the scraped items, so you shouldn't increment this stat yourself.
If I keep inc_value()
then every item gets counted twice.
Since it isn't guaranteed that this stat counts only the items that reach this pipeline, it's safer to use a separate variable for the count:
class PartitionedCsvPipeline(object):

    def __init__(self, stats):
        # Track items written with the pipeline's own counter instead of
        # Scrapy's 'item_scraped_count' stat.
        self.count = 0
        # ... code ...

    def process_item(self, item, spider):
        print('>>> count:', self.count)
        if self.count >= self.next_split:
            pass  # ... code ...
        # ... code ...
        self.count += 1
        return item
EDIT:
The pipeline also needs this method to close the last file and flush all remaining data into it.
def close_spider(self, spider):
    # Called by Scrapy when the spider finishes; closing the file
    # flushes any buffered rows out to disk.
    self.file.close()
Minimal working example.
I put everything in one file so it can be run with python script.py
without creating a project; this way everyone can easily test it.
Because only 10 items are written to each file, new files were created so quickly that I had to add microseconds (%f) to the filename to keep the names unique.
import scrapy
from scrapy.exporters import CsvItemExporter
import datetime
class MySpider(scrapy.Spider):
    """Spider that yields one item per image URL found on the start page."""

    name = 'myspider'

    # Practice site made for scraping: http://toscrape.com/
    start_urls = ['http://books.toscrape.com/']  # alternative: 'http://quotes.toscrape.com']

    def parse(self, response):
        print('url:', response.url)
        # Yield one item per <img src=...> found on the page.
        for src in response.css('img::attr(src)').extract():
            yield {'image_urls': [response.urljoin(src)], 'session_path': 'hello_world'}
class PartitionedCsvPipeline(object):
    """Export items to CSV, starting a fresh file every `split_limit` items."""

    def __init__(self, stats):
        self.filename = "site_{}.csv"
        self.split_limit = 10
        # Items written to the current file; reset on every rotation.
        self.count = 0
        self.create_exporter()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to construct the pipeline.
        return cls(crawler.stats)

    def create_exporter(self):
        # %f (microseconds) is required because the next file can be
        # created less than one second after the previous one, which
        # would otherwise produce a duplicate name.
        stamp = datetime.datetime.now().strftime("%Y.%m.%d-%H.%M.%S.%f")
        self.file = open(self.filename.format(stamp), 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def finish_exporter(self):
        # Flush the exporter and close the underlying file.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Rotate to a new file once the current one is full.
        if self.count >= self.split_limit:
            self.finish_exporter()
            self.count = 0
            self.create_exporter()
        self.exporter.export_item(item)
        self.count += 1
        print('self.count:', self.count)
        return item

    def close_spider(self, spider):
        # Close the last file so its buffered rows reach the disk.
        self.finish_exporter()
# --- run without a project and write the partitioned CSV files ---
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # Use the pipeline defined in this very file (hence '__main__.').
    'ITEM_PIPELINES': {'__main__.PartitionedCsvPipeline': 1},
})
process.crawl(MySpider)
process.start()