Search code examples
python iterator generator

How to read an iterable page by page?


I've tried a bunch of things to be able to read items page by page without loading each page in a list and returning that, which could take too much memory on big pages. I'd like to avoid getting a big list of items just to have to scan the list again to do the post-processing of each item.

So either I get generators that will keep returning empty data and fill the pages list with an infinite number of empty lists (when using page_from_iterable2), or I just get the first page (like in page_from_iterable1).

Any hint on what I am doing wrong?

Thanks.

from typing import Iterable, Iterator


def read_paginated_items(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Lazily yield at most ``page_size`` items pulled from ``it``.

    The iterator is advanced in place, so a later call that receives the
    same iterator continues where this one stopped. If ``it`` runs out
    before the page is full, the generator simply finishes early.
    """
    served = 0
    while served < page_size:
        try:
            yield next(it)
        except StopIteration:
            # The source ran dry before the page filled up.
            return
        served += 1


def page_from_iterable1(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield successive lazy pages of up to ``page_size`` items each.

    Each page is itself an iterator over the underlying data, so no page
    is ever materialised as a list by this function. Pages are produced
    until ``iterable`` is exhausted; the last page may be shorter, and an
    empty ``iterable`` produces no pages at all.

    Args:
        iterable: The source of items to paginate.
        page_size: Maximum number of items per page; must be at least 1.

    Yields:
        One iterator per page, in order.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    from itertools import chain, islice

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    it = iter(iterable)
    # Pulling the first item of each page up front is what tells us whether
    # another page exists: a page generator is always truthy, so it cannot
    # be used to detect exhaustion.
    for first in it:
        page = chain((first,), islice(it, page_size - 1))
        yield page
        # If the caller did not read the whole page, skip what is left so
        # the next page starts on a page boundary.
        for _ in page:
            pass


def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterable:
    """Yield successive lazy pages of up to ``page_size`` items each.

    Each page is an iterator over the underlying data rather than a list.
    Iteration stops once ``iterable`` is exhausted, so no trailing empty
    pages are produced; the last page may be shorter than ``page_size``.

    Args:
        iterable: The source of items to paginate.
        page_size: Maximum number of items per page; must be at least 1.

    Yields:
        One iterator per page, in order.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    from itertools import chain, islice

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    it = iter(iterable)
    # A unique sentinel distinguishes "source exhausted" from any real item,
    # including falsy ones such as 0, None or "".
    exhausted = object()
    # A generator object is always truthy, so the loop condition has to
    # look at an actual item instead of at the page generator itself.
    while (first := next(it, exhausted)) is not exhausted:
        page = chain((first,), islice(it, page_size - 1))
        yield page
        # Skip anything the caller left unread so pages stay aligned.
        for _ in page:
            pass
    

def test_read_by_page():
    """Check that five items read two at a time come back as three pages."""
    # Each lazy page is turned into a list before the next page is requested.
    pages = [
        list(page)
        for page in page_from_iterable1([1, 2, 3, 4, 5], 2)
    ]

    # Pages must partition the input in order: no overlap and no gaps.
    assert pages == [[1, 2], [3, 4], [5]]

Solution

  • Thanks to everyone's help, here's what I came up with:

    from typing import Iterator
    
    import pytest
    
    
    class PageItems:
        """Iterator over at most ``page_size`` items of a shared iterator.

        The wrapped iterator is advanced in place, so building another
        ``PageItems`` on the same iterator continues with the items that
        follow.
        """

        def __init__(
            self,
            iterator: Iterator,
            page_size: int,
        ):
            # Building the generator consumes nothing; items are pulled
            # from ``iterator`` only when this object is advanced.
            self.items_generator = self._create_items_generator(iterator, page_size)

        @staticmethod
        def _create_items_generator(
            iterator: Iterator,
            page_size: int,
        ):
            """Yield up to ``page_size`` items, finishing early if the source ends."""
            produced = 0
            while produced < page_size:
                try:
                    yield next(iterator)
                except StopIteration:
                    # Source exhausted before the page was full.
                    return
                produced += 1

        def __iter__(self):
            """Return the object itself, as required of an iterator."""
            return self

        def __next__(self):
            """Return the next item of the page; StopIteration ends the page."""
            return next(self.items_generator)
    
    
    def test_read_one_page():
        """A page of size 3 yields the first three items and then stops."""
        page_items = PageItems(iter([1, 2, 3, 4, 5]), 3)

        for expected in (1, 2, 3):
            assert next(page_items) == expected

        # The page is full: a fourth read must signal the end of the page,
        # even though the source still has items left.
        with pytest.raises(StopIteration):
            next(page_items)
    
    
    def test_read_pages():
        """Reading five items two at a time gives two full pages and one partial."""
        source = iter([1, 2, 3, 4, 5])
        collected = []

        # An exhausted source produces an empty list, which ends the loop.
        while batch := list(PageItems(source, 2)):
            collected += [batch]

        expected = [[1, 2], [3, 4], [5]]
        assert collected == expected
    
    def test_read_pages_modified_items():
        """Items can be transformed while a page is read, without an extra pass."""
        source = iter([(1, "A"), (2, "B"), (3, "C"), (4, "D"), (5, "E")])
        collected = []

        # Keep only the first element of each tuple; an empty page ends the loop.
        while batch := [pair[0] for pair in PageItems(source, 2)]:
            collected += [batch]

        expected = [[1, 2], [3, 4], [5]]
        assert collected == expected
    

    I won't be able to use a for loop over PageItems, as it will always spit out empty pages at the end, but with a while loop I can check for emptiness without having to resort to an ugly if/break block. This also allows me to either call list(PageItems(iterator, 2)) if I just need the items without modifications, or [item[0] for item in PageItems(iterator, 2)] if, for example, the items returned are tuples and I just want the first element.