I've tried a bunch of things to be able to read items page by page without loading each page in a list and returning that, which could take too much memory on big pages. I'd like to avoid getting a big list of items just to have to scan the list again to do the post-processing of each item.
So either I get generators that keep returning empty data and fill the `pages`
list with an infinite number of empty lists (when using `page_from_iterable2`),
or I just get the first page (as in `page_from_iterable1`).
Any hint on what I am doing wrong?
Thanks.
from typing import Iterable, Iterator
def read_paginated_items(
    it: Iterator,
    page_size: int,
) -> Iterable:
    """Yield at most ``page_size`` items pulled from ``it``.

    Items are taken directly from the iterator that is passed in, so every
    item yielded here is consumed from ``it`` as well.  If ``it`` runs out
    before ``page_size`` items were produced, the page simply ends early.
    """
    for _slot in range(page_size):
        try:
            item = next(it)
        except StopIteration:
            # Source exhausted before the page was full: end the page quietly.
            break
        else:
            yield item
def page_from_iterable1(
    iterable: Iterable,
    page_size: int,
) -> Iterator[Iterator]:
    """Lazily split ``iterable`` into consecutive pages of ``page_size`` items.

    Each page is itself a lazy iterator, so no page is ever materialised as a
    list by this function.  The last page may hold fewer than ``page_size``
    items, and an empty ``iterable`` produces no pages at all.

    Every page reads from the same underlying iterator, so a page should be
    consumed before the next one is requested; items of a page that the
    caller left unread are skipped when the next page is requested.

    Raises:
        ValueError: if ``page_size`` is smaller than 1 (raised when the first
            page is requested, since this is a generator).
    """
    from itertools import chain, islice

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    it = iter(iterable)
    # Pulling the first item of each page up front is what tells us whether
    # another page exists; a page generator alone can never signal that.
    for first in it:
        page = chain((first,), islice(it, page_size - 1))
        yield page
        # Drain what the caller did not read so the next page starts at the
        # correct offset.
        for _ in page:
            pass
def page_from_iterable2(
    iterable: Iterable,
    page_size: int,
) -> Iterator[Iterator]:
    """Lazily split ``iterable`` into consecutive pages of ``page_size`` items.

    Each page is a lazy iterator; nothing is collected into a list here.  The
    last page may be shorter than ``page_size`` and an empty ``iterable``
    produces no pages.

    All pages share one underlying iterator, so a page should be consumed
    before asking for the next one; items the caller left unread in a page
    are skipped when the next page is requested.

    Raises:
        ValueError: if ``page_size`` is smaller than 1 (raised when the first
            page is requested, since this is a generator).
    """
    from itertools import chain, islice

    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    it = iter(iterable)
    exhausted = object()
    # A generator object is always truthy, so it cannot serve as the loop
    # condition.  Probe for a real first item instead and stop when the
    # source hands back the sentinel.
    while (first := next(it, exhausted)) is not exhausted:
        page = chain((first,), islice(it, page_size - 1))
        yield page
        # Drain what the caller did not read so the next page starts at the
        # correct offset.
        for _ in page:
            pass
def test_read_by_page():
    """Paging five items two at a time must give pages of 2, 2 and 1 items."""
    pages = []
    for page in page_from_iterable1([1, 2, 3, 4, 5], 2):
        # Each page is materialised here only so the result can be compared.
        pages.append(list(page))
    # Every item appears exactly once and in order across the pages.
    assert pages == [[1, 2], [3, 4], [5]]
Thanks to everyone's help, here's what I came up with:
from typing import Iterator
import pytest
class PageItems:
    """Iterator over a single page: at most ``page_size`` items of ``iterator``.

    The items are pulled from the iterator handed to the constructor, so
    building several ``PageItems`` on the same iterator reads it page after
    page.  Once the page is finished (full, or the source ran out), ``next``
    raises ``StopIteration``.
    """

    def __init__(
        self,
        iterator: Iterator,
        page_size: int,
    ):
        # The bounded generator does the real work; __next__ only delegates.
        self.items_generator = self._create_items_generator(iterator, page_size)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.items_generator)

    @staticmethod
    def _create_items_generator(
        iterator: Iterator,
        page_size: int,
    ):
        """Yield up to ``page_size`` items from ``iterator``, ending early if it runs out."""
        for _slot in range(page_size):
            try:
                item = next(iterator)
            except StopIteration:
                # Source exhausted: close the page without raising.
                break
            else:
                yield item
def test_read_one_page():
    """A three-item page over five items yields 1, 2, 3 and then stops."""
    page_items = PageItems(iter([1, 2, 3, 4, 5]), 3)
    for expected in (1, 2, 3):
        assert next(page_items) == expected
    # The page is full, so a fourth read must signal the end of the page.
    with pytest.raises(StopIteration):
        next(page_items)
def test_read_pages():
    """Reading pages of two from five items gives [1, 2], [3, 4] and [5]."""
    source = iter([1, 2, 3, 4, 5])
    collected = []
    # An empty list is falsy, so the loop ends on the first empty page.
    while current_page := list(PageItems(source, 2)):
        collected += [current_page]
    assert collected == [[1, 2], [3, 4], [5]]
def test_read_pages_modified_items():
    """Pages can be post-processed item by item while they are being read."""
    source = iter([(1, "A"), (2, "B"), (3, "C"), (4, "D"), (5, "E")])
    collected = []
    # Keep only the first element of each tuple; an empty page ends the loop.
    while first_elements := [pair[0] for pair in PageItems(source, 2)]:
        collected += [first_elements]
    assert collected == [[1, 2], [3, 4], [5]]
I won't be able to use a `for` loop over `PageItems`, as it will always produce empty pages at the end, but with a `while` loop
I can check for emptiness without having to resort to an ugly `if`/`break`
block. This also allows me to either call `list(PageItems(iterator, 2))`
if I just need the items without modifications, or `[item[0] for item in PageItems(iterator, 2)]`
if, for example, the items returned are tuples and I just want the first element.