Skip to content

Commit 0576718

Browse files
committed
Renamed max_blocksize to items_per_block, as the new name is more meaningful
1 parent 6535606 commit 0576718

5 files changed

Lines changed: 58 additions & 53 deletions

File tree

bench/batch_store.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
NBATCHES = 10_000
2020
OBJECTS_PER_BATCH = 100
2121
TOTAL_OBJECTS = NBATCHES * OBJECTS_PER_BATCH
22-
BLOCKSIZE_MAX = 32
22+
ITEMS_PER_BLOCK = 32
2323
N_RANDOM_READS = 1_000
2424

2525

@@ -64,7 +64,7 @@ def build_store(
6464
storage = blosc2.Storage(mode="w")
6565
store = blosc2.BatchStore(
6666
storage=storage,
67-
max_blocksize=BLOCKSIZE_MAX,
67+
items_per_block=ITEMS_PER_BLOCK,
6868
serializer=serializer,
6969
cparams={
7070
"codec": codec,
@@ -84,7 +84,7 @@ def build_store(
8484
"use_dict": use_dict and codec in (blosc2.Codec.ZSTD, blosc2.Codec.LZ4, blosc2.Codec.LZ4HC),
8585
}
8686
with blosc2.BatchStore(
87-
storage=storage, max_blocksize=BLOCKSIZE_MAX, serializer=serializer, cparams=cparams
87+
storage=storage, items_per_block=ITEMS_PER_BLOCK, serializer=serializer, cparams=cparams
8888
) as store:
8989
for batch_index in range(NBATCHES):
9090
store.append(make_batch(batch_index))
@@ -132,7 +132,7 @@ def main() -> None:
132132
assert store is not None
133133
read_store = store
134134
else:
135-
read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, max_blocksize=BLOCKSIZE_MAX)
135+
read_store = blosc2.BatchStore(urlpath=URLPATH, mode="r", contiguous=True, items_per_block=ITEMS_PER_BLOCK)
136136
samples, timings_ns = measure_random_reads(read_store)
137137
t0 = time.perf_counter()
138138
checksum = 0
@@ -147,7 +147,7 @@ def main() -> None:
147147
print(f" build time: {build_time_s:.3f} s")
148148
print(f" batches: {len(read_store)}")
149149
print(f" items: {TOTAL_OBJECTS}")
150-
print(f" max_blocksize: {read_store.max_blocksize}")
150+
print(f" items_per_block: {read_store.items_per_block}")
151151
print()
152152
print(read_store.info)
153153
print(f"Random scalar reads: {N_RANDOM_READS}")

src/blosc2/batch_store.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from blosc2._msgpack_utils import msgpack_packb, msgpack_unpackb
2222
from blosc2.info import InfoReporter, format_nbytes_info
2323

24-
_BATCHSTORE_META = {"version": 1, "serializer": "msgpack", "max_blocksize": None, "arrow_schema": None}
24+
_BATCHSTORE_META = {"version": 1, "serializer": "msgpack", "items_per_block": None, "arrow_schema": None}
2525
_SUPPORTED_SERIALIZERS = {"msgpack", "arrow"}
2626
_BATCHSTORE_VLMETA_KEY = "_batch_store_metadata"
2727

@@ -82,9 +82,9 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]:
8282
items = self._decode_items()
8383
index = self._normalize_index(index)
8484
return items[index]
85-
max_blocksize = self._parent.max_blocksize
86-
if max_blocksize is not None:
87-
block_index, item_index = divmod(index, max_blocksize)
85+
items_per_block = self._parent.items_per_block
86+
if items_per_block is not None:
87+
block_index, item_index = divmod(index, items_per_block)
8888
if block_index >= self._nblocks:
8989
raise IndexError("Batch index out of range")
9090
block = self._get_block(block_index)
@@ -161,9 +161,10 @@ class BatchStore:
161161
162162
Parameters
163163
----------
164-
max_blocksize : int, optional
164+
items_per_block : int, optional
165165
Maximum number of items stored in each internal variable-length block.
166-
If not provided, a value is inferred from the first batch.
166+
The last block in a batch may contain fewer items than this cap. If not
167+
provided, a value is inferred from the first batch.
167168
serializer : {"msgpack", "arrow"}, optional
168169
Serializer used for batch payloads. ``"msgpack"`` is the default and is
169170
the general-purpose choice for Python items, including nested Blosc2
@@ -229,7 +230,7 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None:
229230
except KeyError:
230231
batchstore_meta = {}
231232
self._serializer = batchstore_meta.get("serializer", self._serializer)
232-
self._max_blocksize = batchstore_meta.get("max_blocksize", self._max_blocksize)
233+
self._items_per_block = batchstore_meta.get("items_per_block", self._items_per_block)
233234
self._arrow_schema = batchstore_meta.get("arrow_schema", self._arrow_schema)
234235
self._arrow_schema_obj = None
235236
self._batch_lengths = self._load_batch_lengths()
@@ -258,7 +259,7 @@ def _make_storage(self) -> blosc2.Storage:
258259

259260
def __init__(
260261
self,
261-
max_blocksize: int | None = None,
262+
items_per_block: int | None = None,
262263
serializer: str = "msgpack",
263264
_from_schunk: blosc2.SChunk | None = None,
264265
**kwargs: Any,
@@ -269,11 +270,11 @@ def __init__(
269270
mode is ``"r"`` or ``"a"``, the container is reopened automatically.
270271
Otherwise a new empty store is created.
271272
"""
272-
if max_blocksize is not None and max_blocksize <= 0:
273-
raise ValueError("max_blocksize must be a positive integer")
273+
if items_per_block is not None and items_per_block <= 0:
274+
raise ValueError("items_per_block must be a positive integer")
274275
if serializer not in _SUPPORTED_SERIALIZERS:
275276
raise ValueError(f"Unsupported BatchStore serializer: {serializer!r}")
276-
self._max_blocksize: int | None = max_blocksize
277+
self._items_per_block: int | None = items_per_block
277278
self._serializer = serializer
278279
self._arrow_schema: bytes | None = None
279280
self._arrow_schema_obj = None
@@ -306,7 +307,7 @@ def __init__(
306307
fixed_meta["batchstore"] = {
307308
**_BATCHSTORE_META,
308309
"serializer": self._serializer,
309-
"max_blocksize": self._max_blocksize,
310+
"items_per_block": self._items_per_block,
310311
"arrow_schema": self._arrow_schema,
311312
}
312313
storage.meta = fixed_meta
@@ -432,10 +433,10 @@ def _get_flat_item(self, index: int | slice) -> Any | list[Any]:
432433
return self[batch_index][item_index]
433434

434435
def _block_sizes_from_batch_length(self, batch_length: int, nblocks: int) -> list[int]:
435-
if self._max_blocksize is None or nblocks <= 0:
436+
if self._items_per_block is None or nblocks <= 0:
436437
return []
437-
full_blocks, remainder = divmod(batch_length, self._max_blocksize)
438-
block_sizes = [self._max_blocksize] * full_blocks
438+
full_blocks, remainder = divmod(batch_length, self._items_per_block)
439+
block_sizes = [self._items_per_block] * full_blocks
439440
if remainder:
440441
block_sizes.append(remainder)
441442
if not block_sizes and batch_length > 0:
@@ -445,7 +446,7 @@ def _block_sizes_from_batch_length(self, batch_length: int, nblocks: int) -> lis
445446
return block_sizes
446447

447448
def _get_block_sizes(self, batch_sizes: list[int]) -> list[int] | None:
448-
if self._max_blocksize is None:
449+
if self._items_per_block is None:
449450
return None
450451
block_sizes: list[int] = []
451452
for index, batch_length in enumerate(batch_sizes):
@@ -537,9 +538,9 @@ def _payload_sizes_for_batch(self, batch: Any) -> list[int]:
537538

538539
def _ensure_layout_for_batch(self, batch: Any) -> None:
539540
layout_changed = False
540-
if self._max_blocksize is None:
541+
if self._items_per_block is None:
541542
payload_sizes = self._payload_sizes_for_batch(batch)
542-
self._max_blocksize = self._guess_blocksize(payload_sizes)
543+
self._items_per_block = self._guess_blocksize(payload_sizes)
543544
layout_changed = True
544545
if self._serializer == "arrow" and self._arrow_schema is not None:
545546
layout_changed = layout_changed or len(self) == 0
@@ -555,7 +556,7 @@ def _persist_layout_metadata(self) -> None:
555556
fixed_meta = dict(storage.meta or {})
556557
fixed_meta["batchstore"] = {
557558
**dict(fixed_meta.get("batchstore", {})),
558-
"max_blocksize": self._max_blocksize,
559+
"items_per_block": self._items_per_block,
559560
"serializer": self._serializer,
560561
"arrow_schema": self._arrow_schema,
561562
}
@@ -640,11 +641,11 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]:
640641
return asdict(self.schunk.dparams)
641642

642643
def _compress_batch(self, batch: Any) -> bytes:
643-
if self._max_blocksize is None:
644-
raise RuntimeError("BatchStore max_blocksize is not initialized")
644+
if self._items_per_block is None:
645+
raise RuntimeError("BatchStore items_per_block is not initialized")
645646
blocks = [
646-
self._serialize_block(batch[i : i + self._max_blocksize])
647-
for i in range(0, self._batch_len(batch), self._max_blocksize)
647+
self._serialize_block(batch[i : i + self._items_per_block])
648+
for i in range(0, self._batch_len(batch), self._items_per_block)
648649
]
649650
return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs())
650651

@@ -823,8 +824,12 @@ def dparams(self):
823824
return self.schunk.dparams
824825

825826
@property
826-
def max_blocksize(self) -> int | None:
827-
return self._max_blocksize
827+
def items_per_block(self) -> int | None:
828+
"""Maximum number of items per internal block.
829+
830+
The last block in a batch may contain fewer items.
831+
"""
832+
return self._items_per_block
828833

829834
@property
830835
def items(self) -> BatchStoreItems:
@@ -903,7 +908,7 @@ def copy(self, **kwargs: Any) -> BatchStore:
903908
raise ValueError("meta should not be passed to copy")
904909
kwargs["cparams"] = kwargs.get("cparams", copy.deepcopy(self.cparams))
905910
kwargs["dparams"] = kwargs.get("dparams", copy.deepcopy(self.dparams))
906-
kwargs["max_blocksize"] = kwargs.get("max_blocksize", self.max_blocksize)
911+
kwargs["items_per_block"] = kwargs.get("items_per_block", self.items_per_block)
907912
kwargs["serializer"] = kwargs.get("serializer", self.serializer)
908913
user_vlmeta = self._user_vlmeta_items() if len(self.vlmeta) > 0 else {}
909914

tests/test_batch_store.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def _make_nested_blosc2_objects():
3737
nested_vlarray = blosc2.VLArray()
3838
nested_vlarray.extend(["alpha", {"beta": 2}])
3939

40-
nested_batchstore = blosc2.BatchStore(max_blocksize=2)
40+
nested_batchstore = blosc2.BatchStore(items_per_block=2)
4141
nested_batchstore.extend([[1, 2], ["x", {"y": 3}]])
4242

4343
estore = blosc2.EmbedStore()
@@ -65,8 +65,8 @@ def test_batchstore_roundtrip(contiguous, urlpath):
6565
assert barray.append(batch) == i
6666

6767
assert len(barray) == len(BATCHES)
68-
assert barray.max_blocksize is not None
69-
assert 1 <= barray.max_blocksize <= len(BATCHES[0])
68+
assert barray.items_per_block is not None
69+
assert 1 <= barray.items_per_block <= len(BATCHES[0])
7070
assert [batch[:] for batch in barray] == BATCHES
7171
assert barray.append([1, 2]) == len(BATCHES) + 1
7272
assert [batch[:] for batch in barray][-1] == [1, 2]
@@ -102,7 +102,7 @@ def test_batchstore_roundtrip(contiguous, urlpath):
102102
if urlpath is not None:
103103
reopened = blosc2.open(urlpath, mode="r")
104104
assert isinstance(reopened, blosc2.BatchStore)
105-
assert reopened.max_blocksize == barray.max_blocksize
105+
assert reopened.items_per_block == barray.items_per_block
106106
assert [batch[:] for batch in reopened] == expected
107107
with pytest.raises(ValueError):
108108
reopened.append(["nope"])
@@ -203,7 +203,7 @@ def test_batchstore_from_cframe():
203203
def test_batchstore_msgpack_supports_blosc2_objects():
204204
ndarray, schunk, nested_vlarray, nested_batchstore, estore = _make_nested_blosc2_objects()
205205

206-
barray = blosc2.BatchStore(max_blocksize=2)
206+
barray = blosc2.BatchStore(items_per_block=2)
207207
barray.append([ndarray, schunk, nested_vlarray, nested_batchstore, estore])
208208

209209
restored = barray[0][:]
@@ -277,15 +277,15 @@ def fail_decode(*args, **kwargs):
277277

278278

279279
def test_batchstore_info_reports_exact_block_stats_from_lazy_chunks():
280-
barray = blosc2.BatchStore(max_blocksize=2)
280+
barray = blosc2.BatchStore(items_per_block=2)
281281
barray.extend([[1, 2, 3, 4, 5], [6, 7], [8]])
282282

283283
items = dict(barray.info_items)
284284
assert items["nblocks"] == "5 (items per block: mean=1.60, max=2, min=1)"
285285

286286

287287
def test_batchstore_pop_keeps_batch_lengths_metadata_in_sync():
288-
barray = blosc2.BatchStore(max_blocksize=2)
288+
barray = blosc2.BatchStore(items_per_block=2)
289289
barray.extend([[1, 2, 3], [4, 5], [6]])
290290

291291
removed = barray.pop(1)
@@ -335,9 +335,9 @@ def test_batchstore_zstd_does_not_use_dict_by_default():
335335
assert barray.cparams.use_dict is False
336336

337337

338-
def test_batchstore_explicit_max_blocksize():
339-
barray = blosc2.BatchStore(max_blocksize=2)
340-
assert barray.max_blocksize == 2
338+
def test_batchstore_explicit_items_per_block():
339+
barray = blosc2.BatchStore(items_per_block=2)
340+
assert barray.items_per_block == 2
341341
barray.append([1, 2, 3])
342342
barray.append([4])
343343
assert [batch[:] for batch in barray] == [[1, 2, 3], [4]]
@@ -348,10 +348,10 @@ def test_batchstore_get_vlblock_and_scalar_access():
348348
blosc2.remove_urlpath(urlpath)
349349

350350
batch = [0, 1, 2, 3, 4]
351-
barray = blosc2.BatchStore(storage=_storage(True, urlpath), max_blocksize=2)
351+
barray = blosc2.BatchStore(storage=_storage(True, urlpath), items_per_block=2)
352352
barray.append(batch)
353353

354-
assert barray.max_blocksize == 2
354+
assert barray.items_per_block == 2
355355
assert msgpack_unpackb(barray.schunk.get_vlblock(0, 0)) == batch[:2]
356356
assert msgpack_unpackb(barray.schunk.get_vlblock(0, 1)) == batch[2:4]
357357
assert msgpack_unpackb(barray.schunk.get_vlblock(0, 2)) == batch[4:]
@@ -362,7 +362,7 @@ def test_batchstore_get_vlblock_and_scalar_access():
362362

363363
reopened = blosc2.open(urlpath, mode="r")
364364
assert isinstance(reopened, blosc2.BatchStore)
365-
assert reopened.max_blocksize == 2
365+
assert reopened.items_per_block == 2
366366
assert reopened[0][0] == 0
367367
assert reopened[0][2] == 2
368368
assert reopened[0][4] == 4
@@ -372,7 +372,7 @@ def test_batchstore_get_vlblock_and_scalar_access():
372372

373373

374374
def test_batchstore_scalar_reads_cache_vlblocks():
375-
barray = blosc2.BatchStore(max_blocksize=2)
375+
barray = blosc2.BatchStore(items_per_block=2)
376376
barray.append([0, 1, 2, 3, 4])
377377

378378
batch = barray[0]
@@ -396,7 +396,7 @@ def wrapped_get_vlblock(nchunk, nblock):
396396

397397

398398
def test_batchstore_iter_items():
399-
barray = blosc2.BatchStore(max_blocksize=2)
399+
barray = blosc2.BatchStore(items_per_block=2)
400400
batches = [[1, 2, 3], [4], [5, 6]]
401401
barray.extend(batches)
402402

@@ -424,21 +424,21 @@ def test_batchstore_respects_explicit_use_dict_and_non_zstd():
424424
assert barray.cparams.use_dict is False
425425

426426

427-
def test_batchstore_guess_max_blocksize_uses_l2_for_clevel_5(monkeypatch):
427+
def test_batchstore_guess_items_per_block_uses_l2_for_clevel_5(monkeypatch):
428428
monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100)
429429
monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1000)
430430
barray = blosc2.BatchStore(cparams={"clevel": 5})
431431
assert barray._guess_blocksize([30, 30, 30, 30]) == 4
432432

433433

434-
def test_batchstore_guess_max_blocksize_uses_l2_for_mid_clevel(monkeypatch):
434+
def test_batchstore_guess_items_per_block_uses_l2_for_mid_clevel(monkeypatch):
435435
monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 100)
436436
monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 150)
437437
barray = blosc2.BatchStore(cparams={"clevel": 6})
438438
assert barray._guess_blocksize([60, 60, 60, 60]) == 2
439439

440440

441-
def test_batchstore_guess_max_blocksize_uses_full_batch_for_clevel_9(monkeypatch):
441+
def test_batchstore_guess_items_per_block_uses_full_batch_for_clevel_9(monkeypatch):
442442
monkeypatch.setitem(blosc2.cpu_info, "l1_data_cache_size", 1)
443443
monkeypatch.setitem(blosc2.cpu_info, "l2_cache_size", 1)
444444
barray = blosc2.BatchStore(cparams={"clevel": 9})
@@ -596,7 +596,7 @@ def test_batchstore_items_accessor(contiguous, urlpath):
596596

597597
batches = [["a", "b"], [10, 11, 12], [{"x": 1}], [None, True]]
598598
flat = [item for batch in batches for item in batch]
599-
barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath), max_blocksize=2)
599+
barray = blosc2.BatchStore(storage=_storage(contiguous, urlpath), items_per_block=2)
600600
barray.extend(batches)
601601

602602
assert len(barray.items) == len(flat)

tests/test_tree_store.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -675,7 +675,7 @@ def test_external_batchstore_support(tmp_path):
675675
store_path = tmp_path / "test_batchstore_external.b2d"
676676

677677
with TreeStore(str(store_path), mode="w", threshold=0) as tstore:
678-
bstore = blosc2.BatchStore(max_blocksize=2)
678+
bstore = blosc2.BatchStore(items_per_block=2)
679679
bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]])
680680
tstore["/data/batchstore"] = bstore
681681

@@ -693,7 +693,7 @@ def test_metadata_discovery_reopens_renamed_batchstore_leaf(storage_type, tmp_pa
693693
store_path = tmp_path / f"test_batchstore_renamed.{storage_type}"
694694

695695
with TreeStore(str(store_path), mode="w", threshold=0) as tstore:
696-
bstore = blosc2.BatchStore(max_blocksize=2)
696+
bstore = blosc2.BatchStore(items_per_block=2)
697697
bstore.extend([[{"id": 1}, {"id": 2}], [{"id": 3}]])
698698
tstore["/data/batchstore"] = bstore
699699

tests/test_vlarray.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def _make_nested_blosc2_objects():
3636
nested_vlarray = blosc2.VLArray()
3737
nested_vlarray.extend(["alpha", {"beta": 2}])
3838

39-
nested_batchstore = blosc2.BatchStore(max_blocksize=2)
39+
nested_batchstore = blosc2.BatchStore(items_per_block=2)
4040
nested_batchstore.extend([[1, 2], ["x", {"y": 3}]])
4141

4242
estore = blosc2.EmbedStore()

0 commit comments

Comments (0)