2121from blosc2 ._msgpack_utils import msgpack_packb , msgpack_unpackb
2222from blosc2 .info import InfoReporter , format_nbytes_info
2323
# Default BatchStore metadata entry. ``items_per_block`` (called
# ``max_blocksize`` before the rename) and ``arrow_schema`` default to None
# here; a store spreads this dict and then overrides the entries with its own
# values when it writes its metadata.
_BATCHSTORE_META = {
    "version": 1,
    "serializer": "msgpack",
    "items_per_block": None,
    "arrow_schema": None,
}
# Serializer names that the BatchStore constructor accepts.
_SUPPORTED_SERIALIZERS = {"msgpack", "arrow"}
# Key name for BatchStore's own metadata entry (presumably in vlmeta, going by
# the constant's name -- confirm against the code that uses it).
_BATCHSTORE_VLMETA_KEY = "_batch_store_metadata"
2727
@@ -82,9 +82,9 @@ def __getitem__(self, index: int | slice) -> Any | list[Any]:
8282 items = self ._decode_items ()
8383 index = self ._normalize_index (index )
8484 return items [index ]
85- max_blocksize = self ._parent .max_blocksize
86- if max_blocksize is not None :
87- block_index , item_index = divmod (index , max_blocksize )
85+ items_per_block = self ._parent .items_per_block
86+ if items_per_block is not None :
87+ block_index , item_index = divmod (index , items_per_block )
8888 if block_index >= self ._nblocks :
8989 raise IndexError ("Batch index out of range" )
9090 block = self ._get_block (block_index )
@@ -161,9 +161,10 @@ class BatchStore:
161161
162162 Parameters
163163 ----------
164- max_blocksize : int, optional
164+ items_per_block : int, optional
165165 Maximum number of items stored in each internal variable-length block.
166- If not provided, a value is inferred from the first batch.
166+ The last block in a batch may contain fewer items than this cap. If not
167+ provided, a value is inferred from the first batch.
167168 serializer : {"msgpack", "arrow"}, optional
168169 Serializer used for batch payloads. ``"msgpack"`` is the default and is
169170 the general-purpose choice for Python items, including nested Blosc2
@@ -229,7 +230,7 @@ def _attach_schunk(self, schunk: blosc2.SChunk) -> None:
229230 except KeyError :
230231 batchstore_meta = {}
231232 self ._serializer = batchstore_meta .get ("serializer" , self ._serializer )
232- self ._max_blocksize = batchstore_meta .get ("max_blocksize " , self ._max_blocksize )
233+ self ._items_per_block = batchstore_meta .get ("items_per_block " , self ._items_per_block )
233234 self ._arrow_schema = batchstore_meta .get ("arrow_schema" , self ._arrow_schema )
234235 self ._arrow_schema_obj = None
235236 self ._batch_lengths = self ._load_batch_lengths ()
@@ -258,7 +259,7 @@ def _make_storage(self) -> blosc2.Storage:
258259
259260 def __init__ (
260261 self ,
261- max_blocksize : int | None = None ,
262+ items_per_block : int | None = None ,
262263 serializer : str = "msgpack" ,
263264 _from_schunk : blosc2 .SChunk | None = None ,
264265 ** kwargs : Any ,
@@ -269,11 +270,11 @@ def __init__(
269270 mode is ``"r"`` or ``"a"``, the container is reopened automatically.
270271 Otherwise a new empty store is created.
271272 """
272- if max_blocksize is not None and max_blocksize <= 0 :
273- raise ValueError ("max_blocksize must be a positive integer" )
273+ if items_per_block is not None and items_per_block <= 0 :
274+ raise ValueError ("items_per_block must be a positive integer" )
274275 if serializer not in _SUPPORTED_SERIALIZERS :
275276 raise ValueError (f"Unsupported BatchStore serializer: { serializer !r} " )
276- self ._max_blocksize : int | None = max_blocksize
277+ self ._items_per_block : int | None = items_per_block
277278 self ._serializer = serializer
278279 self ._arrow_schema : bytes | None = None
279280 self ._arrow_schema_obj = None
@@ -306,7 +307,7 @@ def __init__(
306307 fixed_meta ["batchstore" ] = {
307308 ** _BATCHSTORE_META ,
308309 "serializer" : self ._serializer ,
309- "max_blocksize " : self ._max_blocksize ,
310+ "items_per_block " : self ._items_per_block ,
310311 "arrow_schema" : self ._arrow_schema ,
311312 }
312313 storage .meta = fixed_meta
@@ -432,10 +433,10 @@ def _get_flat_item(self, index: int | slice) -> Any | list[Any]:
432433 return self [batch_index ][item_index ]
433434
434435 def _block_sizes_from_batch_length (self , batch_length : int , nblocks : int ) -> list [int ]:
435- if self ._max_blocksize is None or nblocks <= 0 :
436+ if self ._items_per_block is None or nblocks <= 0 :
436437 return []
437- full_blocks , remainder = divmod (batch_length , self ._max_blocksize )
438- block_sizes = [self ._max_blocksize ] * full_blocks
438+ full_blocks , remainder = divmod (batch_length , self ._items_per_block )
439+ block_sizes = [self ._items_per_block ] * full_blocks
439440 if remainder :
440441 block_sizes .append (remainder )
441442 if not block_sizes and batch_length > 0 :
@@ -445,7 +446,7 @@ def _block_sizes_from_batch_length(self, batch_length: int, nblocks: int) -> lis
445446 return block_sizes
446447
447448 def _get_block_sizes (self , batch_sizes : list [int ]) -> list [int ] | None :
448- if self ._max_blocksize is None :
449+ if self ._items_per_block is None :
449450 return None
450451 block_sizes : list [int ] = []
451452 for index , batch_length in enumerate (batch_sizes ):
@@ -537,9 +538,9 @@ def _payload_sizes_for_batch(self, batch: Any) -> list[int]:
537538
538539 def _ensure_layout_for_batch (self , batch : Any ) -> None :
539540 layout_changed = False
540- if self ._max_blocksize is None :
541+ if self ._items_per_block is None :
541542 payload_sizes = self ._payload_sizes_for_batch (batch )
542- self ._max_blocksize = self ._guess_blocksize (payload_sizes )
543+ self ._items_per_block = self ._guess_blocksize (payload_sizes )
543544 layout_changed = True
544545 if self ._serializer == "arrow" and self ._arrow_schema is not None :
545546 layout_changed = layout_changed or len (self ) == 0
@@ -555,7 +556,7 @@ def _persist_layout_metadata(self) -> None:
555556 fixed_meta = dict (storage .meta or {})
556557 fixed_meta ["batchstore" ] = {
557558 ** dict (fixed_meta .get ("batchstore" , {})),
558- "max_blocksize " : self ._max_blocksize ,
559+ "items_per_block " : self ._items_per_block ,
559560 "serializer" : self ._serializer ,
560561 "arrow_schema" : self ._arrow_schema ,
561562 }
@@ -640,11 +641,11 @@ def _vl_dparams_kwargs(self) -> dict[str, Any]:
640641 return asdict (self .schunk .dparams )
641642
def _compress_batch(self, batch: Any) -> bytes:
    """Serialize *batch* block by block and compress the blocks together.

    The batch is cut into consecutive slices of ``self._items_per_block``
    items (the final slice may be shorter). Each slice is passed through
    ``self._serialize_block`` and the resulting list is handed to
    ``blosc2.blosc2_ext.vlcompress`` together with the keyword arguments
    returned by ``self._vl_cparams_kwargs()``.

    Parameters
    ----------
    batch
        Sliceable batch of items; its length is taken from
        ``self._batch_len(batch)``.

    Returns
    -------
    bytes
        The result of ``vlcompress`` for the serialized blocks.

    Raises
    ------
    RuntimeError
        If ``items_per_block`` has not been set yet.
    """
    items_per_block = self._items_per_block
    if items_per_block is None:
        raise RuntimeError("BatchStore items_per_block is not initialized")
    blocks = [
        self._serialize_block(batch[start : start + items_per_block])
        for start in range(0, self._batch_len(batch), items_per_block)
    ]
    return blosc2.blosc2_ext.vlcompress(blocks, **self._vl_cparams_kwargs())
650651
@@ -823,8 +824,12 @@ def dparams(self):
823824 return self .schunk .dparams
824825
825826 @property
826- def max_blocksize (self ) -> int | None :
827- return self ._max_blocksize
827+ def items_per_block (self ) -> int | None :
828+ """Maximum number of items per internal block.
829+
830+ The last block in a batch may contain fewer items.
831+ """
832+ return self ._items_per_block
828833
829834 @property
830835 def items (self ) -> BatchStoreItems :
@@ -903,7 +908,7 @@ def copy(self, **kwargs: Any) -> BatchStore:
903908 raise ValueError ("meta should not be passed to copy" )
904909 kwargs ["cparams" ] = kwargs .get ("cparams" , copy .deepcopy (self .cparams ))
905910 kwargs ["dparams" ] = kwargs .get ("dparams" , copy .deepcopy (self .dparams ))
906- kwargs ["max_blocksize " ] = kwargs .get ("max_blocksize " , self .max_blocksize )
911+ kwargs ["items_per_block " ] = kwargs .get ("items_per_block " , self .items_per_block )
907912 kwargs ["serializer" ] = kwargs .get ("serializer" , self .serializer )
908913 user_vlmeta = self ._user_vlmeta_items () if len (self .vlmeta ) > 0 else {}
909914
0 commit comments