Skip to content

Commit 2ba2532

Browse files
committed
Better support for VLArrays in dict and tree stores
1 parent 5926a46 commit 2ba2532

5 files changed

Lines changed: 149 additions & 28 deletions

File tree

src/blosc2/dict_store.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,23 @@
55
# SPDX-License-Identifier: BSD-3-Clause
66
#######################################################################
77

8+
from __future__ import annotations
9+
810
import os
911
import shutil
1012
import tempfile
1113
import zipfile
12-
from collections.abc import Iterator, Set
13-
from typing import Any
14+
from typing import TYPE_CHECKING, Any
1415

1516
import numpy as np
1617

1718
import blosc2
1819
from blosc2.c2array import C2Array
1920
from blosc2.embed_store import EmbedStore
20-
from blosc2.schunk import SChunk
21+
from blosc2.schunk import SChunk, _process_opened_object
22+
23+
if TYPE_CHECKING:
24+
from collections.abc import Iterator, Set
2125

2226

2327
class DictStore:
@@ -244,20 +248,36 @@ def estore(self) -> EmbedStore:
244248
"""Access the underlying EmbedStore."""
245249
return self._estore
246250

247-
def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
251+
@staticmethod
def _value_nbytes(value: blosc2.Array | SChunk | blosc2.VLArray) -> int:
    """Return the ``nbytes`` figure that is compared against the threshold.

    A VLArray is sized through its ``schunk`` attribute; any other value
    is asked for ``nbytes`` directly.
    """
    sized = value.schunk if isinstance(value, blosc2.VLArray) else value
    return sized.nbytes
256+
257+
@staticmethod
def _is_external_value(value: blosc2.Array | SChunk | blosc2.VLArray) -> bool:
    """Tell whether *value* is an NDArray/SChunk/VLArray with a truthy ``urlpath``."""
    if not isinstance(value, (blosc2.NDArray, SChunk, blosc2.VLArray)):
        return False
    # A missing, None or empty urlpath all count as "not external".
    return bool(getattr(value, "urlpath", None))
262+
263+
@staticmethod
def _external_ext(value: blosc2.Array | SChunk | blosc2.VLArray) -> str:
    """Pick the file extension for an externally stored value.

    NDArray instances get ``.b2nd``; everything else gets ``.b2f``.
    """
    return ".b2nd" if isinstance(value, blosc2.NDArray) else ".b2f"
268+
269+
def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None:
248270
"""Add a node to the DictStore."""
249271
if isinstance(value, np.ndarray):
250272
value = blosc2.asarray(value, cparams=self.cparams, dparams=self.dparams)
251273
# C2Array should always go to embed store; let estore handle it directly
252274
if isinstance(value, C2Array):
253275
self._estore[key] = value
254276
return
255-
exceeds_threshold = self.threshold is not None and value.nbytes >= self.threshold
256-
# Consider both NDArray and SChunk external files (have urlpath)
257-
external_file = isinstance(value, (blosc2.NDArray, SChunk)) and getattr(value, "urlpath", None)
277+
exceeds_threshold = self.threshold is not None and self._value_nbytes(value) >= self.threshold
278+
external_file = self._is_external_value(value)
258279
if exceeds_threshold or (external_file and self.threshold is None):
259-
# Choose extension based on type
260-
ext = ".b2f" if isinstance(value, SChunk) else ".b2nd"
280+
ext = self._external_ext(value)
261281
# Convert key to a proper file path within the tree directory
262282
rel_key = key.lstrip("/")
263283
dest_path = os.path.join(self.working_dir, rel_key + ext)
@@ -272,7 +292,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
272292
if hasattr(value, "save"):
273293
value.save(urlpath=dest_path)
274294
else:
275-
# An SChunk does not have a save() method
295+
# SChunk and VLArray can both be persisted via their cframe.
276296
with open(dest_path, "wb") as f:
277297
f.write(value.to_cframe())
278298
else:
@@ -290,20 +310,21 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
290310
value = blosc2.from_cframe(value.to_cframe())
291311
self._estore[key] = value
292312

293-
def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array:
313+
def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array:
294314
"""Retrieve a node from the DictStore."""
295315
# Check map_tree first
296316
if key in self.map_tree:
297317
filepath = self.map_tree[key]
298318
if filepath in self.offsets:
299319
offset = self.offsets[filepath]["offset"]
300-
return blosc2.blosc2_ext.open(
320+
opened = blosc2.blosc2_ext.open(
301321
self.b2z_path,
302322
mode="r",
303323
offset=offset,
304324
mmap_mode=self.mmap_mode,
305325
dparams=self.dparams,
306326
)
327+
return _process_opened_object(opened)
307328
else:
308329
urlpath = os.path.join(self.working_dir, filepath)
309330
if os.path.exists(urlpath):
@@ -319,7 +340,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array:
319340
# Fall back to EmbedStore
320341
return self._estore[key]
321342

322-
def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | C2Array | Any:
343+
def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | C2Array | Any:
323344
"""Retrieve a node, or default if not found."""
324345
try:
325346
return self[key]

src/blosc2/embed_store.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,20 @@
55
# SPDX-License-Identifier: BSD-3-Clause
66
#######################################################################
77

8+
from __future__ import annotations
9+
810
import copy
9-
from collections.abc import Iterator, KeysView
10-
from typing import Any
11+
from typing import TYPE_CHECKING, Any
1112

1213
import numpy as np
1314

1415
import blosc2
1516
from blosc2.c2array import C2Array
16-
from blosc2.schunk import SChunk
17+
18+
if TYPE_CHECKING:
19+
from collections.abc import Iterator, KeysView
20+
21+
from blosc2.schunk import SChunk
1722

1823
PROFILE = False # Set to True to enable PROFILE prints in EmbedStore
1924

@@ -168,7 +173,7 @@ def _ensure_capacity(self, needed_bytes: int) -> None:
168173
new_size = max(required_size, int(self._store.shape[0] * 1.5))
169174
self._store.resize((new_size,))
170175

171-
def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
176+
def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None:
172177
"""Add a node to the embed store."""
173178
if self.mode == "r":
174179
raise ValueError("Cannot set items in read-only mode.")
@@ -191,7 +196,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
191196
self._embed_map[key] = {"offset": offset, "length": data_len}
192197
self._save_metadata()
193198

194-
def __getitem__(self, key: str) -> blosc2.NDArray | SChunk:
199+
def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | blosc2.VLArray:
195200
"""Retrieve a node from the embed store."""
196201
if key not in self._embed_map:
197202
raise KeyError(f"Key '{key}' not found in the embed store.")
@@ -207,7 +212,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk:
207212
# Use from_cframe so we can deserialize an NDArray, an SChunk or a VLArray
208213
return blosc2.from_cframe(serialized_data, copy=True)
209214

210-
def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | Any:
215+
def get(self, key: str, default: Any = None) -> blosc2.NDArray | SChunk | blosc2.VLArray | Any:
    """Return the node stored under *key*, or *default* when the key is absent."""
    if key not in self._embed_map:
        return default
    return self[key]
213218

@@ -234,12 +239,12 @@ def keys(self) -> KeysView[str]:
234239
"""Return all keys."""
235240
return self._embed_map.keys()
236241

237-
def values(self) -> Iterator[blosc2.NDArray | SChunk]:
242+
def values(self) -> Iterator[blosc2.NDArray | SChunk | blosc2.VLArray]:
    """Lazily yield the stored node for every key in the embed map."""
    yield from (self[name] for name in self._embed_map)
241246

242-
def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk]]:
247+
def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | blosc2.VLArray]]:
    """Lazily yield ``(key, node)`` pairs for every key in the embed map."""
    yield from ((name, self[name]) for name in self._embed_map)

src/blosc2/tree_store.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
# SPDX-License-Identifier: BSD-3-Clause
66
#######################################################################
77

8+
from __future__ import annotations
9+
810
import contextlib
911
import os
1012
from collections.abc import Iterator, MutableMapping
@@ -14,11 +16,11 @@
1416

1517
import blosc2
1618
from blosc2.dict_store import DictStore
17-
from blosc2.schunk import SChunk
1819

1920
if TYPE_CHECKING:
2021
from blosc2.c2array import C2Array
2122
from blosc2.ndarray import NDArray
23+
from blosc2.schunk import SChunk
2224

2325

2426
class vlmetaProxy(MutableMapping):
@@ -29,7 +31,7 @@ class vlmetaProxy(MutableMapping):
2931
- Delegates iteration and length to the underlying vlmeta object.
3032
"""
3133

32-
def __init__(self, tstore: "TreeStore", inner_vlmeta):
34+
def __init__(self, tstore: TreeStore, inner_vlmeta):
    """Keep references to the owning store and the wrapped vlmeta object."""
    self._tstore, self._inner = tstore, inner_vlmeta
3537

@@ -224,7 +226,7 @@ def _validate_key(self, key: str) -> str:
224226

225227
return key
226228

227-
def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
229+
def __setitem__(self, key: str, value: blosc2.Array | SChunk | blosc2.VLArray) -> None:
228230
"""Add a node with hierarchical key validation.
229231
230232
Parameters
@@ -266,7 +268,7 @@ def __setitem__(self, key: str, value: blosc2.Array | SChunk) -> None:
266268
full_key = self._translate_key_to_full(key)
267269
super().__setitem__(full_key, value)
268270

269-
def __getitem__(self, key: str) -> "NDArray | C2Array | SChunk | TreeStore":
271+
def __getitem__(self, key: str) -> NDArray | C2Array | SChunk | blosc2.VLArray | TreeStore:
270272
"""Retrieve a node or subtree view.
271273
272274
If the key points to a subtree (intermediate path with children),
@@ -280,7 +282,7 @@ def __getitem__(self, key: str) -> "NDArray | C2Array | SChunk | TreeStore":
280282
281283
Returns
282284
-------
283-
out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or TreeStore
285+
out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.VLArray or TreeStore
284286
The stored array/chunk if key is a leaf node, or a TreeStore subtree view
285287
if key is an intermediate path with children.
286288
@@ -416,7 +418,7 @@ def __iter__(self) -> Iterator[str]:
416418
"""Iterate over keys, excluding vlmeta keys."""
417419
return iter(self.keys())
418420

419-
def items(self) -> Iterator[tuple[str, "NDArray | C2Array | SChunk | TreeStore"]]:
421+
def items(self) -> Iterator[tuple[str, NDArray | C2Array | SChunk | blosc2.VLArray | TreeStore]]:
    """Return key-value pairs in the current subtree view.

    Yields
    ------
    out : tuple
        ``(key, self[key])`` for every key returned by ``self.keys()``.
        The value type mirrors what ``__getitem__`` declares, so it
        includes ``blosc2.VLArray``.
    """
    for key in self.keys():
        yield key, self[key]
@@ -575,7 +577,7 @@ def walk(self, path: str = "/", topdown: bool = True) -> Iterator[tuple[str, lis
575577
# Yield current level after children (post-order)
576578
yield path, children_dirs, leaf_nodes
577579

578-
def get_subtree(self, path: str) -> "TreeStore":
580+
def get_subtree(self, path: str) -> TreeStore:
579581
"""Create a subtree view with the specified path as root.
580582
581583
Parameters

tests/test_dict_store.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,49 @@ def test_external_schunk_file_and_reopen():
223223
os.remove(path)
224224

225225

226+
def test_store_and_retrieve_vlarray_in_dict(tmp_path):
    """Round-trip an in-memory VLArray through a DictStore and a reopen."""
    store_file = str(tmp_path / "test_dstore_vlarray_embed.b2z")
    values = [{"name": "alpha", "count": 1}, None, ("tuple", 2), [1, "two", b"three"]]

    source = blosc2.VLArray()
    source.extend(values)

    def _assert_roundtrip(store):
        # The node must come back as a VLArray holding the same items.
        fetched = store["/vlarray"]
        assert isinstance(fetched, blosc2.VLArray)
        assert list(fetched) == values

    with DictStore(store_file, mode="w") as writer:
        writer["/vlarray"] = source
        _assert_roundtrip(writer)

    with DictStore(store_file, mode="r") as reader:
        _assert_roundtrip(reader)
243+
244+
245+
def test_external_vlarray_file_and_reopen(tmp_path):
    """Store a file-backed VLArray externally and read it back after reopening."""
    archive = tmp_path / "test_dstore_vlarray_external.b2z"
    node_key = "/dir1/vlarray_ext"
    values = ["alpha", {"nested": True}, None, (1, 2, 3)]

    external = blosc2.VLArray(urlpath=str(tmp_path / "ext_vlarray.b2frame"), mode="w", contiguous=True)
    external.extend(values)
    external.vlmeta["description"] = "External VLArray"

    with DictStore(str(archive), mode="w", threshold=None) as writer:
        writer[node_key] = external
        assert node_key in writer.map_tree
        assert writer.map_tree[node_key].endswith(".b2f")

    # The external node must show up as its own member of the zip archive.
    with zipfile.ZipFile(archive, "r") as zf:
        assert "dir1/vlarray_ext.b2f" in zf.namelist()

    with DictStore(str(archive), mode="r") as reader:
        fetched = reader[node_key]
        assert isinstance(fetched, blosc2.VLArray)
        assert list(fetched) == values
        assert fetched.vlmeta["description"] == "External VLArray"
267+
268+
226269
def _digest_value(value):
227270
"""Return a bytes digest of a stored value."""
228271
if isinstance(value, blosc2.SChunk):

tests/test_tree_store.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,56 @@ def test_schunk_support():
604604
os.remove("test_schunk.b2z")
605605

606606

607+
def test_vlarray_support():
    """Test that TreeStore supports embedded VLArray objects."""
    store_path = "test_vlarray.b2z"
    values = [{"name": "alpha", "count": 1}, None, ("tuple", 2), [1, "two", b"three"]]
    try:
        with TreeStore(store_path, mode="w") as tstore:
            vlarray = blosc2.VLArray()
            vlarray.extend(values)
            tstore["/data/vlarray1"] = vlarray

            retrieved = tstore["/data/vlarray1"]
            assert isinstance(retrieved, blosc2.VLArray)
            assert list(retrieved) == values

            # An intermediate path resolves to a subtree view with relative keys.
            data_subtree = tstore["/data"]
            assert isinstance(data_subtree, TreeStore)
            assert set(data_subtree.keys()) == {"/vlarray1"}

        with TreeStore(store_path, mode="r") as tstore:
            retrieved = tstore["/data/vlarray1"]
            assert isinstance(retrieved, blosc2.VLArray)
            assert list(retrieved) == values
    finally:
        # Clean up even when an assertion fails, so a stale archive in the
        # working directory cannot influence later test runs.
        if os.path.exists(store_path):
            os.remove(store_path)
629+
630+
631+
def test_external_vlarray_support():
    """Test that TreeStore supports external VLArray objects."""
    ext_path = "ext_vlarray.b2frame"
    store_path = "test_vlarray_external.b2z"
    values = ["alpha", {"nested": True}, None, (1, 2, 3)]
    # Start from a clean slate in case a previous run left the file behind.
    if os.path.exists(ext_path):
        os.remove(ext_path)

    try:
        vlarray = blosc2.VLArray(urlpath=ext_path, mode="w", contiguous=True)
        vlarray.extend(values)
        vlarray.vlmeta["description"] = "External VLArray for TreeStore"

        with TreeStore(store_path, mode="w", threshold=None) as tstore:
            tstore["/data/vlarray_ext"] = vlarray
            assert "/data/vlarray_ext" in tstore

        with TreeStore(store_path, mode="r") as tstore:
            retrieved = tstore["/data/vlarray_ext"]
            assert isinstance(retrieved, blosc2.VLArray)
            assert list(retrieved) == values
            assert retrieved.vlmeta["description"] == "External VLArray for TreeStore"
    finally:
        # Remove both artifacts even when an assertion fails, so nothing
        # leaks into the working directory.
        for leftover in (ext_path, store_path):
            if os.path.exists(leftover):
                os.remove(leftover)
655+
656+
607657
def test_walk_topdown_argument_ordering():
608658
"""Ensure walk supports topdown argument mimicking os.walk order semantics."""
609659
with TreeStore("test_walk_topdown.b2z", mode="w") as tstore:

0 commit comments

Comments
 (0)