Skip to content

Commit 0edae8b

Browse files
committed
New Ref object for serializing external references
1 parent 1a4b3b5 commit 0edae8b

10 files changed

Lines changed: 228 additions & 67 deletions

File tree

doc/reference/classes.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Main Classes
1818
EmbedStore
1919
BatchStore
2020
VLArray
21+
Ref
2122
Proxy
2223
ProxySource
2324
ProxyNDSource
@@ -37,6 +38,7 @@ Main Classes
3738
embed_store
3839
batch_store
3940
vlarray
41+
ref
4042
proxy
4143
proxysource
4244
proxyndsource
@@ -58,4 +60,5 @@ Other Classes
5860
Storage
5961
Tuner
6062
URLPath
63+
Ref
6164
FPAccuracy

doc/reference/schunk.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,27 @@ SChunk
55

66
The basic compressed data container (aka super-chunk). This class consists of a set of useful parameters and methods that allow not only to create compressed data, and decompress it, but also to manage the data in a more sophisticated way. For example, it is possible to append new data, update existing data, delete data, etc.
77

8+
Metadata support
9+
----------------
10+
11+
``SChunk.vlmeta`` uses the general Blosc2 msgpack extensions; see
12+
:ref:`Msgpack Serialization <MsgpackSerialization>`. This means
13+
variable-length metadata can store not only ordinary msgpack-safe Python
14+
values, but also the currently supported Blosc2 objects and references,
15+
including:
16+
17+
- ``NDArray``, ``SChunk``, ``VLArray``, ``BatchStore``, ``EmbedStore``
18+
- ``Ref``
19+
- ``C2Array``
20+
- ``LazyExpr``
21+
- ``LazyUDF`` backed by ``@blosc2.dsl_kernel``
22+
23+
Both single-key access (``schunk.vlmeta["name"]``) and bulk access
24+
(``schunk.vlmeta[:]``) use this serializer.
25+
26+
Lazy expressions and supported lazy UDFs still require durable operand
27+
references only; purely in-memory operands are intentionally rejected.
28+
829
.. currentmodule:: blosc2
930

1031
.. autoclass:: SChunk

src/blosc2/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ def _raise(exc):
537537
from .tree_store import TreeStore
538538
from .batch_store import Batch, BatchStore
539539
from .vlarray import VLArray, vlarray_from_cframe
540+
from .ref import Ref
540541

541542
from .c2array import c2context, C2Array, URLPath
542543

@@ -740,6 +741,7 @@ def _raise(exc):
740741
"ProxyNDField",
741742
"ProxyNDSource",
742743
"ProxySource",
744+
"Ref",
743745
"SChunk",
744746
"SimpleProxy",
745747
"SpecialValue",

src/blosc2/_msgpack_utils.py

Lines changed: 9 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from blosc2 import blosc2_ext
2020
from blosc2.dsl_kernel import DSLKernel
21+
from blosc2.ref import Ref
2122

2223
# Msgpack extension type codes are application-defined. Reserve code 42 in
2324
# python-blosc2 for values serialized as Blosc2 CFrames via ``to_cframe()`` and
@@ -33,43 +34,15 @@
3334

3435

3536
def _encode_operand_reference(obj):
36-
import blosc2
37-
38-
if isinstance(obj, blosc2.C2Array):
39-
return {
40-
"kind": "c2array",
41-
"version": _BLOSC2_STRUCTURED_VERSION,
42-
"path": obj.path,
43-
"urlbase": obj.urlbase,
44-
}
45-
if isinstance(obj, blosc2.Proxy):
46-
obj = obj._cache
47-
dictstore_urlpath = getattr(obj, "_msgpack_dictstore_urlpath", None)
48-
dictstore_key = getattr(obj, "_msgpack_dictstore_key", None)
49-
if isinstance(dictstore_urlpath, str) and isinstance(dictstore_key, str):
50-
return {
51-
"kind": "dictstore_key",
52-
"version": _BLOSC2_STRUCTURED_VERSION,
53-
"urlpath": dictstore_urlpath,
54-
"key": dictstore_key,
55-
}
56-
if hasattr(obj, "schunk"):
57-
urlpath = obj.schunk.urlpath
58-
if urlpath is None:
59-
raise ValueError(
60-
"Structured Blosc2 msgpack payload requires operands to be stored on disk/network"
61-
)
62-
return {
63-
"kind": "urlpath",
64-
"version": _BLOSC2_STRUCTURED_VERSION,
65-
"urlpath": urlpath,
66-
}
67-
raise TypeError("Structured Blosc2 msgpack payload requires NDArray, C2Array, or Proxy operands")
37+
return Ref.from_object(obj).to_dict()
6838

6939

7040
def _encode_structured_reference(obj):
7141
import blosc2
7242

43+
if isinstance(obj, blosc2.Ref):
44+
payload = {"kind": "ref", "version": _BLOSC2_STRUCTURED_VERSION, "ref": obj.to_dict()}
45+
return ExtType(_BLOSC2_STRUCTURED_EXT_CODE, packb(payload, use_bin_type=True))
7346
if isinstance(obj, blosc2.C2Array):
7447
payload = _encode_operand_reference(obj)
7548
return ExtType(_BLOSC2_STRUCTURED_EXT_CODE, packb(payload, use_bin_type=True))
@@ -124,38 +97,7 @@ def _encode_structured_reference(obj):
12497

12598

12699
def _decode_operand_reference(payload):
127-
import blosc2
128-
129-
if not isinstance(payload, dict):
130-
raise TypeError("Structured Blosc2 msgpack payload must decode to a mapping")
131-
132-
version = payload.get("version")
133-
if version != _BLOSC2_STRUCTURED_VERSION:
134-
raise ValueError(f"Unsupported structured Blosc2 msgpack payload version: {version!r}")
135-
136-
kind = payload.get("kind")
137-
if kind == "c2array":
138-
path = payload.get("path")
139-
if not isinstance(path, str):
140-
raise TypeError("Structured C2Array msgpack payload requires a string 'path'")
141-
urlbase = payload.get("urlbase")
142-
if urlbase is not None and not isinstance(urlbase, str):
143-
raise TypeError("Structured C2Array msgpack payload requires 'urlbase' to be a string or None")
144-
return blosc2.C2Array(path, urlbase=urlbase)
145-
if kind == "dictstore_key":
146-
urlpath = payload.get("urlpath")
147-
if not isinstance(urlpath, str):
148-
raise TypeError("Structured DictStore-key msgpack payload requires a string 'urlpath'")
149-
key = payload.get("key")
150-
if not isinstance(key, str):
151-
raise TypeError("Structured DictStore-key msgpack payload requires a string 'key'")
152-
return blosc2.DictStore(urlpath, mode="r")[key]
153-
if kind == "urlpath":
154-
urlpath = payload.get("urlpath")
155-
if not isinstance(urlpath, str):
156-
raise TypeError("Structured urlpath msgpack payload requires a string 'urlpath'")
157-
return blosc2.open(urlpath, mode="r")
158-
raise ValueError(f"Unsupported structured Blosc2 msgpack payload operand kind: {kind!r}")
100+
return Ref.from_dict(payload).open()
159101

160102

161103
def _decode_structured_reference(data):
@@ -168,6 +110,9 @@ def _decode_structured_reference(data):
168110
raise ValueError(f"Unsupported structured Blosc2 msgpack payload version: {version!r}")
169111

170112
kind = payload.get("kind")
113+
if kind == "ref":
114+
ref_payload = payload.get("ref")
115+
return Ref.from_dict(ref_payload)
171116
if kind == "c2array":
172117
return _decode_operand_reference(payload)
173118
if kind == "lazyexpr":

src/blosc2/dict_store.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,7 @@ def _annotate_external_value(
335335
value: blosc2.NDArray | SChunk | blosc2.VLArray | blosc2.BatchStore | C2Array,
336336
):
337337
"""Attach DictStore origin metadata so structured msgpack can preserve member identity."""
338-
value._msgpack_dictstore_urlpath = self.localpath
339-
value._msgpack_dictstore_key = key
338+
value._blosc2_ref = blosc2.Ref.dictstore_key(self.localpath, key)
340339
return value
341340

342341
@property

src/blosc2/ref.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#######################################################################
7+
8+
from __future__ import annotations
9+
10+
from dataclasses import dataclass
11+
from typing import Any
12+
13+
14+
@dataclass(frozen=True, slots=True)
15+
class Ref:
16+
"""A durable reference to a Blosc2 object.
17+
18+
``Ref`` can describe:
19+
20+
- a persistent local Blosc2 object reopenable from ``urlpath``
21+
- a member inside a :class:`blosc2.DictStore`
22+
- a remote :class:`blosc2.C2Array`
23+
24+
Instances can be created directly, from dictionaries via :meth:`from_dict`,
25+
or from supported objects via :meth:`from_object`. Use :meth:`open` to
26+
resolve the reference back into a live Blosc2 object.
27+
"""
28+
29+
kind: str
30+
urlpath: str | None = None
31+
key: str | None = None
32+
path: str | None = None
33+
urlbase: str | None = None
34+
35+
def __post_init__(self) -> None:
36+
if self.kind == "urlpath":
37+
if not isinstance(self.urlpath, str):
38+
raise TypeError("Ref(kind='urlpath') requires a string 'urlpath'")
39+
if self.key is not None or self.path is not None or self.urlbase is not None:
40+
raise ValueError("Ref(kind='urlpath') only supports the 'urlpath' field")
41+
return
42+
if self.kind == "dictstore_key":
43+
if not isinstance(self.urlpath, str):
44+
raise TypeError("Ref(kind='dictstore_key') requires a string 'urlpath'")
45+
if not isinstance(self.key, str):
46+
raise TypeError("Ref(kind='dictstore_key') requires a string 'key'")
47+
if self.path is not None or self.urlbase is not None:
48+
raise ValueError("Ref(kind='dictstore_key') only supports 'urlpath' and 'key'")
49+
return
50+
if self.kind == "c2array":
51+
if not isinstance(self.path, str):
52+
raise TypeError("Ref(kind='c2array') requires a string 'path'")
53+
if self.urlbase is not None and not isinstance(self.urlbase, str):
54+
raise TypeError("Ref(kind='c2array') requires 'urlbase' to be a string or None")
55+
if self.urlpath is not None or self.key is not None:
56+
raise ValueError("Ref(kind='c2array') only supports 'path' and 'urlbase'")
57+
return
58+
raise ValueError(f"Unsupported Ref kind: {self.kind!r}")
59+
60+
@classmethod
61+
def urlpath_ref(cls, urlpath: str) -> Ref:
62+
return cls(kind="urlpath", urlpath=urlpath)
63+
64+
@classmethod
65+
def dictstore_key(cls, urlpath: str, key: str) -> Ref:
66+
return cls(kind="dictstore_key", urlpath=urlpath, key=key)
67+
68+
@classmethod
69+
def c2array_ref(cls, path: str, urlbase: str | None = None) -> Ref:
70+
return cls(kind="c2array", path=path, urlbase=urlbase)
71+
72+
@classmethod
73+
def from_dict(cls, payload: dict[str, Any]) -> Ref:
74+
if not isinstance(payload, dict):
75+
raise TypeError("Ref payload must be a mapping")
76+
version = payload.get("version")
77+
if version != 1:
78+
raise ValueError(f"Unsupported Ref payload version: {version!r}")
79+
return cls(
80+
kind=payload.get("kind"),
81+
urlpath=payload.get("urlpath"),
82+
key=payload.get("key"),
83+
path=payload.get("path"),
84+
urlbase=payload.get("urlbase"),
85+
)
86+
87+
@classmethod
88+
def from_object(cls, obj: Any) -> Ref:
89+
import blosc2
90+
91+
if isinstance(obj, blosc2.C2Array):
92+
return cls.c2array_ref(obj.path, obj.urlbase)
93+
if isinstance(obj, blosc2.Proxy):
94+
obj = obj._cache
95+
ref = getattr(obj, "_blosc2_ref", None)
96+
if isinstance(ref, cls):
97+
return ref
98+
if hasattr(obj, "schunk"):
99+
urlpath = obj.schunk.urlpath
100+
if urlpath is None:
101+
raise ValueError("Durable Blosc2 references require operands to be stored on disk/network")
102+
return cls.urlpath_ref(urlpath)
103+
raise TypeError("Durable Blosc2 references require NDArray, C2Array, or Proxy operands")
104+
105+
def to_dict(self) -> dict[str, Any]:
106+
payload = {"kind": self.kind, "version": 1}
107+
if self.kind == "urlpath":
108+
payload["urlpath"] = self.urlpath
109+
elif self.kind == "dictstore_key":
110+
payload["urlpath"] = self.urlpath
111+
payload["key"] = self.key
112+
elif self.kind == "c2array":
113+
payload["path"] = self.path
114+
payload["urlbase"] = self.urlbase
115+
return payload
116+
117+
def open(self):
118+
import blosc2
119+
120+
if self.kind == "urlpath":
121+
return blosc2.open(self.urlpath, mode="r")
122+
if self.kind == "dictstore_key":
123+
return blosc2.DictStore(self.urlpath, mode="r")[self.key]
124+
if self.kind == "c2array":
125+
return blosc2.C2Array(self.path, urlbase=self.urlbase)
126+
raise ValueError(f"Unsupported Ref kind: {self.kind!r}")

src/blosc2/schunk.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,20 @@ class vlmeta(MutableMapping, blosc2_ext.vlmeta):
2727
"""
2828
Class providing access to user metadata on an :ref:`SChunk`.
2929
It is available via the `.vlmeta` property of an :ref:`SChunk`.
30+
31+
Values are serialized using the general Blosc2 msgpack extensions; see
32+
:ref:`Msgpack Serialization <MsgpackSerialization>`. Besides ordinary
33+
msgpack-safe Python values, this includes:
34+
35+
- CFrame-backed Blosc2 objects such as :class:`blosc2.NDArray`,
36+
:class:`blosc2.SChunk`, :class:`blosc2.VLArray`,
37+
:class:`blosc2.BatchStore`, and :class:`blosc2.EmbedStore`
38+
- structured references and lazy objects such as :class:`blosc2.Ref`,
39+
:class:`blosc2.C2Array`, :class:`blosc2.LazyExpr`, and
40+
:class:`blosc2.LazyUDF` backed by :func:`blosc2.dsl_kernel`
41+
42+
Lazy expressions and supported lazy UDFs still require durable operand
43+
references only; purely in-memory operands are intentionally rejected.
3044
"""
3145

3246
def __init__(self, schunk, urlpath, mode, mmap_mode, initial_mapping_size):
@@ -72,7 +86,7 @@ def getall(self):
7286
Return all the variable length metalayers as a dictionary
7387
7488
"""
75-
return super().to_dict()
89+
return {name: self[name] for name in self}
7690

7791
def __repr__(self):
7892
return repr(self.getall())

tests/test_batch_store.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,11 @@ def _make_persistent_python_lazyudf(tmp_path):
9494
return blosc2.lazyudf(_python_udf_add, (a, b), dtype=a.dtype, shape=a.shape)
9595

9696

97+
def _make_persistent_ref(tmp_path):
    """Persist a small int64 array under ``tmp_path``; return ``(Ref to it, its contents)``."""
    values = np.arange(5, dtype=np.int64)
    array = blosc2.asarray(values, urlpath=tmp_path / "a_ref.b2nd", mode="w")
    return blosc2.Ref.from_object(array), array[:]
100+
101+
97102
@pytest.mark.parametrize(
98103
("contiguous", "urlpath"),
99104
[
@@ -285,6 +290,16 @@ def test_msgpack_supports_c2array(monkeypatch):
285290
assert restored["remote"].auth_token is None
286291

287292

293+
def test_msgpack_supports_ref(tmp_path):
    """A ``Ref`` stored in a dict survives a msgpack round trip and still opens."""
    ref, expected = _make_persistent_ref(tmp_path)

    packed = msgpack_packb({"ref": ref})
    unpacked = msgpack_unpackb(packed)
    restored = unpacked["ref"]

    assert isinstance(restored, blosc2.Ref)
    assert restored == ref
    np.testing.assert_array_equal(restored.open()[:], expected)
301+
302+
288303
def test_batchstore_msgpack_supports_c2array(monkeypatch):
289304
c2array = _make_c2array(monkeypatch)
290305

tests/test_vlarray.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def _make_persistent_python_lazyudf(tmp_path):
9393
return blosc2.lazyudf(_python_udf_add, (a, b), dtype=a.dtype, shape=a.shape)
9494

9595

96+
def _make_persistent_ref(tmp_path):
    """Persist a small int64 array under ``tmp_path``; return ``(Ref to it, its contents)``."""
    values = np.arange(5, dtype=np.int64)
    array = blosc2.asarray(values, urlpath=tmp_path / "a_ref.b2nd", mode="w")
    return blosc2.Ref.from_object(array), array[:]
99+
100+
96101
@pytest.mark.parametrize(
97102
("contiguous", "urlpath"),
98103
[
@@ -231,6 +236,19 @@ def test_vlarray_msgpack_supports_c2array(monkeypatch):
231236
assert restored.auth_token is None
232237

233238

239+
def test_vlarray_msgpack_supports_ref(tmp_path):
    """A ``Ref`` appended to a ``VLArray`` reads back as an equal, openable ``Ref``."""
    ref, expected = _make_persistent_ref(tmp_path)

    store = blosc2.VLArray()
    store.append(ref)
    restored = store[0]

    assert isinstance(restored, blosc2.Ref)
    assert restored == ref
    np.testing.assert_array_equal(restored.open()[:], expected)
250+
251+
234252
def test_vlarray_msgpack_supports_lazyexpr(tmp_path):
235253
expr, expected = _make_persistent_lazyexpr(tmp_path)
236254

0 commit comments

Comments
 (0)