Skip to content

Commit 470fe8a

Browse files
committed
Use latest c-blosc2 sources to fix contiguous-frame b2nd resize growth
1 parent f0b928f commit 470fe8a

2 files changed

Lines changed: 132 additions & 1 deletion

File tree

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ else()
119119
include(FetchContent)
120120
FetchContent_Declare(blosc2
121121
GIT_REPOSITORY https://github.com/Blosc/c-blosc2
122-
GIT_TAG 8a1af55c510fac5f5f20a8b8d0853f4dfaf9e438 # variable-length chunks support in schunks
122+
GIT_TAG 9200990b189c8357e5517860cfa9ef09cb117eae
123123
# SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2
124124
)
125125
FetchContent_MakeAvailable(blosc2)

bench/ndarray/resize.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#######################################################################
7+
8+
from __future__ import annotations
9+
10+
import argparse
11+
import os
12+
import time
13+
14+
import numpy as np
15+
16+
import blosc2
17+
18+
19+
def parse_nitems(text: str) -> int:
    """Parse an item count such as ``"1000"``, ``"1K"``, ``"1.5m"`` or ``"2G"``.

    The optional one-letter suffix is case-insensitive and denotes a decimal
    multiplier (k = 10**3, m = 10**6, g = 10**9).  Surrounding whitespace is
    ignored.  When a suffix is present the numeric part may be fractional; a
    result that is still fractional after scaling is truncated toward zero.

    Raises:
        ValueError: if *text* is not a valid number (this is the exception
            type ``argparse`` turns into a clean usage error).
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from decimal import Decimal, InvalidOperation

    suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000}
    text = text.strip().lower()
    # Slicing (instead of indexing) keeps the empty string from raising here;
    # it falls through to int(""), which raises ValueError.
    multiplier = suffixes.get(text[-1:])
    if multiplier is None:
        return int(text)
    try:
        # Decimal arithmetic is exact for decimal literals.  Going through
        # float() truncated some inputs: float("1.005") * 1000 evaluates to
        # 1004.9999999999999, which int() turned into 1004.
        return int(Decimal(text[:-1]) * multiplier)
    except InvalidOperation as exc:
        raise ValueError(f"invalid item count: {text!r}") from exc
25+
26+
27+
def sizeof_path(path: str) -> int:
    """Return the size in bytes of *path*.

    For a regular file this is the file's own size.  For a directory it is
    the sum of the sizes of every file found while walking the tree rooted at
    *path*.
    """
    if not os.path.isdir(path):
        return os.path.getsize(path)
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
35+
36+
37+
def format_bytes(nbytes: int) -> str:
    """Format a byte count with binary (1024-based) units and two decimals.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit (TiB) is reached, so anything of 1024 TiB or more is still
    reported in TiB.  The comparison uses the magnitude, so negative values
    are scaled the same way as positive ones.
    """
    units = ("B", "KiB", "MiB", "GiB", "TiB")
    value = float(nbytes)
    # Only the units below the largest one need a range check; whatever is
    # left after the loop is expressed in the largest unit.
    for unit in units[:-1]:
        if abs(value) < 1024:
            return f"{value:.2f} {unit}"
        value /= 1024
    return f"{value:.2f} {units[-1]}"
45+
46+
47+
def pick_layout(nitems: int) -> tuple[tuple[int], tuple[int]]:
    """Choose 1-D chunk and block shapes for an array of *nitems* elements.

    The chunk length is *nitems* capped at 16_384, and the block length is
    the chunk length capped at 256.  Both are at least 1, even when *nitems*
    is zero or negative.

    Returns:
        A ``(chunks, blocks)`` pair of one-element tuples.
    """
    max_chunk_len, max_block_len = 16_384, 256
    chunk_len = max(1, min(nitems, max_chunk_len))
    block_len = max(1, min(chunk_len, max_block_len))
    return (chunk_len,), (block_len,)
51+
52+
53+
def create_extended_array(
    path: str, nitems: int, dtype: np.dtype, chunks: tuple[int], blocks: tuple[int], bsize: int
) -> blosc2.NDArray:
    """Build an array at *path* by growing it in batches of *bsize* items.

    Starts from an empty (length 0) array opened with ``mode="w"`` and, for
    each batch, resizes it to the batch's end and writes the consecutive
    integers ``lo..hi-1`` into the new region.  The last batch is shortened
    so the final length is exactly *nitems*.

    Returns:
        The resulting array, holding ``0..nitems-1``.
    """
    grown = blosc2.empty((0,), dtype=dtype, chunks=chunks, blocks=blocks, urlpath=path, mode="w")
    for lo in range(0, nitems, bsize):
        hi = lo + bsize
        if hi > nitems:
            # Final, partial batch: do not grow past the requested length.
            hi = nitems
        grown.resize((hi,))
        grown[lo:hi] = np.arange(lo, hi, dtype=dtype)
    return grown
62+
63+
64+
def create_full_array(path: str, data: np.ndarray, chunks: tuple[int], blocks: tuple[int]) -> blosc2.NDArray:
    """Build an array at *path* from *data* in a single ``blosc2.asarray`` call.

    Uses the given chunk/block layout and opens *path* with ``mode="w"``.
    """
    storage_options = {"chunks": chunks, "blocks": blocks, "urlpath": path, "mode": "w"}
    return blosc2.asarray(data, **storage_options)
66+
67+
68+
def time_random_access(array: blosc2.NDArray, indices: np.ndarray) -> tuple[float, int]:
69+
total = 0
70+
t0 = time.perf_counter_ns()
71+
for index in indices:
72+
total += int(array[int(index)])
73+
elapsed_ns = time.perf_counter_ns() - t0
74+
return elapsed_ns / len(indices) / 1_000_000, total
75+
76+
77+
def main() -> None:
    """Compare building an on-disk NDArray by batched resizes vs in one go.

    Parses the command line, builds the same ``0..nitems-1`` data twice (once
    by repeatedly resizing, once with a single call), then reports build
    time, on-disk size and mean random-access time for both arrays.

    Raises:
        RuntimeError: if the values read from the two arrays at the sampled
            positions do not add up to the same checksum.
    """
    parser = argparse.ArgumentParser(
        description="Compare resizing an on-disk NDArray in batches vs creating it in one go."
    )
    parser.add_argument("--nitems", type=parse_nitems, default=parse_nitems("1M"))
    parser.add_argument("--bsize", type=parse_nitems, default=parse_nitems("1K"))
    parser.add_argument("--samples", type=int, default=10_000)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--dtype", default="int64")
    parser.add_argument("--extended-path", default="resize-batched.b2nd")
    parser.add_argument("--full-path", default="resize-one-go.b2nd")
    args = parser.parse_args()

    # Reject non-positive counts up front.  Otherwise they only fail later,
    # after files have already been removed or created: a zero batch size is
    # an invalid range() step, zero items leaves nothing to sample from, and
    # zero samples makes the per-item average undefined.
    for option, value in (("--nitems", args.nitems), ("--bsize", args.bsize), ("--samples", args.samples)):
        if value < 1:
            parser.error(f"{option} must be a positive integer, got {value}")

    dtype = np.dtype(args.dtype)
    chunks, blocks = pick_layout(args.nitems)
    data = np.arange(args.nitems, dtype=dtype)
    # A seeded generator makes the sampled positions reproducible between runs.
    rng = np.random.default_rng(args.seed)
    indices = rng.integers(0, args.nitems, size=args.samples)

    # Clear both output paths before building.
    for path in (args.extended_path, args.full_path):
        blosc2.remove_urlpath(path)

    t0 = time.perf_counter()
    extended = create_extended_array(args.extended_path, args.nitems, dtype, chunks, blocks, args.bsize)
    extend_time = time.perf_counter() - t0

    # NOTE: the source data for the one-go build was generated above, outside
    # the timed region, whereas the batched build generates each batch inside
    # its timed region.
    t0 = time.perf_counter()
    full = create_full_array(args.full_path, data, chunks, blocks)
    full_time = time.perf_counter() - t0

    extended_size = sizeof_path(args.extended_path)
    full_size = sizeof_path(args.full_path)

    # time_random_access reports milliseconds per item.
    extended_access_ms, extended_checksum = time_random_access(extended, indices)
    full_access_ms, full_checksum = time_random_access(full, indices)

    print(f"nitems: {args.nitems:_}")
    print(f"dtype: {dtype}")
    print(f"chunks: {chunks}")
    print(f"blocks: {blocks}")
    print(f"batch size: {args.bsize:_}")
    print(f"resize build time: {extend_time:.3f} s")
    print(f"one-go build time: {full_time:.3f} s")
    print(f"resized array file size: {extended_size} bytes ({format_bytes(extended_size)})")
    print(f"one-go array file size: {full_size} bytes ({format_bytes(full_size)})")
    print(f"random access samples: {args.samples:_}")
    print(f"resized array random access: {extended_access_ms:.6f} ms/item")
    print(f"one-go array random access: {full_access_ms:.6f} ms/item")

    # Both arrays were filled with the same values, so reading the same
    # positions must give the same sum.
    if extended_checksum != full_checksum:
        raise RuntimeError("Random-access checksums differ between arrays")
128+
129+
130+
# Run the benchmark only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)