Skip to content

Commit c4b00d9

Browse files
Merge pull request #617 from Blosc/fromiter-improv
Rework `fromiter()` with direct chunked construction
2 parents 44f4883 + b25e044 commit c4b00d9

3 files changed

Lines changed: 595 additions & 26 deletions

File tree

bench/ndarray/fromiter.py

Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
#######################################################################
2+
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
3+
# All rights reserved.
4+
#
5+
# SPDX-License-Identifier: BSD-3-Clause
6+
#######################################################################
7+
8+
"""Benchmark for blosc2.fromiter() — Phase 3 performance baseline.
9+
10+
Covers the three Phase 3 tuning axes:
11+
12+
1. Chunk buffer allocation / reuse
13+
Varies chunk shapes for a fixed total array size to expose allocation
14+
overhead per chunk and the cost of many small vs. few large chunks.
15+
16+
2. Chunk traversal strategies
17+
Compares c_order=True (full in-memory buffer) vs c_order=False
18+
(streaming chunk-by-chunk) for the same multidimensional array.
19+
20+
3. On-disk vs. in-memory targets
21+
Runs each case with and without a urlpath so that I/O overhead can be
22+
separated from construction overhead.
23+
24+
Usage::
25+
26+
python bench/ndarray/fromiter.py # default: in-memory only
27+
python bench/ndarray/fromiter.py --on-disk # also run on-disk cases
28+
python bench/ndarray/fromiter.py --nreps 5 # more repetitions
29+
python bench/ndarray/fromiter.py --dtype float32
30+
python bench/ndarray/fromiter.py --help
31+
"""
32+
33+
from __future__ import annotations
34+
35+
import argparse
36+
import gc
37+
import math
38+
import os
39+
import shutil
40+
import time
41+
42+
import numpy as np
43+
44+
import blosc2
45+
46+
47+
# ---------------------------------------------------------------------------
48+
# Helpers
49+
# ---------------------------------------------------------------------------
50+
51+
def make_iterator(total: int, dtype: np.dtype):
    """Yield *total* values of *dtype*, cycling through 0..999.

    Implemented as a generator function so the returned iterable is
    strictly one-shot — this stress-tests the single-pass contract of
    the consuming ``fromiter()`` implementation.
    """
    cast = dtype.type  # hoist the attribute lookup out of the hot loop
    for idx in range(total):
        yield cast(idx % 1000)
56+
57+
58+
def measure(fn, nreps: int) -> tuple[float, float]:
    """Time *nreps* calls of *fn*; return (best, mean) wall-clock seconds.

    Garbage collection is forced before every repetition so that
    collector pauses are not charged to the measured call.
    """
    best = math.inf
    accumulated = 0.0
    for _ in range(nreps):
        gc.collect()
        start = time.perf_counter()
        fn()
        elapsed = time.perf_counter() - start
        best = min(best, elapsed)
        accumulated += elapsed
    return best, accumulated / nreps
67+
68+
69+
def array_info(a: "blosc2.NDArray") -> str:
    """Return a one-line summary of *a*: uncompressed size, cratio, storage size."""
    uncompressed = a.schunk.nbytes
    compressed = a.schunk.cbytes
    mib = 2**20
    return (
        f"{uncompressed / mib:8.1f} MB uncompressed "
        f"cratio {uncompressed / compressed:4.1f}x "
        f"({compressed / mib:.1f} MB on storage)"
    )
77+
78+
79+
def print_result(label: str, best: float, mean: float, nbytes: int) -> None:
    """Print one aligned result line; throughput is derived from *best*."""
    throughput = (nbytes / 2**30) / best  # GB/s based on the fastest repetition
    line = (
        f" {label:<45s} best {best:.3f}s ({throughput:.2f} GB/s)"
        f" mean {mean:.3f}s"
    )
    print(line)
85+
86+
87+
def cleanup(urlpath: str | None) -> None:
88+
if urlpath is None:
89+
return
90+
if os.path.isdir(urlpath):
91+
shutil.rmtree(urlpath)
92+
elif os.path.exists(urlpath):
93+
os.remove(urlpath)
94+
95+
96+
# ---------------------------------------------------------------------------
97+
# Benchmark sections
98+
# ---------------------------------------------------------------------------
99+
100+
def bench_chunk_sizes(dtype: np.dtype, nreps: int, on_disk: bool) -> None:
    """
    Section 1 — Chunk buffer allocation / reuse (optimisation A).

    Fixed total size, varying chunk shapes. Exposes per-chunk allocation
    overhead: many tiny chunks vs. a few large chunks, and shows the impact
    of the page buffer on c_order=False.
    """
    print("\n" + "=" * 70)
    print("Section 1 — Chunk buffer allocation / reuse (opt A: page buffer)")
    print(f" Fixed shape (1000, 1000), dtype={dtype}, nreps={nreps}")
    print("=" * 70)

    # Total element count and raw byte size are fixed; only the chunk
    # geometry varies across the configurations below.
    shape = (1000, 1000)
    total = math.prod(shape)
    nbytes = total * dtype.itemsize

    chunk_configs = [
        # (chunks, blocks, label)
        ((10, 10), (5, 5), "chunks=(10,10) — many tiny"),
        ((50, 50), (25, 25), "chunks=(50,50) — medium"),
        ((100, 100), (50, 50), "chunks=(100,100) — medium-large"),
        ((200, 200), (100, 100), "chunks=(200,200) — medium-large"),
        ((500, 500), (250, 250), "chunks=(500,500) — very large"),
        ((1000, 100), (500, 50), "chunks=(1000,100) — full-row strip"),
        ((1000, 1000),(500, 500), "chunks=shape — single chunk"),
    ]

    for order_label, c_order in (("c_order=True ", True), ("c_order=False", False)):
        print(f"\n {order_label}")
        for chunks, blocks, clabel in chunk_configs:
            urlpath = "fromiter_bench.b2nd" if on_disk else None

            # Loop variables are bound as default arguments so each closure
            # captures its own configuration (avoids late-binding pitfall).
            def run(c=chunks, b=blocks, u=urlpath, co=c_order):
                # Remove any leftover artifact first so every repetition
                # rebuilds from scratch.
                cleanup(u)
                # NOTE(review): mode=None presumably selects blosc2's
                # in-memory default — confirm against blosc2.fromiter docs.
                blosc2.fromiter(
                    make_iterator(total, dtype),
                    shape=shape, dtype=dtype,
                    chunks=c, blocks=b,
                    c_order=co,
                    urlpath=u, mode="w" if u else None,
                )

            best, mean = measure(run, nreps)
            cleanup(urlpath)
            disk_tag = " [disk]" if on_disk else ""
            print_result(f"{clabel}{disk_tag}", best, mean, nbytes)
147+
148+
149+
def bench_corder(dtype: np.dtype, nreps: int, on_disk: bool) -> None:
    """
    Section 2 — Chunk traversal strategies: c_order=True vs c_order=False.

    Runs the same shapes/chunk configs with both orderings so that the
    trade-off between in-memory buffering and streaming chunk fill is visible.
    """
    print("\n" + "=" * 70)
    print("Section 2 — Chunk traversal: c_order=True vs c_order=False")
    print(f" dtype={dtype}, nreps={nreps}")
    print("=" * 70)

    cases = [
        # (shape, chunks, blocks, label) — 2-D, 3-D and 4-D cases so the
        # ordering trade-off is visible across dimensionalities.
        ((500, 500), (50, 50), (25, 25), "2-D (500,500) chunks=(50,50)"),
        ((200, 200, 200), (20, 20, 20),(10, 10, 10),"3-D (200,200,200) chunks=(20,20,20)"),
        ((50, 50, 50, 50), (10, 10, 10, 10),(5,5,5,5),"4-D (50,50,50,50) chunks=(10,10,10,10)"),
    ]

    for shape, chunks, blocks, label in cases:
        total = math.prod(shape)
        nbytes = total * dtype.itemsize
        print(f"\n {label} [{nbytes / 2**20:.1f} MB]")

        for order_label, c_order in (("c_order=True ", True), ("c_order=False", False)):
            for disk_label, use_disk in (("in-memory", False), ("on-disk ", True)):
                # On-disk cases only run when explicitly requested.
                if use_disk and not on_disk:
                    continue
                urlpath = "fromiter_bench.b2nd" if use_disk else None

                # Defaults bind the current loop values into the closure.
                def run(s=shape, c=chunks, b=blocks, u=urlpath, co=c_order):
                    cleanup(u)
                    blosc2.fromiter(
                        make_iterator(total, dtype),
                        shape=s, dtype=dtype,
                        chunks=c, blocks=b,
                        c_order=co,
                        urlpath=u, mode="w" if u else None,
                    )

                best, mean = measure(run, nreps)
                cleanup(urlpath)
                print_result(f" {order_label} {disk_label}", best, mean, nbytes)
192+
193+
194+
def bench_ondisk_vs_memory(dtype: np.dtype, nreps: int) -> None:
    """
    Section 3 — On-disk vs. in-memory targets.

    Side-by-side comparison for a large-ish array so that I/O overhead
    is clearly separated from construction cost.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("Section 3 — On-disk vs. in-memory")
    print(f" dtype={dtype}, nreps={nreps}")
    print(banner)

    shape = (2000, 2000)
    chunks = (200, 200)
    blocks = (100, 100)
    total = math.prod(shape)
    nbytes = total * dtype.itemsize
    print(f" shape={shape} chunks={chunks} [{nbytes / 2**20:.1f} MB]")

    targets = (("in-memory", None), ("on-disk ", "fromiter_bench.b2nd"))
    for order_label, c_order in (("c_order=True ", True), ("c_order=False", False)):
        print(f"\n {order_label}")
        for disk_label, urlpath in targets:

            # Default args bind the current target/ordering into the closure.
            def construct(u=urlpath, co=c_order):
                # Drop any stale artifact so each repetition builds anew.
                cleanup(u)
                return blosc2.fromiter(
                    make_iterator(total, dtype),
                    shape=shape, dtype=dtype,
                    chunks=chunks, blocks=blocks,
                    c_order=co,
                    urlpath=u, mode="w" if u else None,
                )

            best, mean = measure(construct, nreps)
            cleanup(urlpath)
            print_result(f" {disk_label}", best, mean, nbytes)
231+
232+
233+
def bench_large(dtype: np.dtype, nreps: int, on_disk: bool) -> None:
    """
    Bonus — large array for headline throughput numbers.

    Includes the numpy fast path (optimisation C) when the iterable is
    already a numpy array, which completely bypasses Python iteration.
    """
    print("\n" + "=" * 70)
    print("Bonus — Large array headline throughput (opt C: numpy fast path)")
    print(f" dtype={dtype}, nreps={nreps}")
    print("=" * 70)

    shape = (5000, 5000)
    chunks = (500, 500)
    blocks = (250, 250)
    total = math.prod(shape)
    nbytes = total * dtype.itemsize
    print(f" shape={shape} [{nbytes / 2**20:.0f} MB]")

    # NumPy baseline (pure Python generator): what fromiter costs without
    # any chunked construction at all.
    def np_run():
        np.fromiter(make_iterator(total, dtype), dtype=dtype, count=total).reshape(shape)

    best, mean = measure(np_run, nreps)
    print_result(" NumPy fromiter+reshape (generator baseline)", best, mean, nbytes)

    # blosc2 with generator input: both traversal orders, optionally on disk.
    for order_label, c_order in (("c_order=True ", True), ("c_order=False", False)):
        for disk_label, use_disk in (("in-memory", False), ("on-disk ", True)):
            if use_disk and not on_disk:
                continue
            urlpath = "fromiter_bench_large.b2nd" if use_disk else None

            # Defaults bind the current loop values into the closure.
            def run(s=shape, c=chunks, b=blocks, u=urlpath, co=c_order):
                cleanup(u)
                blosc2.fromiter(
                    make_iterator(total, dtype),
                    shape=s, dtype=dtype,
                    chunks=c, blocks=b,
                    c_order=co,
                    urlpath=u, mode="w" if u else None,
                )

            best, mean = measure(run, nreps)
            cleanup(urlpath)
            print_result(f" blosc2 generator {order_label} {disk_label}", best, mean, nbytes)

    # Optimisation C: numpy fast path — iterable is already an ndarray.
    # The source array is materialized once, outside the timed region.
    print()
    src = np.fromiter(make_iterator(total, dtype), dtype=dtype, count=total).reshape(shape)
    for disk_label, use_disk in (("in-memory", False), ("on-disk ", True)):
        if use_disk and not on_disk:
            continue
        urlpath = "fromiter_bench_large.b2nd" if use_disk else None

        def run_np(s=shape, c=chunks, b=blocks, u=urlpath, arr=src):
            cleanup(u)
            blosc2.fromiter(arr, shape=s, dtype=dtype, chunks=c, blocks=b,
                            urlpath=u, mode="w" if u else None)

        best, mean = measure(run_np, nreps)
        cleanup(urlpath)
        print_result(f" blosc2 ndarray fast path {disk_label}", best, mean, nbytes)
296+
297+
298+
# ---------------------------------------------------------------------------
299+
# CLI
300+
# ---------------------------------------------------------------------------
301+
302+
def parse_args() -> argparse.Namespace:
    """Parse the benchmark command line.

    Returns:
        argparse.Namespace with attributes ``dtype`` (str), ``nreps`` (int),
        ``on_disk`` (bool) and ``section`` (int; 0 = run everything).
    """
    p = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    p.add_argument("--dtype", default="float64", help="NumPy dtype (default: float64)")
    p.add_argument("--nreps", type=int, default=3, help="Repetitions per measurement (default: 3)")
    p.add_argument(
        "--on-disk",
        action="store_true",
        default=False,
        help="Also run on-disk cases (writes temporary .b2nd files)",
    )
    # Fix: restrict --section to the valid values so an out-of-range number
    # yields a clear argparse error instead of a KeyError inside main().
    p.add_argument("--section", type=int, default=0, choices=range(5),
                   help="Run only section N (1-3 + bonus=4); 0 = all (default: 0)")
    return p.parse_args()
317+
318+
319+
def main() -> None:
    """Entry point: parse the CLI, print a header, dispatch the section(s)."""
    args = parse_args()
    dtype = np.dtype(args.dtype)
    nreps = args.nreps
    on_disk = args.on_disk

    print(f"\nblosc2.fromiter() benchmark — dtype={dtype} nreps={nreps} on_disk={on_disk}")
    print(f"blosc2 version: {blosc2.__version__}")

    def section3():
        # Section 3 is disk-centric, so it only makes sense with --on-disk.
        if on_disk:
            bench_ondisk_vs_memory(dtype, nreps)
        else:
            print("\nSection 3 skipped (use --on-disk to enable)")

    sections = {
        1: lambda: bench_chunk_sizes(dtype, nreps, on_disk),
        2: lambda: bench_corder(dtype, nreps, on_disk),
        3: section3,
        4: lambda: bench_large(dtype, nreps, on_disk),
    }

    # Section 0 means "run everything", in insertion order 1..4.
    selected = sections.values() if args.section == 0 else (sections[args.section],)
    for fn in selected:
        fn()

    print()
342+
343+
print()
344+
345+
346+
# Script entry point: run the benchmark suite (or a single --section).
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)