Skip to content

Commit 4da9547

Browse files
committed
Speed up exact row gathering for scattered index hits
1 parent d91d4f4 commit 4da9547

4 files changed

Lines changed: 141 additions & 108 deletions

File tree

bench/ndarray/index_query_bench.py

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,11 @@
2525
DEFAULT_REPEATS = 3
2626
KINDS = ("ultralight", "light", "medium", "full")
2727
DEFAULT_KIND = "light"
28-
DISTS = ("sorted", "block-shuffled", "permuted")
28+
DISTS = ("sorted", "block-shuffled", "permuted", "random")
2929
RNG_SEED = 0
3030
DEFAULT_OPLEVEL = 5
3131
FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load")
32+
DATASET_LAYOUT_VERSION = "payload-ramp-v1"
3233

3334
COLD_COLUMNS = [
3435
("rows", lambda result: f"{result['size']:,}"),
@@ -72,6 +73,11 @@ def source_dtype(id_dtype: np.dtype) -> np.dtype:
7273
return np.dtype([("id", np.dtype(id_dtype)), ("payload", np.float32)])
7374

7475

76+
def payload_slice(start: int, stop: int) -> np.ndarray:
77+
"""Deterministic nontrivial payload values for structured benchmark rows."""
78+
return np.arange(start, stop, dtype=np.float32)
79+
80+
7581
def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray:
7682
dtype = np.dtype(dtype)
7783
if dtype == np.dtype(np.bool_):
@@ -240,6 +246,12 @@ def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step:
240246
ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype)
241247

242248

249+
def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray:
250+
ids = make_ordered_ids(size, dtype)
251+
np.random.default_rng(RNG_SEED).shuffle(ids)
252+
return ids
253+
254+
243255
def build_persistent_array(
244256
size: int, dist: str, id_dtype: np.dtype, path: Path, chunks: int | None, blocks: int | None
245257
) -> blosc2.NDArray:
@@ -254,6 +266,7 @@ def build_persistent_array(
254266
block_len = int(arr.blocks[0])
255267
block_order = _block_order(size, block_len) if dist == "block-shuffled" else None
256268
permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0)
269+
random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None
257270
for start in range(0, size, chunk_len):
258271
stop = min(start + chunk_len, size)
259272
chunk = np.zeros(stop - start, dtype=dtype)
@@ -263,14 +276,20 @@ def build_persistent_array(
263276
_fill_block_shuffled_ids(chunk["id"], size, start, stop, block_len, block_order)
264277
elif dist == "permuted":
265278
_fill_permuted_ids(chunk["id"], size, start, stop, permuted_step, permuted_offset)
279+
elif dist == "random":
280+
chunk["id"] = random_ids[start:stop]
266281
else:
267282
raise ValueError(f"unsupported distribution {dist!r}")
283+
chunk["payload"] = payload_slice(start, stop)
268284
arr[start:stop] = chunk
269285
return arr
270286

271287

272288
def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> Path:
273-
return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.b2nd"
289+
return (
290+
size_dir
291+
/ f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.b2nd"
292+
)
274293

275294

276295
def indexed_array_path(
@@ -293,7 +312,7 @@ def indexed_array_path(
293312
thread_token = "threads-auto" if nthreads is None else f"threads-{nthreads}"
294313
return (
295314
size_dir
296-
/ f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}"
315+
/ f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}"
297316
f".{kind}.opt{optlevel}.{mode}.b2nd"
298317
)
299318

doc/getting_started/tutorials/14.indexing-arrays.ipynb

Lines changed: 52 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@
3232
"id": "8c510216bc394cf9",
3333
"metadata": {
3434
"ExecuteTime": {
35-
"end_time": "2026-04-09T03:17:03.228163Z",
36-
"start_time": "2026-04-09T03:17:03.209583Z"
35+
"end_time": "2026-04-09T04:14:37.432863Z",
36+
"start_time": "2026-04-09T04:14:37.110770Z"
3737
}
3838
},
3939
"source": [
@@ -93,7 +93,7 @@
9393
" blosc2.remove_urlpath(path)"
9494
],
9595
"outputs": [],
96-
"execution_count": 11
96+
"execution_count": 1
9797
},
9898
{
9999
"cell_type": "markdown",
@@ -119,8 +119,8 @@
119119
"id": "d1a5a37585a045ca",
120120
"metadata": {
121121
"ExecuteTime": {
122-
"end_time": "2026-04-09T03:17:19.644470Z",
123-
"start_time": "2026-04-09T03:17:05.289092Z"
122+
"end_time": "2026-04-09T04:14:51.428855Z",
123+
"start_time": "2026-04-09T04:14:37.433671Z"
124124
}
125125
},
126126
"source": [
@@ -129,19 +129,15 @@
129129
"\n",
130130
"rng = np.random.default_rng(0)\n",
131131
"dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n",
132-
"data = blosc2.zeros(N_ROWS, dtype=dtype)[:]\n",
133-
"# Build a predictable id column, then shuffle it so the source data is not already ordered.\n",
134-
"data[\"id\"] = blosc2.arange(-data.shape[0] // 2, data.shape[0] // 2, dtype=np.float64)\n",
135-
"rng.shuffle(data[\"id\"])\n",
136-
"data[\"payload\"] = blosc2.arange(data.shape[0], dtype=np.int32)\n",
132+
"ids = np.arange(-N_ROWS // 2, N_ROWS // 2, dtype=np.float64)\n",
133+
"rng.shuffle(ids)\n",
134+
"data = blosc2.fromiter(((id_, i) for i, id_ in enumerate(ids)), shape=(N_ROWS,), dtype=dtype)\n",
137135
"\n",
138136
"indexed_arrays = {}\n",
139137
"build_rows = []\n",
140-
"base_cbytes = None\n",
138+
"base_cbytes = data.cbytes\n",
141139
"for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n",
142-
" arr = blosc2.asarray(data)\n",
143-
" if base_cbytes is None:\n",
144-
" base_cbytes = arr.cbytes\n",
140+
" arr = data.copy()\n",
145141
" t0 = time.perf_counter()\n",
146142
" arr.create_index(field=\"id\", kind=kind)\n",
147143
" build_ms = (time.perf_counter() - t0) * 1e3\n",
@@ -161,14 +157,14 @@
161157
"text": [
162158
"Compressed base array size: 30.74 MiB\n",
163159
"kind build_ms index_size overhead\n",
164-
"ultralight 54.423 142 B 0.00x\n",
165-
"light 748.307 26.04 MiB 0.85x\n",
166-
"medium 2453.256 34.99 MiB 1.14x\n",
167-
"full 8727.665 28.44 MiB 0.93x\n"
160+
"ultralight 45.783 142 B 0.00x\n",
161+
"light 674.304 26.04 MiB 0.85x\n",
162+
"medium 2195.323 34.99 MiB 1.14x\n",
163+
"full 8483.835 28.44 MiB 0.93x\n"
168164
]
169165
}
170166
],
171-
"execution_count": 12
167+
"execution_count": 2
172168
},
173169
{
174170
"cell_type": "markdown",
@@ -185,8 +181,8 @@
185181
"id": "f1b3aaec965b42d6",
186182
"metadata": {
187183
"ExecuteTime": {
188-
"end_time": "2026-04-09T02:55:34.424905Z",
189-
"start_time": "2026-04-09T02:55:34.206800Z"
184+
"end_time": "2026-04-09T04:14:51.546053Z",
185+
"start_time": "2026-04-09T04:14:51.449229Z"
190186
}
191187
},
192188
"source": [
@@ -227,8 +223,8 @@
227223
"id": "c9e932b7561b4ff4",
228224
"metadata": {
229225
"ExecuteTime": {
230-
"end_time": "2026-04-09T02:55:38.336928Z",
231-
"start_time": "2026-04-09T02:55:34.432852Z"
226+
"end_time": "2026-04-09T04:14:53.105689Z",
227+
"start_time": "2026-04-09T04:14:51.548648Z"
232228
}
233229
},
234230
"source": [
@@ -258,10 +254,10 @@
258254
"text": [
259255
"Selective mask over 10,000,000 rows\n",
260256
"kind scan_ms index_ms speedup\n",
261-
"ultralight 161.539 174.253 0.93x\n",
262-
"light 197.265 23.717 8.32x\n",
263-
"medium 157.424 23.681 6.65x\n",
264-
"full 157.527 25.280 6.23x\n"
257+
"ultralight 70.429 67.914 1.04x\n",
258+
"light 68.560 5.011 13.68x\n",
259+
"medium 68.481 4.430 15.46x\n",
260+
"full 68.408 4.263 16.05x\n"
265261
]
266262
}
267263
],
@@ -282,8 +278,8 @@
282278
"id": "9ffcb0d8d06a4daa",
283279
"metadata": {
284280
"ExecuteTime": {
285-
"end_time": "2026-04-09T02:55:38.375308Z",
286-
"start_time": "2026-04-09T02:55:38.346312Z"
281+
"end_time": "2026-04-09T04:14:53.160261Z",
282+
"start_time": "2026-04-09T04:14:53.118529Z"
287283
}
288284
},
289285
"source": [
@@ -327,8 +323,8 @@
327323
"id": "7d337ce2f9fb4f32",
328324
"metadata": {
329325
"ExecuteTime": {
330-
"end_time": "2026-04-09T02:55:38.402494Z",
331-
"start_time": "2026-04-09T02:55:38.376197Z"
326+
"end_time": "2026-04-09T04:14:53.206514Z",
327+
"start_time": "2026-04-09T04:14:53.171092Z"
332328
}
333329
},
334330
"source": [
@@ -374,8 +370,8 @@
374370
"id": "0be5f512928f48db",
375371
"metadata": {
376372
"ExecuteTime": {
377-
"end_time": "2026-04-09T02:55:40.518254Z",
378-
"start_time": "2026-04-09T02:55:38.403529Z"
373+
"end_time": "2026-04-09T04:14:55.722443Z",
374+
"start_time": "2026-04-09T04:14:53.207978Z"
379375
}
380376
},
381377
"source": [
@@ -392,9 +388,19 @@
392388
"name": "stdout",
393389
"output_type": "stream",
394390
"text": [
395-
"persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n",
396-
"Reopened index count: 1\n",
397-
"Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n"
391+
"persistent medium: kind=medium, persistent=False, ooc=True, stale=False\n"
392+
]
393+
},
394+
{
395+
"ename": "FileNotFoundError",
396+
"evalue": "No such file or directory: indexing_tutorial_medium.b2nd",
397+
"output_type": "error",
398+
"traceback": [
399+
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
400+
"\u001B[31mFileNotFoundError\u001B[39m Traceback (most recent call last)",
401+
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[7]\u001B[39m\u001B[32m, line 5\u001B[39m\n\u001B[32m 2\u001B[39m persistent_descriptor = persistent_arr.create_index(field=\u001B[33m\"\u001B[39m\u001B[33mid\u001B[39m\u001B[33m\"\u001B[39m, kind=\u001B[33m\"\u001B[39m\u001B[33mmedium\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 3\u001B[39m show_index_summary(\u001B[33m\"\u001B[39m\u001B[33mpersistent medium\u001B[39m\u001B[33m\"\u001B[39m, persistent_descriptor)\n\u001B[32m----> \u001B[39m\u001B[32m5\u001B[39m reopened = \u001B[43mblosc2\u001B[49m\u001B[43m.\u001B[49m\u001B[43mopen\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpaths\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmode\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43ma\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 6\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mReopened index count: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(reopened.indexes)\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 7\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mPersisted sidecar path: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mreopened.indexes[\u001B[32m0\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mreduced\u001B[39m\u001B[33m'\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mvalues_path\u001B[39m\u001B[33m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n",
402+
"\u001B[36mFile \u001B[39m\u001B[32m~/blosc/python-blosc2/src/blosc2/schunk.py:1779\u001B[39m, in \u001B[36mopen\u001B[39m\u001B[34m(urlpath, mode, offset, **kwargs)\u001B[39m\n\u001B[32m 1776\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m special\n\u001B[32m 1778\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m os.path.exists(urlpath):\n\u001B[32m-> \u001B[39m\u001B[32m1779\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mFileNotFoundError\u001B[39;00m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mNo such file or directory: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00murlpath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 1781\u001B[39m _set_default_dparams(kwargs)\n\u001B[32m 1782\u001B[39m res = blosc2_ext.open(urlpath, mode, offset, **kwargs)\n",
403+
"\u001B[31mFileNotFoundError\u001B[39m: No such file or directory: indexing_tutorial_medium.b2nd"
398404
]
399405
}
400406
],
@@ -413,12 +419,7 @@
413419
{
414420
"cell_type": "code",
415421
"id": "11f0cd1b910b409a",
416-
"metadata": {
417-
"ExecuteTime": {
418-
"end_time": "2026-04-09T02:55:40.559125Z",
419-
"start_time": "2026-04-09T02:55:40.527487Z"
420-
}
421-
},
422+
"metadata": {},
422423
"source": [
423424
"mutable_arr = blosc2.arange(20, dtype=np.int64)\n",
424425
"mutable_arr.create_index(kind=\"full\")\n",
@@ -428,17 +429,8 @@
428429
"mutable_arr.rebuild_index()\n",
429430
"print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])"
430431
],
431-
"outputs": [
432-
{
433-
"name": "stdout",
434-
"output_type": "stream",
435-
"text": [
436-
"Stale after direct mutation: True\n",
437-
"Stale after rebuild: False\n"
438-
]
439-
}
440-
],
441-
"execution_count": 8
432+
"outputs": [],
433+
"execution_count": null
442434
},
443435
{
444436
"cell_type": "markdown",
@@ -462,12 +454,7 @@
462454
{
463455
"cell_type": "code",
464456
"id": "2e1a47a9cf7246e6",
465-
"metadata": {
466-
"ExecuteTime": {
467-
"end_time": "2026-04-09T02:55:41.338819Z",
468-
"start_time": "2026-04-09T02:55:40.559982Z"
469-
}
470-
},
457+
"metadata": {},
471458
"source": [
472459
"append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n",
473460
"base_rows = 200_000\n",
@@ -505,22 +492,8 @@
505492
"print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n",
506493
"print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")"
507494
],
508-
"outputs": [
509-
{
510-
"name": "stdout",
511-
"output_type": "stream",
512-
"text": [
513-
"Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n",
514-
"Pending runs: 40\n",
515-
"Median mask time before compaction: 3.514 ms\n",
516-
"After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n",
517-
"Pending runs: 0\n",
518-
"Median mask time after compaction: 0.638 ms\n",
519-
"Speedup after compaction: 5.51x\n"
520-
]
521-
}
522-
],
523-
"execution_count": 9
495+
"outputs": [],
496+
"execution_count": null
524497
},
525498
{
526499
"cell_type": "markdown",
@@ -540,31 +513,21 @@
540513
{
541514
"cell_type": "code",
542515
"id": "9833102355db4ec0",
543-
"metadata": {
544-
"ExecuteTime": {
545-
"end_time": "2026-04-09T02:55:41.360620Z",
546-
"start_time": "2026-04-09T02:55:41.350276Z"
547-
}
548-
},
516+
"metadata": {},
549517
"source": [
550518
"for path in paths:\n",
551519
" blosc2.remove_urlpath(path)"
552520
],
553521
"outputs": [],
554-
"execution_count": 10
522+
"execution_count": null
555523
},
556524
{
557525
"cell_type": "code",
558526
"id": "17489b2c3d2ac57",
559-
"metadata": {
560-
"ExecuteTime": {
561-
"end_time": "2026-04-09T02:55:41.380135Z",
562-
"start_time": "2026-04-09T02:55:41.368555Z"
563-
}
564-
},
527+
"metadata": {},
565528
"source": [],
566529
"outputs": [],
567-
"execution_count": 10
530+
"execution_count": null
568531
}
569532
],
570533
"metadata": {

0 commit comments

Comments
 (0)