|
32 | 32 | "id": "8c510216bc394cf9", |
33 | 33 | "metadata": { |
34 | 34 | "ExecuteTime": { |
35 | | - "end_time": "2026-04-09T03:17:03.228163Z", |
36 | | - "start_time": "2026-04-09T03:17:03.209583Z" |
| 35 | + "end_time": "2026-04-09T04:14:37.432863Z", |
| 36 | + "start_time": "2026-04-09T04:14:37.110770Z" |
37 | 37 | } |
38 | 38 | }, |
39 | 39 | "source": [ |
|
93 | 93 | " blosc2.remove_urlpath(path)" |
94 | 94 | ], |
95 | 95 | "outputs": [], |
96 | | - "execution_count": 11 |
| 96 | + "execution_count": 1 |
97 | 97 | }, |
98 | 98 | { |
99 | 99 | "cell_type": "markdown", |
|
119 | 119 | "id": "d1a5a37585a045ca", |
120 | 120 | "metadata": { |
121 | 121 | "ExecuteTime": { |
122 | | - "end_time": "2026-04-09T03:17:19.644470Z", |
123 | | - "start_time": "2026-04-09T03:17:05.289092Z" |
| 122 | + "end_time": "2026-04-09T04:14:51.428855Z", |
| 123 | + "start_time": "2026-04-09T04:14:37.433671Z" |
124 | 124 | } |
125 | 125 | }, |
126 | 126 | "source": [ |
|
129 | 129 | "\n", |
130 | 130 | "rng = np.random.default_rng(0)\n", |
131 | 131 | "dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", |
132 | | - "data = blosc2.zeros(N_ROWS, dtype=dtype)[:]\n", |
133 | | - "# Build a predictable id column, then shuffle it so the source data is not already ordered.\n", |
134 | | - "data[\"id\"] = blosc2.arange(-data.shape[0] // 2, data.shape[0] // 2, dtype=np.float64)\n", |
135 | | - "rng.shuffle(data[\"id\"])\n", |
136 | | - "data[\"payload\"] = blosc2.arange(data.shape[0], dtype=np.int32)\n", |
| 132 | + "ids = np.arange(-N_ROWS // 2, N_ROWS // 2, dtype=np.float64)\n", |
| 133 | + "rng.shuffle(ids)\n", |
| 134 | + "data = blosc2.fromiter(((id_, i) for i, id_ in enumerate(ids)), shape=(N_ROWS,), dtype=dtype)\n", |
137 | 135 | "\n", |
138 | 136 | "indexed_arrays = {}\n", |
139 | 137 | "build_rows = []\n", |
140 | | - "base_cbytes = None\n", |
| 138 | + "base_cbytes = data.cbytes\n", |
141 | 139 | "for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n", |
142 | | - " arr = blosc2.asarray(data)\n", |
143 | | - " if base_cbytes is None:\n", |
144 | | - " base_cbytes = arr.cbytes\n", |
| 140 | + " arr = data.copy()\n", |
145 | 141 | " t0 = time.perf_counter()\n", |
146 | 142 | " arr.create_index(field=\"id\", kind=kind)\n", |
147 | 143 | " build_ms = (time.perf_counter() - t0) * 1e3\n", |
|
161 | 157 | "text": [ |
162 | 158 | "Compressed base array size: 30.74 MiB\n", |
163 | 159 | "kind build_ms index_size overhead\n", |
164 | | - "ultralight 54.423 142 B 0.00x\n", |
165 | | - "light 748.307 26.04 MiB 0.85x\n", |
166 | | - "medium 2453.256 34.99 MiB 1.14x\n", |
167 | | - "full 8727.665 28.44 MiB 0.93x\n" |
| 160 | + "ultralight 45.783 142 B 0.00x\n", |
| 161 | + "light 674.304 26.04 MiB 0.85x\n", |
| 162 | + "medium 2195.323 34.99 MiB 1.14x\n", |
| 163 | + "full 8483.835 28.44 MiB 0.93x\n" |
168 | 164 | ] |
169 | 165 | } |
170 | 166 | ], |
171 | | - "execution_count": 12 |
| 167 | + "execution_count": 2 |
172 | 168 | }, |
173 | 169 | { |
174 | 170 | "cell_type": "markdown", |
|
185 | 181 | "id": "f1b3aaec965b42d6", |
186 | 182 | "metadata": { |
187 | 183 | "ExecuteTime": { |
188 | | - "end_time": "2026-04-09T02:55:34.424905Z", |
189 | | - "start_time": "2026-04-09T02:55:34.206800Z" |
| 184 | + "end_time": "2026-04-09T04:14:51.546053Z", |
| 185 | + "start_time": "2026-04-09T04:14:51.449229Z" |
190 | 186 | } |
191 | 187 | }, |
192 | 188 | "source": [ |
|
227 | 223 | "id": "c9e932b7561b4ff4", |
228 | 224 | "metadata": { |
229 | 225 | "ExecuteTime": { |
230 | | - "end_time": "2026-04-09T02:55:38.336928Z", |
231 | | - "start_time": "2026-04-09T02:55:34.432852Z" |
| 226 | + "end_time": "2026-04-09T04:14:53.105689Z", |
| 227 | + "start_time": "2026-04-09T04:14:51.548648Z" |
232 | 228 | } |
233 | 229 | }, |
234 | 230 | "source": [ |
|
258 | 254 | "text": [ |
259 | 255 | "Selective mask over 10,000,000 rows\n", |
260 | 256 | "kind scan_ms index_ms speedup\n", |
261 | | - "ultralight 161.539 174.253 0.93x\n", |
262 | | - "light 197.265 23.717 8.32x\n", |
263 | | - "medium 157.424 23.681 6.65x\n", |
264 | | - "full 157.527 25.280 6.23x\n" |
| 257 | + "ultralight 70.429 67.914 1.04x\n", |
| 258 | + "light 68.560 5.011 13.68x\n", |
| 259 | + "medium 68.481 4.430 15.46x\n", |
| 260 | + "full 68.408 4.263 16.05x\n" |
265 | 261 | ] |
266 | 262 | } |
267 | 263 | ], |
|
282 | 278 | "id": "9ffcb0d8d06a4daa", |
283 | 279 | "metadata": { |
284 | 280 | "ExecuteTime": { |
285 | | - "end_time": "2026-04-09T02:55:38.375308Z", |
286 | | - "start_time": "2026-04-09T02:55:38.346312Z" |
| 281 | + "end_time": "2026-04-09T04:14:53.160261Z", |
| 282 | + "start_time": "2026-04-09T04:14:53.118529Z" |
287 | 283 | } |
288 | 284 | }, |
289 | 285 | "source": [ |
|
327 | 323 | "id": "7d337ce2f9fb4f32", |
328 | 324 | "metadata": { |
329 | 325 | "ExecuteTime": { |
330 | | - "end_time": "2026-04-09T02:55:38.402494Z", |
331 | | - "start_time": "2026-04-09T02:55:38.376197Z" |
| 326 | + "end_time": "2026-04-09T04:14:53.206514Z", |
| 327 | + "start_time": "2026-04-09T04:14:53.171092Z" |
332 | 328 | } |
333 | 329 | }, |
334 | 330 | "source": [ |
|
374 | 370 | "id": "0be5f512928f48db", |
375 | 371 | "metadata": { |
376 | 372 | "ExecuteTime": { |
377 | | - "end_time": "2026-04-09T02:55:40.518254Z", |
378 | | - "start_time": "2026-04-09T02:55:38.403529Z" |
| 373 | + "end_time": "2026-04-09T04:14:55.722443Z", |
| 374 | + "start_time": "2026-04-09T04:14:53.207978Z" |
379 | 375 | } |
380 | 376 | }, |
381 | 377 | "source": [ |
|
392 | 388 | "name": "stdout", |
393 | 389 | "output_type": "stream", |
394 | 390 | "text": [ |
395 | | - "persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n", |
396 | | - "Reopened index count: 1\n", |
397 | | - "Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n" |
| 391 | + "persistent medium: kind=medium, persistent=False, ooc=True, stale=False\n" |
| 392 | + ] |
| 393 | + }, |
| 394 | + { |
| 395 | + "ename": "FileNotFoundError", |
| 396 | + "evalue": "No such file or directory: indexing_tutorial_medium.b2nd", |
| 397 | + "output_type": "error", |
| 398 | + "traceback": [ |
| 399 | + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", |
| 400 | + "\u001B[31mFileNotFoundError\u001B[39m Traceback (most recent call last)", |
| 401 | + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[7]\u001B[39m\u001B[32m, line 5\u001B[39m\n\u001B[32m 2\u001B[39m persistent_descriptor = persistent_arr.create_index(field=\u001B[33m\"\u001B[39m\u001B[33mid\u001B[39m\u001B[33m\"\u001B[39m, kind=\u001B[33m\"\u001B[39m\u001B[33mmedium\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 3\u001B[39m show_index_summary(\u001B[33m\"\u001B[39m\u001B[33mpersistent medium\u001B[39m\u001B[33m\"\u001B[39m, persistent_descriptor)\n\u001B[32m----> \u001B[39m\u001B[32m5\u001B[39m reopened = \u001B[43mblosc2\u001B[49m\u001B[43m.\u001B[49m\u001B[43mopen\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpaths\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmode\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43ma\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 6\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mReopened index count: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(reopened.indexes)\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 7\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mPersisted sidecar path: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mreopened.indexes[\u001B[32m0\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mreduced\u001B[39m\u001B[33m'\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mvalues_path\u001B[39m\u001B[33m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n", |
| 402 | + "\u001B[36mFile \u001B[39m\u001B[32m~/blosc/python-blosc2/src/blosc2/schunk.py:1779\u001B[39m, in \u001B[36mopen\u001B[39m\u001B[34m(urlpath, mode, offset, **kwargs)\u001B[39m\n\u001B[32m 1776\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m special\n\u001B[32m 1778\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m os.path.exists(urlpath):\n\u001B[32m-> \u001B[39m\u001B[32m1779\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mFileNotFoundError\u001B[39;00m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mNo such file or directory: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00murlpath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 1781\u001B[39m _set_default_dparams(kwargs)\n\u001B[32m 1782\u001B[39m res = blosc2_ext.open(urlpath, mode, offset, **kwargs)\n", |
| 403 | + "\u001B[31mFileNotFoundError\u001B[39m: No such file or directory: indexing_tutorial_medium.b2nd" |
398 | 404 | ] |
399 | 405 | } |
400 | 406 | ], |
|
413 | 419 | { |
414 | 420 | "cell_type": "code", |
415 | 421 | "id": "11f0cd1b910b409a", |
416 | | - "metadata": { |
417 | | - "ExecuteTime": { |
418 | | - "end_time": "2026-04-09T02:55:40.559125Z", |
419 | | - "start_time": "2026-04-09T02:55:40.527487Z" |
420 | | - } |
421 | | - }, |
| 422 | + "metadata": {}, |
422 | 423 | "source": [ |
423 | 424 | "mutable_arr = blosc2.arange(20, dtype=np.int64)\n", |
424 | 425 | "mutable_arr.create_index(kind=\"full\")\n", |
|
428 | 429 | "mutable_arr.rebuild_index()\n", |
429 | 430 | "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" |
430 | 431 | ], |
431 | | - "outputs": [ |
432 | | - { |
433 | | - "name": "stdout", |
434 | | - "output_type": "stream", |
435 | | - "text": [ |
436 | | - "Stale after direct mutation: True\n", |
437 | | - "Stale after rebuild: False\n" |
438 | | - ] |
439 | | - } |
440 | | - ], |
441 | | - "execution_count": 8 |
| 432 | + "outputs": [], |
| 433 | + "execution_count": null |
442 | 434 | }, |
443 | 435 | { |
444 | 436 | "cell_type": "markdown", |
|
462 | 454 | { |
463 | 455 | "cell_type": "code", |
464 | 456 | "id": "2e1a47a9cf7246e6", |
465 | | - "metadata": { |
466 | | - "ExecuteTime": { |
467 | | - "end_time": "2026-04-09T02:55:41.338819Z", |
468 | | - "start_time": "2026-04-09T02:55:40.559982Z" |
469 | | - } |
470 | | - }, |
| 457 | + "metadata": {}, |
471 | 458 | "source": [ |
472 | 459 | "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", |
473 | 460 | "base_rows = 200_000\n", |
|
505 | 492 | "print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n", |
506 | 493 | "print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" |
507 | 494 | ], |
508 | | - "outputs": [ |
509 | | - { |
510 | | - "name": "stdout", |
511 | | - "output_type": "stream", |
512 | | - "text": [ |
513 | | - "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", |
514 | | - "Pending runs: 40\n", |
515 | | - "Median mask time before compaction: 3.514 ms\n", |
516 | | - "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", |
517 | | - "Pending runs: 0\n", |
518 | | - "Median mask time after compaction: 0.638 ms\n", |
519 | | - "Speedup after compaction: 5.51x\n" |
520 | | - ] |
521 | | - } |
522 | | - ], |
523 | | - "execution_count": 9 |
| 495 | + "outputs": [], |
| 496 | + "execution_count": null |
524 | 497 | }, |
525 | 498 | { |
526 | 499 | "cell_type": "markdown", |
|
540 | 513 | { |
541 | 514 | "cell_type": "code", |
542 | 515 | "id": "9833102355db4ec0", |
543 | | - "metadata": { |
544 | | - "ExecuteTime": { |
545 | | - "end_time": "2026-04-09T02:55:41.360620Z", |
546 | | - "start_time": "2026-04-09T02:55:41.350276Z" |
547 | | - } |
548 | | - }, |
| 516 | + "metadata": {}, |
549 | 517 | "source": [ |
550 | 518 | "for path in paths:\n", |
551 | 519 | " blosc2.remove_urlpath(path)" |
552 | 520 | ], |
553 | 521 | "outputs": [], |
554 | | - "execution_count": 10 |
| 522 | + "execution_count": null |
555 | 523 | }, |
556 | 524 | { |
557 | 525 | "cell_type": "code", |
558 | 526 | "id": "17489b2c3d2ac57", |
559 | | - "metadata": { |
560 | | - "ExecuteTime": { |
561 | | - "end_time": "2026-04-09T02:55:41.380135Z", |
562 | | - "start_time": "2026-04-09T02:55:41.368555Z" |
563 | | - } |
564 | | - }, |
| 527 | + "metadata": {}, |
565 | 528 | "source": [], |
566 | 529 | "outputs": [], |
567 | | - "execution_count": 10 |
| 530 | + "execution_count": null |
568 | 531 | } |
569 | 532 | ], |
570 | 533 | "metadata": { |
|
0 commit comments