Blosc
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 3 additions & 0 deletions
diff --git a/bench/ndarray/matmul_path_compare.py b/bench/ndarray/matmul_path_compare.py
Lines changed: 111 additions & 19 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py
Lines changed: 25 additions & 0 deletions
@@ -45,6 +45,9 @@ add_custom_command(
 Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
 target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c)
 target_include_directories(blosc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2)
+if(UNIX)
+  target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS})  # library providing dlopen()/dlclose(), where the platform needs one
+endif()
 
 # We need to link against NumPy
 target_link_libraries(blosc2_ext PRIVATE Python::NumPy)
 
@@ -58,6 +58,7 @@ def run_case(
     label: str,
     mode: str,
     block_backend: str,
+    warmup: int,
     repeats: int,
     shape_a: tuple[int, ...],
     shape_b: tuple[int, ...],
@@ -90,6 +91,14 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
     blosc2.blosc2_ext.set_matmul_block_backend(block_backend)
     try:
         selected_block_backend = blosc2.blosc2_ext.get_selected_matmul_block_backend()
+        for _ in range(warmup):
+            before = len(selected_paths)
+            with warnings.catch_warnings():
+                # NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
+                warnings.simplefilter("ignore", RuntimeWarning)
+                result = blosc2.matmul(a, b, chunks=chunks_out, blocks=blocks_out)
+            if len(selected_paths) == before:
+                selected_paths.append("chunked")
         for _ in range(repeats):
             before = len(selected_paths)
             t0 = time.perf_counter()
@@ -113,6 +122,8 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
 
     best = min(times)
     median = statistics.median(times)
+    selected_path = selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed"
+    reported_block_backend = selected_block_backend if selected_path != "chunked" else None
     return {
         "label": label,
         "mode": mode,
@@ -123,29 +134,76 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
         "gflops_median": expected_gflops(shape_a, shape_b, median),
         "correct": True,
         "configured_block_backend": block_backend,
-        "selected_block_backend": selected_block_backend,
+        "selected_block_backend": reported_block_backend,
         "selected_paths": selected_paths,
-        "selected_path": selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed",
+        "selected_path": selected_path,
+    }
+
+
+def run_numpy_case(  # Baseline case: time np.matmul on the last two arrays returned by build_arrays().
+    warmup: int,  # number of untimed np.matmul calls made before timing starts
+    repeats: int,  # number of timed np.matmul calls; 0 leaves `times` empty (see min() below)
+    shape_a: tuple[int, ...],
+    shape_b: tuple[int, ...],
+    dtype: np.dtype,
+    chunks_a: tuple[int, ...] | None,  # chunk/block shapes are only forwarded to build_arrays()
+    chunks_b: tuple[int, ...] | None,
+    blocks_a: tuple[int, ...] | None,
+    blocks_b: tuple[int, ...] | None,
+):
+    _, _, a_np, b_np = build_arrays(shape_a, shape_b, dtype, chunks_a, chunks_b, blocks_a, blocks_b)
+    times = []  # one wall-clock duration (seconds) per timed run
+    result = None
+    for _ in range(warmup):  # warm-up runs: executed but not recorded in `times`
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", RuntimeWarning)  # same RuntimeWarning filter as the blosc2.matmul runs
+            result = np.matmul(a_np, b_np)
+    for _ in range(repeats):
+        t0 = time.perf_counter()  # the timed span also covers the catch_warnings() setup/teardown
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", RuntimeWarning)
+            result = np.matmul(a_np, b_np)
+        times.append(time.perf_counter() - t0)
+
+    if result is None:  # `result` is still None only when neither loop ran (warmup == 0 and repeats == 0)
+        raise RuntimeError("numpy.matmul did not produce a result")
+
+    best = min(times)  # raises ValueError on an empty list, i.e. repeats == 0 with warmup > 0
+    median = statistics.median(times)
+    return {  # same keys as the result dict returned by run_case
+        "label": "numpy",
+        "mode": "numpy",
+        "times_s": times,
+        "best_s": best,
+        "median_s": median,
+        "gflops_best": expected_gflops(shape_a, shape_b, best),
+        "gflops_median": expected_gflops(shape_a, shape_b, median),
+        "correct": True,  # hard-coded: `result` is not compared against any reference here
+        "configured_block_backend": None,
+        "selected_block_backend": None,
+        "selected_paths": ["numpy"] * repeats,  # one entry per timed run; warm-up runs are not listed
+        "selected_path": "numpy",
     }
 
 
 def main() -> None:
     parser = argparse.ArgumentParser(description="Compare chunked and fast blosc2.matmul paths.")
-    parser.add_argument("--shape-a", default="400,400", help="Comma-separated shape for A.")
-    parser.add_argument("--shape-b", default="400,400", help="Comma-separated shape for B.")
+    parser.add_argument("--shape-a", default="2000,2000", help="Comma-separated shape for A.")
+    parser.add_argument("--shape-b", default="2000,2000", help="Comma-separated shape for B.")
     parser.add_argument("--dtype", default="float32", choices=["float32", "float64", "int32", "int64"])
-    parser.add_argument("--chunks-a", default="200,200", help="Comma-separated chunk shape for A.")
-    parser.add_argument("--chunks-b", default="200,200", help="Comma-separated chunk shape for B.")
+    parser.add_argument("--chunks-a", default="500,500", help="Comma-separated chunk shape for A.")
+    parser.add_argument("--chunks-b", default="500,500", help="Comma-separated chunk shape for B.")
     parser.add_argument("--blocks-a", default="100,100", help="Comma-separated block shape for A.")
     parser.add_argument("--blocks-b", default="100,100", help="Comma-separated block shape for B.")
-    parser.add_argument("--chunks-out", default="200,200", help="Comma-separated chunk shape for output.")
+    parser.add_argument("--chunks-out", default="500,500", help="Comma-separated chunk shape for output.")
     parser.add_argument("--blocks-out", default="100,100", help="Comma-separated block shape for output.")
-    parser.add_argument("--repeats", type=int, default=250)
+    parser.add_argument("--warmup", type=int, default=2)
+    parser.add_argument("--repeats", type=int, default=1)
     parser.add_argument("--modes", nargs="+", default=["chunked", "fast", "auto"], choices=["chunked", "fast", "auto"])
     parser.add_argument(
         "--block-backend",
         default="auto",
-        choices=["auto", "naive", "accelerate"],
+        choices=["auto", "naive", "accelerate", "cblas"],
         help="Kernel backend for the fast matmul block path.",
     )
     parser.add_argument("--json", action="store_true", help="Emit full JSON instead of a compact text summary.")
@@ -161,13 +219,27 @@ def main() -> None:
     blocks_out = parse_int_tuple(args.blocks_out) if args.blocks_out else None
     dtype = np.dtype(args.dtype)
 
+    print("Matmul path comparison")
+    print(f"  A shape: {shape_a}")
+    print(f"  B shape: {shape_b}")
+    print(f"  dtype: {dtype}")
+    print(f"  chunks A/B/out: {chunks_a} / {chunks_b} / {chunks_out}")
+    print(f"  blocks A/B/out: {blocks_a} / {blocks_b} / {blocks_out}")
+    print(f"  warmup: {args.warmup}")
+    print(f"  repeats: {args.repeats}")
+    print(f"  fast block backend: {args.block_backend}")
+    print(f"  matmul library: {blosc2.get_matmul_library()}")
+    print()
+    print("Results:")
+
     results = []
     for mode in args.modes:
         results.append(
             run_case(
                 mode,
                 mode,
                 args.block_backend,
+                args.warmup,
                 args.repeats,
                 shape_a,
                 shape_b,
@@ -186,6 +258,7 @@ def main() -> None:
             "fast-naive",
             "fast",
             "naive",
+            args.warmup,
             args.repeats,
             shape_a,
             shape_b,
@@ -202,6 +275,20 @@ def main() -> None:
         ):
             results.append(fast_naive)
 
+    results.append(
+        run_numpy_case(
+            args.warmup,
+            args.repeats,
+            shape_a,
+            shape_b,
+            dtype,
+            chunks_a,
+            chunks_b,
+            blocks_a,
+            blocks_b,
+        )
+    )
+
     summary = {
         "shape_a": shape_a,
         "shape_b": shape_b,
@@ -223,31 +310,32 @@ def main() -> None:
         summary["speedup_fast_naive_vs_chunked"] = best_by_label["chunked"] / best_by_label["fast-naive"]
     if "fast" in best_by_label and "fast-naive" in best_by_label:
         summary["speedup_fast_vs_fast_naive"] = best_by_label["fast-naive"] / best_by_label["fast"]
+    if "numpy" in best_by_label and "fast" in best_by_label:
+        summary["speedup_fast_vs_numpy"] = best_by_label["numpy"] / best_by_label["fast"]
+    if "numpy" in best_by_label and "auto" in best_by_label:
+        summary["speedup_auto_vs_numpy"] = best_by_label["numpy"] / best_by_label["auto"]
 
     if args.json:
         print(json.dumps(summary, indent=2, sort_keys=True))
         return
 
-    print("Matmul path comparison")
-    print(f"  A shape: {shape_a}")
-    print(f"  B shape: {shape_b}")
-    print(f"  dtype: {dtype}")
-    print(f"  chunks A/B/out: {chunks_a} / {chunks_b} / {chunks_out}")
-    print(f"  blocks A/B/out: {blocks_a} / {blocks_b} / {blocks_out}")
-    print(f"  repeats: {args.repeats}")
-    print(f"  fast block backend: {args.block_backend}")
-    display_order = ["chunked", "fast-naive", "fast", "auto"]
+    display_order = ["chunked", "fast-naive", "fast", "auto", "numpy"]
     ordered_results = sorted(results, key=lambda item: display_order.index(item["label"]) if item["label"] in display_order else len(display_order))
 
     for item in ordered_results:
         gflops_best = "-" if item["gflops_best"] is None else f"{item['gflops_best']:.3f}"
+        if item["label"] == "numpy":
+            backend_info = f"library={blosc2.get_matmul_library()}"
+        else:
+            block_backend = item["selected_block_backend"] if item["selected_block_backend"] is not None else "-"
+            backend_info = f"block_backend={block_backend}"
         print(
             f"{item['label']:>10}: "
             f"best={item['best_s']:.6f}s "
             f"median={item['median_s']:.6f}s "
             f"gflops={gflops_best} "
             f"path={item['selected_path']} "
-            f"block_backend={item['selected_block_backend']} "
+            f"{backend_info} "
             f"correct={item['correct']}"
         )
     if "speedup_fast_vs_chunked" in summary:
@@ -256,6 +344,10 @@ def main() -> None:
         print(f"Speedup fast-naive vs chunked: {summary['speedup_fast_naive_vs_chunked']:.3f}x")
     if "speedup_fast_vs_fast_naive" in summary:
         print(f"Speedup fast vs fast-naive: {summary['speedup_fast_vs_fast_naive']:.3f}x")
+    if "speedup_fast_vs_numpy" in summary:
+        print(f"Speedup fast vs numpy: {summary['speedup_fast_vs_numpy']:.3f}x")
+    if "speedup_auto_vs_numpy" in summary:
+        print(f"Speedup auto vs numpy: {summary['speedup_auto_vs_numpy']:.3f}x")
 
 
 if __name__ == "__main__":
 
@@ -38,6 +38,7 @@ dependencies = [
     "msgpack",
     "numexpr>=2.14.1; platform_machine != 'wasm32'",
     "requests",
+    "threadpoolctl; platform_machine != 'wasm32'",
 ]
 version = "4.1.1.dev0"
 [project.entry-points."array_api"]
 
@@ -87,6 +87,30 @@ def _configure_libtcc_runtime_path():
 """
 
 
+def get_matmul_library() -> str | None:
+    """
+    Return the library used by the active matmul fast backend, if any.
+
+    Returns
+    -------
+    str | None
+        ``"Accelerate.framework"`` when the selected backend is ``accelerate``;
+        the loaded CBLAS library, as reported by the extension, for ``cblas``;
+        ``None`` for any other backend or when the extension has no such getter.
+    """
+    from . import blosc2_ext
+
+    selected_backend = blosc2_ext.get_selected_matmul_block_backend()
+    if selected_backend == "accelerate":
+        return "Accelerate.framework"  # fixed label, not a value queried from the extension
+    if selected_backend == "cblas":
+        get_loaded_cblas = getattr(blosc2_ext, "get_loaded_matmul_cblas_library", None)
+        if get_loaded_cblas is None:  # the extension module does not expose the getter
+            return None
+        return get_loaded_cblas()
+    return None  # any other selected backend (e.g. "naive")
+
+
 class Codec(Enum):
     """
     Available codecs.
@@ -837,6 +861,7 @@ def _raise(exc):
     "get_compressor",
     "get_cpu_info",
     "get_expr_operands",
+    "get_matmul_library",
     "get_slice_nchunks",
     "greater",
     "greater_equal",
Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ dependencies = [`
`38`	`38`	`"msgpack",`
`39`	`39`	`"numexpr>=2.14.1; platform_machine != 'wasm32'",`
`40`	`40`	`"requests",`
	`41`	`+ "threadpoolctl; platform_machine != 'wasm32'",`
`41`	`42`	`]`
`42`	`43`	`version = "4.1.1.dev0"`
`43`	`44`	`[project.entry-points."array_api"]`