Skip to content

Commit 8845a05

Browse files
Merge pull request #616 from Blosc/matmul-optim
Matmul optimizations (use specialized BLAS libraries in prefilters).
2 parents ade5b7b + 06c1541 commit 8845a05

11 files changed

Lines changed: 970 additions & 45 deletions

File tree

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ add_custom_command(
5050

5151
# ...and add it to the target
5252
Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
53+
target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c)
54+
target_include_directories(blosc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2)
55+
if(UNIX)
56+
target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS})
57+
endif()
5358
Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI)
5459

5560
# We need to link against NumPy
@@ -79,6 +84,9 @@ FetchContent_MakeAvailable(miniexpr)
7984

8085
# Link against miniexpr static library
8186
target_link_libraries(blosc2_ext PRIVATE miniexpr_static)
87+
if(APPLE)
88+
target_link_libraries(blosc2_ext PRIVATE "-framework Accelerate")
89+
endif()
8290

8391
target_compile_features(blosc2_ext PRIVATE c_std_11)
8492
target_compile_features(indexing_ext PRIVATE c_std_11)

README_DEVELOPERS.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,36 @@ If you want to run the network tests, you can use the following command:
9393
pytest -m "network"
9494
```
9595

96+
## Matmul backend discovery
97+
98+
The fast `blosc2.matmul` path uses platform-specific block kernels:
99+
100+
- macOS: `Accelerate`
101+
- Linux/Windows: runtime-discovered `cblas`
102+
- fallback: portable `naive` kernel
103+
104+
For the runtime `cblas` backend, `python-blosc2` probes the active Python/NumPy
105+
environment rather than linking to one BLAS vendor at build time. Discovery
106+
starts from NumPy's reported BLAS library directory when available, and then
107+
searches common library names in the active environment's `lib` directories.
108+
109+
On Linux the current candidates include `libcblas`, `libopenblas`,
110+
`libflexiblas`, `libblis`, `libmkl_rt`, and generic `libblas`. A candidate is
111+
accepted only if it loads successfully and exports both `cblas_sgemm` and
112+
`cblas_dgemm`. If no suitable provider is found, the fast path falls back to
113+
the `naive` kernel.
114+
115+
Useful runtime helpers:
116+
117+
- `blosc2.get_matmul_library()` reports the selected runtime library when available
118+
- `BLOSC_TRACE=1` logs candidate probing, rejection, selection, and backend fallback
119+
120+
Example:
121+
122+
```bash
123+
BLOSC_TRACE=1 python -c "import blosc2; print(blosc2.get_matmul_library())"
124+
```
125+
96126
## wasm32 / Pyodide developer workflow
97127

98128
For the local wasm32 workflow (uv + pyodide-build + cibuildwheel + test loop),

bench/ndarray/matmul_path_compare.py

Lines changed: 171 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@ def set_path_mode(mode: str) -> bool:
5555

5656

5757
def run_case(
58+
label: str,
5859
mode: str,
60+
block_backend: str,
61+
warmup: int,
5962
repeats: int,
6063
shape_a: tuple[int, ...],
6164
shape_b: tuple[int, ...],
@@ -73,8 +76,10 @@ def run_case(
7376
warnings.simplefilter("ignore", RuntimeWarning)
7477
expected = np.matmul(a_np, b_np)
7578
original_flag = set_path_mode(mode)
79+
original_block_backend = blosc2.blosc2_ext.get_matmul_block_backend()
7680
original_set_pref_matmul = blosc2.NDArray._set_pref_matmul
7781
selected_paths = []
82+
selected_block_backend = None
7883
times = []
7984
result = None
8085

@@ -83,7 +88,17 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
8388
return original_set_pref_matmul(self, inputs, fp_accuracy)
8489

8590
blosc2.NDArray._set_pref_matmul = wrapped_set_pref_matmul
91+
blosc2.blosc2_ext.set_matmul_block_backend(block_backend)
8692
try:
93+
selected_block_backend = blosc2.blosc2_ext.get_selected_matmul_block_backend()
94+
for _ in range(warmup):
95+
before = len(selected_paths)
96+
with warnings.catch_warnings():
97+
# NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
98+
warnings.simplefilter("ignore", RuntimeWarning)
99+
result = blosc2.matmul(a, b, chunks=chunks_out, blocks=blocks_out)
100+
if len(selected_paths) == before:
101+
selected_paths.append("chunked")
87102
for _ in range(repeats):
88103
before = len(selected_paths)
89104
t0 = time.perf_counter()
@@ -97,6 +112,7 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
97112
finally:
98113
blosc2.NDArray._set_pref_matmul = original_set_pref_matmul
99114
linalg.try_miniexpr = original_flag
115+
blosc2.blosc2_ext.set_matmul_block_backend(original_block_backend)
100116

101117
if result is None:
102118
raise RuntimeError("matmul did not produce a result")
@@ -106,32 +122,90 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
106122

107123
best = min(times)
108124
median = statistics.median(times)
125+
selected_path = selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed"
126+
reported_block_backend = selected_block_backend if selected_path != "chunked" else None
109127
return {
128+
"label": label,
110129
"mode": mode,
111130
"times_s": times,
112131
"best_s": best,
113132
"median_s": median,
114133
"gflops_best": expected_gflops(shape_a, shape_b, best),
115134
"gflops_median": expected_gflops(shape_a, shape_b, median),
116135
"correct": True,
136+
"configured_block_backend": block_backend,
137+
"selected_block_backend": reported_block_backend,
117138
"selected_paths": selected_paths,
118-
"selected_path": selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed",
139+
"selected_path": selected_path,
140+
}
141+
142+
143+
def run_numpy_case(
144+
warmup: int,
145+
repeats: int,
146+
shape_a: tuple[int, ...],
147+
shape_b: tuple[int, ...],
148+
dtype: np.dtype,
149+
chunks_a: tuple[int, ...] | None,
150+
chunks_b: tuple[int, ...] | None,
151+
blocks_a: tuple[int, ...] | None,
152+
blocks_b: tuple[int, ...] | None,
153+
):
154+
_, _, a_np, b_np = build_arrays(shape_a, shape_b, dtype, chunks_a, chunks_b, blocks_a, blocks_b)
155+
times = []
156+
result = None
157+
for _ in range(warmup):
158+
with warnings.catch_warnings():
159+
warnings.simplefilter("ignore", RuntimeWarning)
160+
result = np.matmul(a_np, b_np)
161+
for _ in range(repeats):
162+
t0 = time.perf_counter()
163+
with warnings.catch_warnings():
164+
warnings.simplefilter("ignore", RuntimeWarning)
165+
result = np.matmul(a_np, b_np)
166+
times.append(time.perf_counter() - t0)
167+
168+
if result is None:
169+
raise RuntimeError("numpy.matmul did not produce a result")
170+
171+
best = min(times)
172+
median = statistics.median(times)
173+
return {
174+
"label": "numpy",
175+
"mode": "numpy",
176+
"times_s": times,
177+
"best_s": best,
178+
"median_s": median,
179+
"gflops_best": expected_gflops(shape_a, shape_b, best),
180+
"gflops_median": expected_gflops(shape_a, shape_b, median),
181+
"correct": True,
182+
"configured_block_backend": None,
183+
"selected_block_backend": None,
184+
"selected_paths": ["numpy"] * repeats,
185+
"selected_path": "numpy",
119186
}
120187

121188

122189
def main() -> None:
123190
parser = argparse.ArgumentParser(description="Compare chunked and fast blosc2.matmul paths.")
124-
parser.add_argument("--shape-a", default="400,400", help="Comma-separated shape for A.")
125-
parser.add_argument("--shape-b", default="400,400", help="Comma-separated shape for B.")
191+
parser.add_argument("--shape-a", default="2000,2000", help="Comma-separated shape for A.")
192+
parser.add_argument("--shape-b", default="2000,2000", help="Comma-separated shape for B.")
126193
parser.add_argument("--dtype", default="float32", choices=["float32", "float64", "int32", "int64"])
127-
parser.add_argument("--chunks-a", default="200,200", help="Comma-separated chunk shape for A.")
128-
parser.add_argument("--chunks-b", default="200,200", help="Comma-separated chunk shape for B.")
194+
parser.add_argument("--chunks-a", default="500,500", help="Comma-separated chunk shape for A.")
195+
parser.add_argument("--chunks-b", default="500,500", help="Comma-separated chunk shape for B.")
129196
parser.add_argument("--blocks-a", default="100,100", help="Comma-separated block shape for A.")
130197
parser.add_argument("--blocks-b", default="100,100", help="Comma-separated block shape for B.")
131-
parser.add_argument("--chunks-out", default="200,200", help="Comma-separated chunk shape for output.")
198+
parser.add_argument("--chunks-out", default="500,500", help="Comma-separated chunk shape for output.")
132199
parser.add_argument("--blocks-out", default="100,100", help="Comma-separated block shape for output.")
133-
parser.add_argument("--repeats", type=int, default=250)
200+
parser.add_argument("--warmup", type=int, default=2)
201+
parser.add_argument("--repeats", type=int, default=3)
134202
parser.add_argument("--modes", nargs="+", default=["chunked", "fast", "auto"], choices=["chunked", "fast", "auto"])
203+
parser.add_argument(
204+
"--block-backend",
205+
default="auto",
206+
choices=["auto", "naive", "accelerate", "cblas"],
207+
help="Kernel backend for the fast matmul block path.",
208+
)
135209
parser.add_argument("--json", action="store_true", help="Emit full JSON instead of a compact text summary.")
136210
args = parser.parse_args()
137211

@@ -145,11 +219,27 @@ def main() -> None:
145219
blocks_out = parse_int_tuple(args.blocks_out) if args.blocks_out else None
146220
dtype = np.dtype(args.dtype)
147221

222+
print("Matmul path comparison")
223+
print(f" A shape: {shape_a}")
224+
print(f" B shape: {shape_b}")
225+
print(f" dtype: {dtype}")
226+
print(f" chunks A/B/out: {chunks_a} / {chunks_b} / {chunks_out}")
227+
print(f" blocks A/B/out: {blocks_a} / {blocks_b} / {blocks_out}")
228+
print(f" warmup: {args.warmup}")
229+
print(f" repeats: {args.repeats}")
230+
print(f" fast block backend: {args.block_backend}")
231+
print(f" matmul library: {blosc2.get_matmul_library()}")
232+
print()
233+
print("Results:")
234+
148235
results = []
149236
for mode in args.modes:
150237
results.append(
151238
run_case(
152239
mode,
240+
mode,
241+
args.block_backend,
242+
args.warmup,
153243
args.repeats,
154244
shape_a,
155245
shape_b,
@@ -163,6 +253,42 @@ def main() -> None:
163253
)
164254
)
165255

256+
if args.block_backend == "auto" and "fast" in args.modes:
257+
fast_naive = run_case(
258+
"fast-naive",
259+
"fast",
260+
"naive",
261+
args.warmup,
262+
args.repeats,
263+
shape_a,
264+
shape_b,
265+
dtype,
266+
chunks_a,
267+
chunks_b,
268+
blocks_a,
269+
blocks_b,
270+
chunks_out,
271+
blocks_out,
272+
)
273+
if fast_naive["selected_block_backend"] != next(
274+
item["selected_block_backend"] for item in results if item["mode"] == "fast"
275+
):
276+
results.append(fast_naive)
277+
278+
results.append(
279+
run_numpy_case(
280+
args.warmup,
281+
args.repeats,
282+
shape_a,
283+
shape_b,
284+
dtype,
285+
chunks_a,
286+
chunks_b,
287+
blocks_a,
288+
blocks_b,
289+
)
290+
)
291+
166292
summary = {
167293
"shape_a": shape_a,
168294
"shape_b": shape_b,
@@ -173,47 +299,55 @@ def main() -> None:
173299
"blocks_b": blocks_b,
174300
"chunks_out": chunks_out,
175301
"blocks_out": blocks_out,
302+
"block_backend": args.block_backend,
176303
"results": results,
177304
}
178305

179-
best_by_mode = {item["mode"]: item["best_s"] for item in results}
180-
if "chunked" in best_by_mode and "fast" in best_by_mode:
181-
summary["speedup_fast_vs_chunked"] = best_by_mode["chunked"] / best_by_mode["fast"]
306+
best_by_label = {item["label"]: item["best_s"] for item in results}
307+
if "chunked" in best_by_label and "fast" in best_by_label:
308+
summary["speedup_fast_vs_chunked"] = best_by_label["chunked"] / best_by_label["fast"]
309+
if "chunked" in best_by_label and "fast-naive" in best_by_label:
310+
summary["speedup_fast_naive_vs_chunked"] = best_by_label["chunked"] / best_by_label["fast-naive"]
311+
if "fast" in best_by_label and "fast-naive" in best_by_label:
312+
summary["speedup_fast_vs_fast_naive"] = best_by_label["fast-naive"] / best_by_label["fast"]
313+
if "numpy" in best_by_label and "fast" in best_by_label:
314+
summary["speedup_fast_vs_numpy"] = best_by_label["numpy"] / best_by_label["fast"]
315+
if "numpy" in best_by_label and "auto" in best_by_label:
316+
summary["speedup_auto_vs_numpy"] = best_by_label["numpy"] / best_by_label["auto"]
182317

183318
if args.json:
184319
print(json.dumps(summary, indent=2, sort_keys=True))
185320
return
186321

187-
print(
188-
"case",
189-
json.dumps(
190-
{
191-
"shape_a": shape_a,
192-
"shape_b": shape_b,
193-
"dtype": str(dtype),
194-
"chunks_out": chunks_out,
195-
"blocks_out": blocks_out,
196-
},
197-
sort_keys=True,
198-
),
199-
)
200-
for item in results:
322+
display_order = ["chunked", "fast-naive", "fast", "auto", "numpy"]
323+
ordered_results = sorted(results, key=lambda item: display_order.index(item["label"]) if item["label"] in display_order else len(display_order))
324+
325+
for item in ordered_results:
326+
gflops_best = "-" if item["gflops_best"] is None else f"{item['gflops_best']:.3f}"
327+
if item["label"] == "numpy":
328+
backend_info = f"library={blosc2.get_matmul_library()}"
329+
else:
330+
block_backend = item["selected_block_backend"] if item["selected_block_backend"] is not None else "-"
331+
backend_info = f"block_backend={block_backend}"
201332
print(
202-
"result",
203-
json.dumps(
204-
{
205-
"mode": item["mode"],
206-
"best_s": round(item["best_s"], 6),
207-
"median_s": round(item["median_s"], 6),
208-
"gflops_best": None if item["gflops_best"] is None else round(item["gflops_best"], 3),
209-
"correct": item["correct"],
210-
"selected_path": item["selected_path"],
211-
},
212-
sort_keys=True,
213-
),
333+
f"{item['label']:>10}: "
334+
f"best={item['best_s']:.6f}s "
335+
f"median={item['median_s']:.6f}s "
336+
f"gflops={gflops_best} "
337+
f"path={item['selected_path']} "
338+
f"{backend_info} "
339+
f"correct={item['correct']}"
214340
)
215341
if "speedup_fast_vs_chunked" in summary:
216-
print("speedup", json.dumps({"fast_vs_chunked": round(summary["speedup_fast_vs_chunked"], 3)}, sort_keys=True))
342+
print(f"Speedup fast vs chunked: {summary['speedup_fast_vs_chunked']:.3f}x")
343+
if "speedup_fast_naive_vs_chunked" in summary:
344+
print(f"Speedup fast-naive vs chunked: {summary['speedup_fast_naive_vs_chunked']:.3f}x")
345+
if "speedup_fast_vs_fast_naive" in summary:
346+
print(f"Speedup fast vs fast-naive: {summary['speedup_fast_vs_fast_naive']:.3f}x")
347+
if "speedup_fast_vs_numpy" in summary:
348+
print(f"Speedup fast vs numpy: {summary['speedup_fast_vs_numpy']:.3f}x")
349+
if "speedup_auto_vs_numpy" in summary:
350+
print(f"Speedup auto vs numpy: {summary['speedup_auto_vs_numpy']:.3f}x")
217351

218352

219353
if __name__ == "__main__":

doc/reference/additional_funcs.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Their result is typically a :ref:`LazyExpr` instance, which can be evaluated (wi
1616
clip
1717
contains
1818
endswith
19+
get_matmul_library
1920
imag
2021
lower
2122
real
@@ -30,6 +31,7 @@ Their result is typically a :ref:`LazyExpr` instance, which can be evaluated (wi
3031
.. autofunction:: blosc2.clip
3132
.. autofunction:: blosc2.contains
3233
.. autofunction:: blosc2.endswith
34+
.. autofunction:: blosc2.get_matmul_library
3335
.. autofunction:: blosc2.imag
3436
.. autofunction:: blosc2.lower
3537
.. autofunction:: blosc2.real

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ dependencies = [
3838
"msgpack",
3939
"numexpr>=2.14.1; platform_machine != 'wasm32'",
4040
"requests",
41+
"threadpoolctl; platform_machine != 'wasm32'",
4142
]
4243
version = "4.1.1.dev0"
4344
[project.entry-points."array_api"]

0 commit comments

Comments (0)