Skip to content

Commit daa568f

Browse files
committed
Add Linux runtime CBLAS matmul backend, threading policy, and diagnostics
Implement a runtime-discovered CBLAS backend for the matmul fast path on Linux/Windows, alongside the existing Accelerate/macOS path and naive fallback. Probe BLAS candidates from the active NumPy/conda environment, load providers exporting cblas_sgemm/cblas_dgemm, and fall back cleanly to naive when none fit. Control nested BLAS threading from Python with threadpoolctl around fast blosc2.matmul calls, but only for Linux CBLAS and only for small blocks. Use a benchmark-derived threshold of 192x192 to keep BLAS single-threaded for small GEMMs while avoiding regressions on larger ones; never apply this on macOS. Expose backend introspection via blosc2.get_matmul_library(), returning the loaded CBLAS library path or Accelerate.framework when available. Add BLOSC_TRACE diagnostics for CBLAS candidate probing, rejection, selection, and backend fallback decisions. Extend matmul benchmarks to report the active matmul library, compare against plain NumPy matmul, support warmup iterations, and use larger default problem sizes for steadier out-of-the-box results. Add tests covering backend selection, threadpoolctl usage/skips, threshold behavior, Darwin scoping, and matmul-library introspection; add threadpoolctl as a regular non-wasm dependency.
1 parent 8c0890f commit daa568f

9 files changed

Lines changed: 643 additions & 28 deletions

File tree

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ add_custom_command(
4545
Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI)
4646
target_sources(blosc2_ext PRIVATE src/blosc2/matmul_kernels.c)
4747
target_include_directories(blosc2_ext PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2)
48+
if(UNIX)
49+
target_link_libraries(blosc2_ext PRIVATE ${CMAKE_DL_LIBS})
50+
endif()
4851

4952
# We need to link against NumPy
5053
target_link_libraries(blosc2_ext PRIVATE Python::NumPy)

bench/ndarray/matmul_path_compare.py

Lines changed: 111 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def run_case(
5858
label: str,
5959
mode: str,
6060
block_backend: str,
61+
warmup: int,
6162
repeats: int,
6263
shape_a: tuple[int, ...],
6364
shape_b: tuple[int, ...],
@@ -90,6 +91,14 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
9091
blosc2.blosc2_ext.set_matmul_block_backend(block_backend)
9192
try:
9293
selected_block_backend = blosc2.blosc2_ext.get_selected_matmul_block_backend()
94+
for _ in range(warmup):
95+
before = len(selected_paths)
96+
with warnings.catch_warnings():
97+
# NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
98+
warnings.simplefilter("ignore", RuntimeWarning)
99+
result = blosc2.matmul(a, b, chunks=chunks_out, blocks=blocks_out)
100+
if len(selected_paths) == before:
101+
selected_paths.append("chunked")
93102
for _ in range(repeats):
94103
before = len(selected_paths)
95104
t0 = time.perf_counter()
@@ -113,6 +122,8 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
113122

114123
best = min(times)
115124
median = statistics.median(times)
125+
selected_path = selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed"
126+
reported_block_backend = selected_block_backend if selected_path != "chunked" else None
116127
return {
117128
"label": label,
118129
"mode": mode,
@@ -123,29 +134,76 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
123134
"gflops_median": expected_gflops(shape_a, shape_b, median),
124135
"correct": True,
125136
"configured_block_backend": block_backend,
126-
"selected_block_backend": selected_block_backend,
137+
"selected_block_backend": reported_block_backend,
127138
"selected_paths": selected_paths,
128-
"selected_path": selected_paths[0] if selected_paths and len(set(selected_paths)) == 1 else "mixed",
139+
"selected_path": selected_path,
140+
}
141+
142+
143+
def run_numpy_case(
144+
warmup: int,
145+
repeats: int,
146+
shape_a: tuple[int, ...],
147+
shape_b: tuple[int, ...],
148+
dtype: np.dtype,
149+
chunks_a: tuple[int, ...] | None,
150+
chunks_b: tuple[int, ...] | None,
151+
blocks_a: tuple[int, ...] | None,
152+
blocks_b: tuple[int, ...] | None,
153+
):
154+
_, _, a_np, b_np = build_arrays(shape_a, shape_b, dtype, chunks_a, chunks_b, blocks_a, blocks_b)
155+
times = []
156+
result = None
157+
for _ in range(warmup):
158+
with warnings.catch_warnings():
159+
warnings.simplefilter("ignore", RuntimeWarning)
160+
result = np.matmul(a_np, b_np)
161+
for _ in range(repeats):
162+
t0 = time.perf_counter()
163+
with warnings.catch_warnings():
164+
warnings.simplefilter("ignore", RuntimeWarning)
165+
result = np.matmul(a_np, b_np)
166+
times.append(time.perf_counter() - t0)
167+
168+
if result is None:
169+
raise RuntimeError("numpy.matmul did not produce a result")
170+
171+
best = min(times)
172+
median = statistics.median(times)
173+
return {
174+
"label": "numpy",
175+
"mode": "numpy",
176+
"times_s": times,
177+
"best_s": best,
178+
"median_s": median,
179+
"gflops_best": expected_gflops(shape_a, shape_b, best),
180+
"gflops_median": expected_gflops(shape_a, shape_b, median),
181+
"correct": True,
182+
"configured_block_backend": None,
183+
"selected_block_backend": None,
184+
"selected_paths": ["numpy"] * repeats,
185+
"selected_path": "numpy",
129186
}
130187

131188

132189
def main() -> None:
133190
parser = argparse.ArgumentParser(description="Compare chunked and fast blosc2.matmul paths.")
134-
parser.add_argument("--shape-a", default="400,400", help="Comma-separated shape for A.")
135-
parser.add_argument("--shape-b", default="400,400", help="Comma-separated shape for B.")
191+
parser.add_argument("--shape-a", default="2000,2000", help="Comma-separated shape for A.")
192+
parser.add_argument("--shape-b", default="2000,2000", help="Comma-separated shape for B.")
136193
parser.add_argument("--dtype", default="float32", choices=["float32", "float64", "int32", "int64"])
137-
parser.add_argument("--chunks-a", default="200,200", help="Comma-separated chunk shape for A.")
138-
parser.add_argument("--chunks-b", default="200,200", help="Comma-separated chunk shape for B.")
194+
parser.add_argument("--chunks-a", default="500,500", help="Comma-separated chunk shape for A.")
195+
parser.add_argument("--chunks-b", default="500,500", help="Comma-separated chunk shape for B.")
139196
parser.add_argument("--blocks-a", default="100,100", help="Comma-separated block shape for A.")
140197
parser.add_argument("--blocks-b", default="100,100", help="Comma-separated block shape for B.")
141-
parser.add_argument("--chunks-out", default="200,200", help="Comma-separated chunk shape for output.")
198+
parser.add_argument("--chunks-out", default="500,500", help="Comma-separated chunk shape for output.")
142199
parser.add_argument("--blocks-out", default="100,100", help="Comma-separated block shape for output.")
143-
parser.add_argument("--repeats", type=int, default=250)
200+
parser.add_argument("--warmup", type=int, default=2)
201+
parser.add_argument("--repeats", type=int, default=1)
144202
parser.add_argument("--modes", nargs="+", default=["chunked", "fast", "auto"], choices=["chunked", "fast", "auto"])
145203
parser.add_argument(
146204
"--block-backend",
147205
default="auto",
148-
choices=["auto", "naive", "accelerate"],
206+
choices=["auto", "naive", "accelerate", "cblas"],
149207
help="Kernel backend for the fast matmul block path.",
150208
)
151209
parser.add_argument("--json", action="store_true", help="Emit full JSON instead of a compact text summary.")
@@ -161,13 +219,27 @@ def main() -> None:
161219
blocks_out = parse_int_tuple(args.blocks_out) if args.blocks_out else None
162220
dtype = np.dtype(args.dtype)
163221

222+
print("Matmul path comparison")
223+
print(f" A shape: {shape_a}")
224+
print(f" B shape: {shape_b}")
225+
print(f" dtype: {dtype}")
226+
print(f" chunks A/B/out: {chunks_a} / {chunks_b} / {chunks_out}")
227+
print(f" blocks A/B/out: {blocks_a} / {blocks_b} / {blocks_out}")
228+
print(f" warmup: {args.warmup}")
229+
print(f" repeats: {args.repeats}")
230+
print(f" fast block backend: {args.block_backend}")
231+
print(f" matmul library: {blosc2.get_matmul_library()}")
232+
print()
233+
print("Results:")
234+
164235
results = []
165236
for mode in args.modes:
166237
results.append(
167238
run_case(
168239
mode,
169240
mode,
170241
args.block_backend,
242+
args.warmup,
171243
args.repeats,
172244
shape_a,
173245
shape_b,
@@ -186,6 +258,7 @@ def main() -> None:
186258
"fast-naive",
187259
"fast",
188260
"naive",
261+
args.warmup,
189262
args.repeats,
190263
shape_a,
191264
shape_b,
@@ -202,6 +275,20 @@ def main() -> None:
202275
):
203276
results.append(fast_naive)
204277

278+
results.append(
279+
run_numpy_case(
280+
args.warmup,
281+
args.repeats,
282+
shape_a,
283+
shape_b,
284+
dtype,
285+
chunks_a,
286+
chunks_b,
287+
blocks_a,
288+
blocks_b,
289+
)
290+
)
291+
205292
summary = {
206293
"shape_a": shape_a,
207294
"shape_b": shape_b,
@@ -223,31 +310,32 @@ def main() -> None:
223310
summary["speedup_fast_naive_vs_chunked"] = best_by_label["chunked"] / best_by_label["fast-naive"]
224311
if "fast" in best_by_label and "fast-naive" in best_by_label:
225312
summary["speedup_fast_vs_fast_naive"] = best_by_label["fast-naive"] / best_by_label["fast"]
313+
if "numpy" in best_by_label and "fast" in best_by_label:
314+
summary["speedup_fast_vs_numpy"] = best_by_label["numpy"] / best_by_label["fast"]
315+
if "numpy" in best_by_label and "auto" in best_by_label:
316+
summary["speedup_auto_vs_numpy"] = best_by_label["numpy"] / best_by_label["auto"]
226317

227318
if args.json:
228319
print(json.dumps(summary, indent=2, sort_keys=True))
229320
return
230321

231-
print("Matmul path comparison")
232-
print(f" A shape: {shape_a}")
233-
print(f" B shape: {shape_b}")
234-
print(f" dtype: {dtype}")
235-
print(f" chunks A/B/out: {chunks_a} / {chunks_b} / {chunks_out}")
236-
print(f" blocks A/B/out: {blocks_a} / {blocks_b} / {blocks_out}")
237-
print(f" repeats: {args.repeats}")
238-
print(f" fast block backend: {args.block_backend}")
239-
display_order = ["chunked", "fast-naive", "fast", "auto"]
322+
display_order = ["chunked", "fast-naive", "fast", "auto", "numpy"]
240323
ordered_results = sorted(results, key=lambda item: display_order.index(item["label"]) if item["label"] in display_order else len(display_order))
241324

242325
for item in ordered_results:
243326
gflops_best = "-" if item["gflops_best"] is None else f"{item['gflops_best']:.3f}"
327+
if item["label"] == "numpy":
328+
backend_info = f"library={blosc2.get_matmul_library()}"
329+
else:
330+
block_backend = item["selected_block_backend"] if item["selected_block_backend"] is not None else "-"
331+
backend_info = f"block_backend={block_backend}"
244332
print(
245333
f"{item['label']:>10}: "
246334
f"best={item['best_s']:.6f}s "
247335
f"median={item['median_s']:.6f}s "
248336
f"gflops={gflops_best} "
249337
f"path={item['selected_path']} "
250-
f"block_backend={item['selected_block_backend']} "
338+
f"{backend_info} "
251339
f"correct={item['correct']}"
252340
)
253341
if "speedup_fast_vs_chunked" in summary:
@@ -256,6 +344,10 @@ def main() -> None:
256344
print(f"Speedup fast-naive vs chunked: {summary['speedup_fast_naive_vs_chunked']:.3f}x")
257345
if "speedup_fast_vs_fast_naive" in summary:
258346
print(f"Speedup fast vs fast-naive: {summary['speedup_fast_vs_fast_naive']:.3f}x")
347+
if "speedup_fast_vs_numpy" in summary:
348+
print(f"Speedup fast vs numpy: {summary['speedup_fast_vs_numpy']:.3f}x")
349+
if "speedup_auto_vs_numpy" in summary:
350+
print(f"Speedup auto vs numpy: {summary['speedup_auto_vs_numpy']:.3f}x")
259351

260352

261353
if __name__ == "__main__":

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ dependencies = [
3838
"msgpack",
3939
"numexpr>=2.14.1; platform_machine != 'wasm32'",
4040
"requests",
41+
"threadpoolctl; platform_machine != 'wasm32'",
4142
]
4243
version = "4.1.1.dev0"
4344
[project.entry-points."array_api"]

src/blosc2/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,30 @@ def _configure_libtcc_runtime_path():
8787
"""
8888

8989

90+
def get_matmul_library() -> str | None:
91+
"""
92+
Return the library used by the active matmul fast backend, if any.
93+
94+
Returns
95+
-------
96+
str | None
97+
``"Accelerate.framework"`` when the selected backend is Accelerate,
98+
the loaded CBLAS library path for runtime-discovered CBLAS backends,
99+
or ``None`` when the selected backend is ``naive``.
100+
"""
101+
from . import blosc2_ext
102+
103+
selected_backend = blosc2_ext.get_selected_matmul_block_backend()
104+
if selected_backend == "accelerate":
105+
return "Accelerate.framework"
106+
if selected_backend == "cblas":
107+
get_loaded_cblas = getattr(blosc2_ext, "get_loaded_matmul_cblas_library", None)
108+
if get_loaded_cblas is None:
109+
return None
110+
return get_loaded_cblas()
111+
return None
112+
113+
90114
class Codec(Enum):
91115
"""
92116
Available codecs.
@@ -837,6 +861,7 @@ def _raise(exc):
837861
"get_compressor",
838862
"get_cpu_info",
839863
"get_expr_operands",
864+
"get_matmul_library",
840865
"get_slice_nchunks",
841866
"greater",
842867
"greater_equal",

0 commit comments

Comments
 (0)