@@ -58,6 +58,7 @@ def run_case(
5858 label : str ,
5959 mode : str ,
6060 block_backend : str ,
61+ warmup : int ,
6162 repeats : int ,
6263 shape_a : tuple [int , ...],
6364 shape_b : tuple [int , ...],
@@ -90,6 +91,14 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
9091 blosc2 .blosc2_ext .set_matmul_block_backend (block_backend )
9192 try :
9293 selected_block_backend = blosc2 .blosc2_ext .get_selected_matmul_block_backend ()
94+ for _ in range (warmup ):
95+ before = len (selected_paths )
96+ with warnings .catch_warnings ():
97+ # NumPy + Accelerate can emit spurious matmul RuntimeWarnings on macOS arm64.
98+ warnings .simplefilter ("ignore" , RuntimeWarning )
99+ result = blosc2 .matmul (a , b , chunks = chunks_out , blocks = blocks_out )
100+ if len (selected_paths ) == before :
101+ selected_paths .append ("chunked" )
93102 for _ in range (repeats ):
94103 before = len (selected_paths )
95104 t0 = time .perf_counter ()
@@ -113,6 +122,8 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
113122
114123 best = min (times )
115124 median = statistics .median (times )
125+ selected_path = selected_paths [0 ] if selected_paths and len (set (selected_paths )) == 1 else "mixed"
126+ reported_block_backend = selected_block_backend if selected_path != "chunked" else None
116127 return {
117128 "label" : label ,
118129 "mode" : mode ,
@@ -123,29 +134,76 @@ def wrapped_set_pref_matmul(self, inputs, fp_accuracy):
123134 "gflops_median" : expected_gflops (shape_a , shape_b , median ),
124135 "correct" : True ,
125136 "configured_block_backend" : block_backend ,
126- "selected_block_backend" : selected_block_backend ,
137+ "selected_block_backend" : reported_block_backend ,
127138 "selected_paths" : selected_paths ,
128- "selected_path" : selected_paths [0 ] if selected_paths and len (set (selected_paths )) == 1 else "mixed" ,
139+ "selected_path" : selected_path ,
140+ }
141+
142+
143+ def run_numpy_case (
144+ warmup : int ,
145+ repeats : int ,
146+ shape_a : tuple [int , ...],
147+ shape_b : tuple [int , ...],
148+ dtype : np .dtype ,
149+ chunks_a : tuple [int , ...] | None ,
150+ chunks_b : tuple [int , ...] | None ,
151+ blocks_a : tuple [int , ...] | None ,
152+ blocks_b : tuple [int , ...] | None ,
153+ ):
154+ _ , _ , a_np , b_np = build_arrays (shape_a , shape_b , dtype , chunks_a , chunks_b , blocks_a , blocks_b )
155+ times = []
156+ result = None
157+ for _ in range (warmup ):
158+ with warnings .catch_warnings ():
159+ warnings .simplefilter ("ignore" , RuntimeWarning )
160+ result = np .matmul (a_np , b_np )
161+ for _ in range (repeats ):
162+ t0 = time .perf_counter ()
163+ with warnings .catch_warnings ():
164+ warnings .simplefilter ("ignore" , RuntimeWarning )
165+ result = np .matmul (a_np , b_np )
166+ times .append (time .perf_counter () - t0 )
167+
168+ if result is None :
169+ raise RuntimeError ("numpy.matmul did not produce a result" )
170+
171+ best = min (times )
172+ median = statistics .median (times )
173+ return {
174+ "label" : "numpy" ,
175+ "mode" : "numpy" ,
176+ "times_s" : times ,
177+ "best_s" : best ,
178+ "median_s" : median ,
179+ "gflops_best" : expected_gflops (shape_a , shape_b , best ),
180+ "gflops_median" : expected_gflops (shape_a , shape_b , median ),
181+ "correct" : True ,
182+ "configured_block_backend" : None ,
183+ "selected_block_backend" : None ,
184+ "selected_paths" : ["numpy" ] * repeats ,
185+ "selected_path" : "numpy" ,
129186 }
130187
131188
132189def main () -> None :
133190 parser = argparse .ArgumentParser (description = "Compare chunked and fast blosc2.matmul paths." )
134- parser .add_argument ("--shape-a" , default = "400,400 " , help = "Comma-separated shape for A." )
135- parser .add_argument ("--shape-b" , default = "400,400 " , help = "Comma-separated shape for B." )
191+ parser .add_argument ("--shape-a" , default = "2000,2000 " , help = "Comma-separated shape for A." )
192+ parser .add_argument ("--shape-b" , default = "2000,2000 " , help = "Comma-separated shape for B." )
136193 parser .add_argument ("--dtype" , default = "float32" , choices = ["float32" , "float64" , "int32" , "int64" ])
137- parser .add_argument ("--chunks-a" , default = "200,200 " , help = "Comma-separated chunk shape for A." )
138- parser .add_argument ("--chunks-b" , default = "200,200 " , help = "Comma-separated chunk shape for B." )
194+ parser .add_argument ("--chunks-a" , default = "500,500 " , help = "Comma-separated chunk shape for A." )
195+ parser .add_argument ("--chunks-b" , default = "500,500 " , help = "Comma-separated chunk shape for B." )
139196 parser .add_argument ("--blocks-a" , default = "100,100" , help = "Comma-separated block shape for A." )
140197 parser .add_argument ("--blocks-b" , default = "100,100" , help = "Comma-separated block shape for B." )
141- parser .add_argument ("--chunks-out" , default = "200,200 " , help = "Comma-separated chunk shape for output." )
198+ parser .add_argument ("--chunks-out" , default = "500,500 " , help = "Comma-separated chunk shape for output." )
142199 parser .add_argument ("--blocks-out" , default = "100,100" , help = "Comma-separated block shape for output." )
143- parser .add_argument ("--repeats" , type = int , default = 250 )
200+ parser .add_argument ("--warmup" , type = int , default = 2 )
201+ parser .add_argument ("--repeats" , type = int , default = 1 )
144202 parser .add_argument ("--modes" , nargs = "+" , default = ["chunked" , "fast" , "auto" ], choices = ["chunked" , "fast" , "auto" ])
145203 parser .add_argument (
146204 "--block-backend" ,
147205 default = "auto" ,
148- choices = ["auto" , "naive" , "accelerate" ],
206+ choices = ["auto" , "naive" , "accelerate" , "cblas" ],
149207 help = "Kernel backend for the fast matmul block path." ,
150208 )
151209 parser .add_argument ("--json" , action = "store_true" , help = "Emit full JSON instead of a compact text summary." )
@@ -161,13 +219,27 @@ def main() -> None:
161219 blocks_out = parse_int_tuple (args .blocks_out ) if args .blocks_out else None
162220 dtype = np .dtype (args .dtype )
163221
222+ print ("Matmul path comparison" )
223+ print (f" A shape: { shape_a } " )
224+ print (f" B shape: { shape_b } " )
225+ print (f" dtype: { dtype } " )
226+ print (f" chunks A/B/out: { chunks_a } / { chunks_b } / { chunks_out } " )
227+ print (f" blocks A/B/out: { blocks_a } / { blocks_b } / { blocks_out } " )
228+ print (f" warmup: { args .warmup } " )
229+ print (f" repeats: { args .repeats } " )
230+ print (f" fast block backend: { args .block_backend } " )
231+ print (f" matmul library: { blosc2 .get_matmul_library ()} " )
232+ print ()
233+ print ("Results:" )
234+
164235 results = []
165236 for mode in args .modes :
166237 results .append (
167238 run_case (
168239 mode ,
169240 mode ,
170241 args .block_backend ,
242+ args .warmup ,
171243 args .repeats ,
172244 shape_a ,
173245 shape_b ,
@@ -186,6 +258,7 @@ def main() -> None:
186258 "fast-naive" ,
187259 "fast" ,
188260 "naive" ,
261+ args .warmup ,
189262 args .repeats ,
190263 shape_a ,
191264 shape_b ,
@@ -202,6 +275,20 @@ def main() -> None:
202275 ):
203276 results .append (fast_naive )
204277
278+ results .append (
279+ run_numpy_case (
280+ args .warmup ,
281+ args .repeats ,
282+ shape_a ,
283+ shape_b ,
284+ dtype ,
285+ chunks_a ,
286+ chunks_b ,
287+ blocks_a ,
288+ blocks_b ,
289+ )
290+ )
291+
205292 summary = {
206293 "shape_a" : shape_a ,
207294 "shape_b" : shape_b ,
@@ -223,31 +310,32 @@ def main() -> None:
223310 summary ["speedup_fast_naive_vs_chunked" ] = best_by_label ["chunked" ] / best_by_label ["fast-naive" ]
224311 if "fast" in best_by_label and "fast-naive" in best_by_label :
225312 summary ["speedup_fast_vs_fast_naive" ] = best_by_label ["fast-naive" ] / best_by_label ["fast" ]
313+ if "numpy" in best_by_label and "fast" in best_by_label :
314+ summary ["speedup_fast_vs_numpy" ] = best_by_label ["numpy" ] / best_by_label ["fast" ]
315+ if "numpy" in best_by_label and "auto" in best_by_label :
316+ summary ["speedup_auto_vs_numpy" ] = best_by_label ["numpy" ] / best_by_label ["auto" ]
226317
227318 if args .json :
228319 print (json .dumps (summary , indent = 2 , sort_keys = True ))
229320 return
230321
231- print ("Matmul path comparison" )
232- print (f" A shape: { shape_a } " )
233- print (f" B shape: { shape_b } " )
234- print (f" dtype: { dtype } " )
235- print (f" chunks A/B/out: { chunks_a } / { chunks_b } / { chunks_out } " )
236- print (f" blocks A/B/out: { blocks_a } / { blocks_b } / { blocks_out } " )
237- print (f" repeats: { args .repeats } " )
238- print (f" fast block backend: { args .block_backend } " )
239- display_order = ["chunked" , "fast-naive" , "fast" , "auto" ]
322+ display_order = ["chunked" , "fast-naive" , "fast" , "auto" , "numpy" ]
240323 ordered_results = sorted (results , key = lambda item : display_order .index (item ["label" ]) if item ["label" ] in display_order else len (display_order ))
241324
242325 for item in ordered_results :
243326 gflops_best = "-" if item ["gflops_best" ] is None else f"{ item ['gflops_best' ]:.3f} "
327+ if item ["label" ] == "numpy" :
328+ backend_info = f"library={ blosc2 .get_matmul_library ()} "
329+ else :
330+ block_backend = item ["selected_block_backend" ] if item ["selected_block_backend" ] is not None else "-"
331+ backend_info = f"block_backend={ block_backend } "
244332 print (
245333 f"{ item ['label' ]:>10} : "
246334 f"best={ item ['best_s' ]:.6f} s "
247335 f"median={ item ['median_s' ]:.6f} s "
248336 f"gflops={ gflops_best } "
249337 f"path={ item ['selected_path' ]} "
250- f"block_backend= { item [ 'selected_block_backend' ] } "
338+ f"{ backend_info } "
251339 f"correct={ item ['correct' ]} "
252340 )
253341 if "speedup_fast_vs_chunked" in summary :
@@ -256,6 +344,10 @@ def main() -> None:
256344 print (f"Speedup fast-naive vs chunked: { summary ['speedup_fast_naive_vs_chunked' ]:.3f} x" )
257345 if "speedup_fast_vs_fast_naive" in summary :
258346 print (f"Speedup fast vs fast-naive: { summary ['speedup_fast_vs_fast_naive' ]:.3f} x" )
347+ if "speedup_fast_vs_numpy" in summary :
348+ print (f"Speedup fast vs numpy: { summary ['speedup_fast_vs_numpy' ]:.3f} x" )
349+ if "speedup_auto_vs_numpy" in summary :
350+ print (f"Speedup auto vs numpy: { summary ['speedup_auto_vs_numpy' ]:.3f} x" )
259351
260352
261353if __name__ == "__main__" :
0 commit comments