diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index bebaf4b8e..ecc3a53dd 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -63,6 +63,9 @@ scalar-quantization = [] # Enable minmax-quantization based algorithms minmax-quantization = [] +# Enable multi-vector MaxSim distance benchmarks +multi-vector = [] + # Enable Disk Index benchmarks disk-index = [ "diskann-disk/perf_test", diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json new file mode 100644 index 000000000..7a4e59539 --- /dev/null +++ b/diskann-benchmark/example/multi-vector.json @@ -0,0 +1,47 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "scalar", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + } + ] +} diff --git a/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json 
b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json new file mode 100644 index 000000000..8d5997199 --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "multi-vector-op", + "content": {} + }, + "tolerance": { + "type": "multi-vector-tolerance", + "content": { + "min_time_regression": 0.05 + } + } + } + ] +} diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json new file mode 100644 index 000000000..57922fe10 --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/multi-vector.json @@ -0,0 +1,149 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "auto", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + 
"arch": "scalar", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 
1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "arch": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { 
"num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "x86-64-v3", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": 
"float16", + "arch": "x86-64-v4", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "arch": "reference", + "runs": [ + { "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "num_query_vectors": 64, 
"num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + } + ] +} diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs index 24fe91d7e..0d1c61345 100644 --- a/diskann-benchmark/src/backend/mod.rs +++ b/diskann-benchmark/src/backend/mod.rs @@ -7,10 +7,12 @@ mod disk_index; mod exhaustive; mod filters; mod index; +mod multi_vector; pub(crate) fn register_benchmarks(registry: &mut diskann_benchmark_runner::registry::Benchmarks) { exhaustive::register_benchmarks(registry); disk_index::register_benchmarks(registry); index::register_benchmarks(registry); filters::register_benchmarks(registry); + multi_vector::register_benchmarks(registry); } diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs new file mode 100644 index 000000000..2f83eb22f --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/driver.rs @@ -0,0 +1,279 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Shared benchmark infrastructure for multi-vector kernels. +//! +//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result +//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object +//! [`Distance`] boundary that both library and experimental kernels go +//! through. None of the contents are kernel-aware. 
+ +use diskann_benchmark_runner::utils::{ + fmt::Table, num::relative_change, percentiles, MicroSeconds, +}; +use diskann_quantization::multi_vector::distance::QueryMatRef; +use diskann_quantization::multi_vector::{Mat, MatRef, MaxSim, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; +use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::multi_vector::{MultiVectorTolerance, Run}; + +/////////////////// +// Data fixtures // +/////////////////// + +/// Random query / doc fixture for a single benchmark run. +pub(super) struct Data { + pub(super) queries: Mat>, + pub(super) docs: Mat>, +} + +impl Data +where + StandardUniform: Distribution, +{ + pub(super) fn new(run: &Run) -> Self { + let mut rng = StdRng::seed_from_u64(0x12345); + let queries = Mat::from_fn( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + let docs = Mat::from_fn( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + || StandardUniform.sample(&mut rng), + ); + Self { queries, docs } + } +} + +////////////////////// +// Distance kernels // +////////////////////// + +/// Object-safe abstraction over a per-shape distance executor. +/// +/// `OptimizedDistance` wraps any [`QueryComputer`] — library-shipped +/// arch-pinned ones (via `from_arch`) AND experimental ones (via +/// `from_dyn`) — so the driver's hot loop dispatches through one vtable +/// hop regardless of which kernel produced the computer. +/// `ReferenceDistance` is the only path that doesn't go through +/// `QueryComputer` (it uses the `MaxSim` fallback directly). 
+pub(super) trait Distance { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); +} + +/// Distance executor wrapping a [`QueryComputer`] — covers all arch-pinned, +/// auto-dispatched, and experimental kernels. +pub(super) struct OptimizedDistance(pub(super) QueryComputer); + +impl Distance for OptimizedDistance { + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + self.0.max_sim(doc, scores); + } +} + +/// Distance executor driving the [`MaxSim`] fallback path. +pub(super) struct ReferenceDistance<'a, T: Copy>(pub(super) QueryMatRef<'a, Standard>); + +impl Distance for ReferenceDistance<'_, T> +where + InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>, +{ + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + // `MaxSim::new` is a non-empty check + pointer wrap, free per iteration. + let mut max_sim = MaxSim::new(scores).unwrap(); + let _ = max_sim.evaluate(self.0, doc); + } +} + +////////////////////// +// Timing harness // +////////////////////// + +fn run_loops(run: &Run, mut body: F) -> RunResult +where + F: FnMut(), +{ + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } +} + +/// Shared loop nest. The trait-object dispatch happens once per inner-loop iteration +/// of `run_loops`; the work inside each `max_sim` call is O(Q·D·dim), so the +/// vtable hop is in the noise.
+pub(super) fn run_with_distance( + run: &Run, + doc: MatRef<'_, Standard>, + dist: &dyn Distance, +) -> RunResult { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + dist.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) +} + +////////////////////// +// Result types // +////////////////////// + +#[derive(Debug, Clone, Copy)] +pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct RunResult { + /// The configuration for this run. + pub(super) run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + pub(super) latencies: Vec, + /// Latency percentiles. + pub(super) percentiles: percentiles::Percentiles, +} + +impl RunResult { + pub(super) fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + 
row.insert(r.run.num_query_vectors, 0); + row.insert(r.run.num_doc_vectors, 1); + row.insert(r.run.dim, 2); + row.insert(format!("{:.3}", min_time), 3); + row.insert(format!("{:.3}", mean_time), 4); + row.insert(r.run.loops_per_measurement, 5); + row.insert(r.run.num_measurements, 6); + }); + + table.fmt(f) + } +} + +////////////////////// +// Regression Check // +////////////////////// + +/// Per-run comparison result holding the before/after minimum times. +#[derive(Debug, Serialize)] +pub(super) struct Comparison { + pub(super) run: Run, + pub(super) tolerance: MultiVectorTolerance, + pub(super) before_min: f64, + pub(super) after_min: f64, +} + +/// Aggregated result of the regression check across all runs. +#[derive(Debug, Serialize)] +pub(super) struct CheckResult { + pub(super) checks: Vec, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.num_query_vectors, 0); + row.insert(c.run.num_doc_vectors, 1); + row.insert(c.run.dim, 2); + row.insert(format!("{:.3}", c.before_min), 3); + row.insert(format!("{:.3}", c.after_min), 4); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 5); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 6); + } + } + Err(err) => { + row.insert("invalid", 5); + row.insert(err, 6); + } + } + } + + table.fmt(f) + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs new file mode 100644 index 000000000..b0e106fc4 --- /dev/null +++
b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs @@ -0,0 +1,23 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Researcher-authored experimental multi-vector kernels. +//! +//! See [`template`] for the full kernel-author workflow (writing a `Kernel` +//! impl, adapting it via `DynQueryComputer`, wiring up dispatch and +//! registration, and validating under Miri). +//! +//! New experimental kernels live in their own module file in this directory. +//! Their registration goes in [`register`] below. + +use diskann_benchmark_runner::registry::Benchmarks; + +mod template; + +pub(super) fn register(_benchmarks: &mut Benchmarks) { + // No experimental kernels registered by default. + // Add `benchmarks.register_regression(...)` calls here when authoring + // new experimental kernels. +} diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs new file mode 100644 index 000000000..64eeb3e00 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs @@ -0,0 +1,256 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! **Template for an experimental multi-vector kernel.** +//! +//! Copy this file (e.g. to `v4_wide.rs`), rename `Template*`, change the +//! `Kernel` impl to your target ISA, and add an `Arch` variant + a +//! `register_regression` call to wire it up. +//! +//! # The 5-step workflow +//! +//! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your +//! experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]` +//! attribute on `Arch` makes this a non-breaking addition. +//! 2. **Author the micro-kernel.** Implement +//! [`Kernel`](diskann_quantization::multi_vector::distance::kernels::Kernel) +//! on your struct (`unsafe impl Kernel` etc.), filling in +//! 
`full_panel` and `partial_panel` with your SIMD intrinsics. +//! 3. **Author the adapter.** Implement +//! [`DynQueryComputer`](diskann_quantization::multi_vector::distance::DynQueryComputer) +//! on a struct that owns the prepared query data; in `compute_max_sim`, +//! call +//! [`tiled_reduce`](diskann_quantization::multi_vector::distance::kernels::tiled_reduce) +//! with your kernel. +//! 4. **Add a marker + `DispatchRule`.** Mirror the pattern in +//! `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant. +//! 5. **Add a `RunBenchmark` impl + `register_regression` call.** Use +//! `Kernel::<YourMarker, T>::new()` as the registered benchmark entry. +//! +//! Then validate under Miri before treating the kernel as correct — see the +//! section below. +//! +//! # Validating under Miri (REQUIRED) +//! +//! Experimental kernels rely on `unsafe fn full_panel` / `partial_panel` +//! with raw-pointer arithmetic. Pointer provenance, alignment, and +//! out-of-bounds bugs are easy to introduce and hard to catch by +//! inspection. **Run your kernel under Miri before assuming it's correct.** +//! +//! Rules: +//! +//! - Inside your `#[cfg(test)]` module, construct arch tokens via the +//! Miri-friendly variants: `Scalar::new()` (always Miri-safe) or +//! `V4::new_checked_miri()` (returns a token unconditionally under +//! `cfg(miri)` using AVX-512 emulation, so tests run even when Miri +//! can't do real CPU detection). `V3` and `Neon` only expose +//! `new_checked()` today — if you need them under Miri, follow +//! `V4::new_checked_miri()`'s pattern in `diskann-wide`. +//! - Any SIMD intrinsic Miri doesn't support must have a scalar fallback +//! gated by `#[cfg(miri)]`. +//! - Add at least one small-shape correctness test that runs your kernel +//! against a naive reference and is Miri-friendly. +//! - Run: `cargo +nightly miri test -p diskann-benchmark --features multi-vector +//! backend::multi_vector::experimental::`. Reduce +//!
test-sweep size under Miri with `if cfg!(miri) { small } else { full }` +//! (see this file's test for the pattern). +//! +//! Miri won't catch performance bugs, but it'll catch UB — and UB in an +//! experimental kernel breaks the benchmark binary, not the kernel you're +//! trying to measure. +//! +//! # This template +//! +//! This file defines `TemplateKernel: Kernel` (uses `Scalar` so the +//! template is host-portable + Miri-friendly) and a `TemplateComputer` +//! adapter that pipes it through `tiled_reduce`. It is **not registered** as +//! a benchmark entry — see step 5 in the workflow. The included +//! `#[cfg(test)]` `template_matches_pinned_scalar` test exercises the API +//! surface end-to-end so this file catches public-API drift even though it +//! isn't wired into the benchmark dispatcher. + +#![allow(dead_code)] + +use diskann_quantization::multi_vector::distance::{ + kernels::{layouts, tiled_reduce, Kernel, TileBudget}, + DynQueryComputer, +}; +use diskann_quantization::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard}; +use diskann_wide::arch::Scalar; + +/// Step 2: the micro-kernel struct. Rename and implement for your target arch. +pub(super) struct TemplateKernel; + +// SAFETY: `full_panel` / `partial_panel` only access `A_PANEL * k` / +// `B_PANEL * k` source elements and write `A_PANEL` destination f32s, +// matching `Kernel`'s safety contract. The simple scalar +// computation here is Miri-clean. +unsafe impl Kernel for TemplateKernel { + type Left = layouts::BlockTransposedLayout; + type Right = layouts::RowMajor; + const A_PANEL: usize = 8; + const B_PANEL: usize = 2; + + unsafe fn full_panel(_arch: Scalar, a: *const f32, b: *const f32, k: usize, r: *mut f32) { + // SAFETY: a covers A_PANEL * k contiguous block-transposed f32s, + // b covers B_PANEL * k contiguous row-major f32s, r covers A_PANEL f32s. 
+ unsafe { panel::<8, 2>(a, b, k, r) } + } + + unsafe fn partial_panel( + _arch: Scalar, + remainder: usize, + a: *const f32, + b: *const f32, + k: usize, + r: *mut f32, + ) { + debug_assert!(remainder == 1); + // SAFETY: as full_panel but with `b` covering `remainder * k` f32s. + unsafe { panel::<8, 1>(a, b, k, r) } + } +} + +/// Replace this with your SIMD intrinsics. The block-transposed A layout +/// stores `A_ROWS` contiguous f32s per dimension index `i`, so the q-th +/// query row at dimension i lives at `a[i * A_ROWS + q]`. The row-major B +/// layout stores doc d's k-th element at `b[d * k_dim + k]`. The scratch +/// `r` accumulates max IP per query row (library convention; the +/// `QueryComputer` veneer negates at the end). +/// +/// # Safety +/// - `a` covers `A_ROWS * k` block-transposed f32s. +/// - `b` covers `B_ROWS * k` row-major f32s. +/// - `r` covers `A_ROWS` writable f32s. +unsafe fn panel( + a: *const f32, + b: *const f32, + k: usize, + r: *mut f32, +) { + for q in 0..A_ROWS { + // SAFETY: q < A_ROWS. + let mut best = unsafe { *r.add(q) }; + + for d in 0..B_ROWS { + let mut ip: f32 = 0.0; + for i in 0..k { + // SAFETY: i < k, q < A_ROWS. + let a_val = unsafe { *a.add(i * A_ROWS + q) }; + // SAFETY: d < B_ROWS, b covers B_ROWS rows of k f32s each. + let b_val = unsafe { *b.add(d * k + i) }; + ip += a_val * b_val; + } + best = best.max(ip); + } + + // SAFETY: q < A_ROWS. + unsafe { *r.add(q) = best }; + } +} + +/// Step 3: the `DynQueryComputer` adapter. Owns the prepared query data +/// and routes `compute_max_sim` through `tiled_reduce` with the kernel. 
+#[derive(Debug)] +pub(super) struct TemplateComputer { + arch: Scalar, + prepared: BlockTransposed, +} + +impl TemplateComputer { + pub(super) fn new(query: MatRef<'_, Standard>) -> Self { + let prepared = BlockTransposed::::from_matrix_view(query.as_matrix_view()); + Self { + arch: Scalar::new(), + prepared, + } + } +} + +impl DynQueryComputer for TemplateComputer { + fn nrows(&self) -> usize { + self.prepared.nrows() + } + + fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; + let prepared_ref: BlockTransposedRef<'_, f32, 8> = self.prepared.as_view(); + let ca = as layouts::DescribeLayout>::layout(&prepared_ref); + let cb = > as layouts::DescribeLayout>::layout(&doc); + + // SAFETY: prepared.as_ptr() covers padded_nrows * ncols block-transposed + // f32s; doc.as_slice() covers num_vectors * vector_dim row-major f32s; + // scratch length == padded_nrows; padded_nrows is a multiple of + // A_PANEL=8 by BlockTransposed construction. + unsafe { + tiled_reduce::( + self.arch, + &ca, + &cb, + self.prepared.as_ptr(), + self.prepared.padded_nrows(), + doc.as_slice().as_ptr(), + doc.num_vectors(), + doc.vector_dim(), + &mut scratch, + TileBudget::default(), + ); + } + + for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) { + *dst = -src; + } + } +} + +#[cfg(test)] +mod tests { + //! Miri-friendly correctness test for the template kernel. + //! + //! Validates that the template's adapter machinery produces the same + //! per-row scores as `QueryComputer::from_arch(Scalar)`. Iteration + //! count is reduced under Miri so `cargo +nightly miri test` finishes + //! in seconds, not minutes. 
+ use super::*; + use diskann_quantization::multi_vector::QueryComputer; + + fn make_data(nrows: usize, ncols: usize, shift: usize) -> Vec { + (0..nrows * ncols) + .map(|v| ((v + shift) % ncols) as f32) + .collect() + } + + #[test] + fn template_matches_pinned_scalar() { + let cases: &[(usize, usize, usize)] = if cfg!(miri) { + // Single small case under Miri to keep runtime reasonable. + &[(3, 4, 8)] + } else { + &[(1, 1, 4), (3, 5, 8), (8, 4, 16), (10, 6, 32)] + }; + + for &(nq, nd, dim) in cases { + let qd = make_data(nq, dim, dim / 2); + let dd = make_data(nd, dim, dim); + let query = MatRef::new(Standard::::new(nq, dim).unwrap(), &qd).unwrap(); + let doc = MatRef::new(Standard::::new(nd, dim).unwrap(), &dd).unwrap(); + + let pinned = QueryComputer::::from_arch(query, Scalar::new()); + let template = QueryComputer::::from_dyn(Box::new(TemplateComputer::new(query))); + + let mut pinned_scores = vec![0.0f32; nq]; + let mut template_scores = vec![0.0f32; nq]; + pinned.max_sim(doc, &mut pinned_scores); + template.max_sim(doc, &mut template_scores); + + for (i, (p, t)) in pinned_scores.iter().zip(template_scores.iter()).enumerate() { + assert!( + (p - t).abs() < 1e-10, + "shape ({nq},{nd},{dim}) row {i}: pinned={p} template={t}", + ); + } + } + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/library_kernels.rs b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs new file mode 100644 index 000000000..56d6c0db1 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs @@ -0,0 +1,510 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Library kernel registrations and arch-dispatch machinery. +//! +//! Mirrors the structure of `diskann-benchmark-simd`: a `Kernel` +//! PhantomData carrier carries the (arch × element type) pair through the +//! benchmark registry, [`DispatchRule`] maps the JSON-facing `Arch` +//! 
enum to a concrete arch token, and the `stamp!` / `match_arch!` macros +//! generate the repetitive `RunBenchmark` / `DispatchRule` impls. +//! +//! Library kernels registered here: +//! - `multi-vector-op-{f32,f16}-auto` — `QueryComputer::new` (auto-dispatch) +//! - `multi-vector-op-{f32,f16}-scalar` — `from_arch(Scalar)` +//! - `multi-vector-op-{f32,f16}-x86_64_V3` — `from_arch(V3)` (x86_64 only) +//! - `multi-vector-op-{f32,f16}-x86_64_V4` — `from_arch(V4)` (x86_64 only) +//! - `multi-vector-op-{f32,f16}-aarch64_neon` — `from_arch(Neon)` (aarch64 only) +//! - `multi-vector-op-{f32,f16}-reference` — `MaxSim` fallback + +use std::io::Write; +use std::marker::PhantomData; + +use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, + utils::{datatype, num::relative_change}, + Benchmark, Checkpoint, Output, +}; +use diskann_quantization::multi_vector::{MatRef, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::PureDistanceFunction; +#[cfg(target_arch = "aarch64")] +use diskann_wide::arch::aarch64::Neon; +#[cfg(target_arch = "x86_64")] +use diskann_wide::arch::x86_64::{V3, V4}; +use diskann_wide::arch::Scalar; +use diskann_wide::Architecture; +use rand::distr::{Distribution, StandardUniform}; + +use super::driver::{ + run_with_distance, CheckResult, Comparison, Data, DisplayWrapper, OptimizedDistance, + ReferenceDistance, RunResult, +}; +use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance}; + +/// PhantomData carrier for one (arch, element-type) entry in the benchmark +/// registry. The arch parameter `A` is either a real arch token (`Scalar`, +/// `V3`, `V4`, `Neon`) or one of the marker types [`Auto`] / [`Reference`]. 
+pub(super) struct Kernel { + _type: PhantomData<(A, T)>, +} + +impl Kernel { + pub(super) fn new() -> Self { + Self { _type: PhantomData } + } +} + +/// Marker for the auto-dispatched (CPU-detected) kernel — `QueryComputer::new`. +#[derive(Debug, Clone, Copy)] +pub(super) struct Auto; + +/// Marker for the reference (`MaxSim` fallback) kernel. +#[derive(Debug, Clone, Copy)] +pub(super) struct Reference; + +/// Wrapper around an arch token (real or marker) that implements +/// [`DispatchRule`] for the JSON-facing [`Arch`] enum. +pub(super) struct Identity(pub(super) A); + +/// Returned by `Identity::::convert` when the host CPU doesn't support the +/// requested ISA. The dispatcher converts this into a friendly error message. +#[derive(Debug, Clone, Copy)] +pub(super) struct ArchNotSupported(pub(super) Arch); + +impl std::fmt::Display for ArchNotSupported { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} not supported on this CPU", self.0) + } +} + +impl std::error::Error for ArchNotSupported {} + +////////////////////// +// Dispatch rules // +////////////////////// + +/// Generates a [`DispatchRule`] for a real arch token. `try_match` returns: +/// - `Ok(MatchScore(0))` when the input names this arch AND the host CPU supports it +/// - `Err(FailureScore(0))` when the input names this arch but the CPU doesn't support it +/// (this surfaces in the dispatcher's near-miss diagnostic) +/// - `Err(FailureScore(1))` when the input names a different arch +macro_rules! 
match_arch_x86_64 { + ($arch:path, $enum:ident) => { + #[cfg(target_arch = "x86_64")] + impl DispatchRule for Identity<$arch> { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from != Arch::$enum { + return Err(FailureScore(1)); + } + if <$arch>::new_checked().is_some() { + Ok(MatchScore(0)) + } else { + Err(FailureScore(0)) + } + } + fn convert(from: Arch) -> Result { + <$arch>::new_checked() + .ok_or(ArchNotSupported(from)) + .map(Identity) + } + } + }; +} + +match_arch_x86_64!(V3, X86_64_V3); +match_arch_x86_64!(V4, X86_64_V4); + +#[cfg(target_arch = "aarch64")] +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from != Arch::Neon { + return Err(FailureScore(1)); + } + if Neon::new_checked().is_some() { + Ok(MatchScore(0)) + } else { + Err(FailureScore(0)) + } + } + fn convert(from: Arch) -> Result { + Neon::new_checked() + .ok_or(ArchNotSupported(from)) + .map(Identity) + } +} + +// Scalar is always available; no CPU check needed. +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Scalar { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Scalar::new())) + } +} + +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Auto { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Auto)) + } +} + +impl DispatchRule for Identity { + type Error = ArchNotSupported; + fn try_match(from: &Arch) -> Result { + if *from == Arch::Reference { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + fn convert(_from: Arch) -> Result { + Ok(Identity(Reference)) + } +} + +////////////////////// +// Benchmark trait // +////////////////////// + +/// Per-arch run trait. 
The `stamp!` macro generates impls for real arch tokens; +/// `Auto` and `Reference` get hand-written impls. +pub(super) trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result>; +} + +impl Benchmark for Kernel +where + datatype::Type: DispatchRule, + Identity: DispatchRule, + Kernel: RunBenchmark, + A: 'static, + T: 'static, +{ + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + match Identity::::try_match(&from.arch) { + Ok(MatchScore(_)) => (), + Err(FailureScore(score)) => { + *failscore.get_or_insert(0) += score; + } + } + match failscore { + None => Ok(MatchScore(0)), + Some(score) => Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: Checkpoint<'_>, + mut output: &mut dyn Output, + ) -> anyhow::Result { + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + Description::>::new() + )?; + writeln!(f, "- Arch: {}", Description::>::new())?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if Identity::::try_match(&input.arch).is_err() { + writeln!(f, "\n - Wrong or unsupported arch: {}", input.arch)?; + } + } + } + Ok(()) + } +} + +impl Regression for Kernel +where + datatype::Type: DispatchRule, + Identity: DispatchRule, + Kernel: RunBenchmark, + A: 'static, + T: 'static, +{ + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + 
tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + Ok(if passed { + PassFail::Pass(check) + } else { + PassFail::Fail(check) + }) + } +} + +////////////////////// +// RunBenchmark impls +////////////////////// + +/// Element-type-erasing constructor for [`QueryComputer`]. `QueryComputer`'s +/// `new` / `from_arch` are inherent methods on the concrete `QueryComputer` +/// and `QueryComputer` types, so generic code needs this shim. +pub(super) trait BuildArchQc { + /// Build a `QueryComputer` pinned to the host's auto-dispatched arch. + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer; +} + +impl BuildArchQc for f32 { + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +impl BuildArchQc for half::f16 { + fn build_auto(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +/// Per-(arch, T) constructor for `QueryComputer::from_arch`. 
Same idea as +/// [`BuildArchQc::build_auto`] but pinned to a specific arch token. +pub(super) trait BuildPinnedQc { + fn build_pinned(query: MatRef<'_, Standard>, arch: A) -> QueryComputer; +} + +macro_rules! impl_build_pinned { + ($arch:path, $T:ty) => { + impl BuildPinnedQc<$arch, $T> for $T { + fn build_pinned(query: MatRef<'_, Standard<$T>>, arch: $arch) -> QueryComputer<$T> { + QueryComputer::<$T>::from_arch(query, arch) + } + } + }; +} + +impl_build_pinned!(Scalar, f32); +impl_build_pinned!(Scalar, half::f16); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V3, f32); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V3, half::f16); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V4, f32); +#[cfg(target_arch = "x86_64")] +impl_build_pinned!(V4, half::f16); +#[cfg(target_arch = "aarch64")] +impl_build_pinned!(Neon, f32); +#[cfg(target_arch = "aarch64")] +impl_build_pinned!(Neon, half::f16); + +/// Stamp out `RunBenchmark<$arch>` for `Kernel<$arch, $T>` using +/// `QueryComputer::::from_arch($arch_token)`. +macro_rules! stamp { + ($arch:path, $T:ty) => { + impl RunBenchmark<$arch> for Kernel<$arch, $T> + where + StandardUniform: Distribution<$T>, + $T: BuildPinnedQc<$arch, $T>, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let arch = Identity::<$arch>::convert(input.arch)?.0; + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::<$T>::new(run); + // `QueryComputer` performs query-side precomputation that is + // intentionally amortized across many `max_sim` calls; + // construct it once per shape, outside the timed loop. 
+ let qc = <$T as BuildPinnedQc<$arch, $T>>::build_pinned( + data.queries.as_view(), + arch, + ); + let dist = OptimizedDistance(qc); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + } + }; + ($target_arch:literal, $arch:path, $T:ty) => { + #[cfg(target_arch = $target_arch)] + stamp!($arch, $T); + }; +} + +stamp!(Scalar, f32); +stamp!(Scalar, half::f16); +stamp!("x86_64", V3, f32); +stamp!("x86_64", V3, half::f16); +stamp!("x86_64", V4, f32); +stamp!("x86_64", V4, half::f16); +stamp!("aarch64", Neon, f32); +stamp!("aarch64", Neon, half::f16); + +// Auto and Reference get hand-written impls (different construction paths). + +impl RunBenchmark for Kernel +where + T: Copy + 'static + BuildArchQc, + StandardUniform: Distribution, +{ + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let qc = >::build_auto(data.queries.as_view()); + let dist = OptimizedDistance(qc); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } +} + +impl RunBenchmark for Kernel +where + T: Copy + 'static, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: super::driver::Distance, +{ + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let dist = ReferenceDistance(data.queries.as_view().into()); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } +} + +////////////////////// +// Registration // +////////////////////// + +pub(super) fn register(benchmarks: &mut diskann_benchmark_runner::registry::Benchmarks) { + benchmarks.register_regression("multi-vector-op-f32-auto", Kernel::::new()); + 
benchmarks.register_regression("multi-vector-op-f16-auto", Kernel::::new()); + + benchmarks.register_regression("multi-vector-op-f32-scalar", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-scalar", + Kernel::::new(), + ); + + benchmarks.register_regression( + "multi-vector-op-f32-reference", + Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-reference", + Kernel::::new(), + ); + + #[cfg(target_arch = "x86_64")] + { + benchmarks.register_regression("multi-vector-op-f32-x86_64_V3", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-x86_64_V3", + Kernel::::new(), + ); + benchmarks.register_regression("multi-vector-op-f32-x86_64_V4", Kernel::::new()); + benchmarks.register_regression( + "multi-vector-op-f16-x86_64_V4", + Kernel::::new(), + ); + } + + #[cfg(target_arch = "aarch64")] + { + benchmarks.register_regression( + "multi-vector-op-f32-aarch64_neon", + Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-aarch64_neon", + Kernel::::new(), + ); + } +} diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs new file mode 100644 index 000000000..90426e571 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector/mod.rs @@ -0,0 +1,233 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector MaxSim distance benchmarks with regression detection. +//! +//! This module is a **kernel-research substrate**, not just a benchmark. It +//! supports two distinct use cases: +//! +//! 1. **Head-to-head ISA (instruction set architecture) comparison.** Library +//! kernels are registered per arch (`scalar`, `x86-64-v3`, `x86-64-v4`, +//! `aarch64-neon`) plus `auto` (CPU-detected) and `reference` (fallback). +//! Pinning to a specific ISA lets you compare e.g. AVX2 vs AVX512 on the +//! same AVX512 host. +//! +//! 2. 
**Experimental kernel authoring.** External crates and the +//! `experimental/` submodule can author new SIMD micro-kernels by +//! implementing the public `Kernel` trait in +//! `diskann-quantization::multi_vector::distance::kernels`, plug them +//! into the existing cache-aware tile orchestrator (`tiled_reduce`), +//! and slot them into the benchmark via +//! `QueryComputer::from_dyn(Box::new(...))`. +//! +//! # Adding a new experimental kernel +//! +//! See `experimental/template.rs` for the full step-by-step workflow with +//! a worked example. Summary: +//! +//! 1. Add a variant to [`crate::inputs::multi_vector::Arch`]. +//! 2. Implement `Kernel` for your micro-kernel. +//! 3. Implement `DynQueryComputer` for your adapter, calling +//! `tiled_reduce` with your kernel. +//! 4. Add a marker type + `DispatchRule` impl so the new variant +//! routes to your kernel. +//! 5. Add a `RunBenchmark` impl + `register_regression(...)` call +//! in `experimental::register`. +//! +//! **Validate experimental kernels under Miri:** +//! - Construct arch tokens via `Scalar::new()` (Miri-safe) or +//! `V4::new_checked_miri()` (Miri-safe AVX-512 emulation). `V3::new_checked()` +//! and `Neon::new_checked()` don't have `_miri` variants today; if you need +//! them under Miri, follow `V4::new_checked_miri()`'s pattern. +//! - Gate Miri-unsupported intrinsics with `#[cfg(not(miri))]`. +//! - Reduce test-sweep size under `cfg(miri)` to keep runtimes reasonable. + +use diskann_benchmark_runner::registry::Benchmarks; + +cfg_if::cfg_if! 
{ + if #[cfg(feature = "multi-vector")] { + mod driver; + mod experimental; + mod library_kernels; + + pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + library_kernels::register(benchmarks); + experimental::register(benchmarks); + } + } else { + crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); + + pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + imp::register("multi-vector-op", benchmarks); + } + } +} + +#[cfg(all(test, feature = "multi-vector"))] +mod tests { + use std::num::NonZeroUsize; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::{ + datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles, + MicroSeconds, + }, + }; + + use super::driver::{CheckResult, Comparison, RunResult}; + use super::library_kernels::{Auto, Kernel}; + use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance, Run}; + + fn tiny_run() -> Run { + Run { + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + arch: Arch::Auto, + runs: vec![tiny_run()], + } + } + + fn tiny_result(minimum: u64) -> RunResult { + let mut latencies = vec![MicroSeconds::new(minimum)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: tiny_run(), + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + // Build a result whose `run` diverges from `tiny_run()` so the + // regression check's `b.run == a.run` invariant fires. 
+ let mut latencies = vec![MicroSeconds::new(100)]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + let mismatched_result = RunResult { + run: Run { + num_query_vectors: NonZeroUsize::new(4).unwrap(), + ..tiny_run() + }, + latencies, + percentiles, + }; + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![mismatched_result], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(100)], + &vec![tiny_result(106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Q"), "rendered = {rendered}"); + assert!(rendered.contains("Dim"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 
means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. + #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(0)], + &vec![tiny_result(0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } +} diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs index 856412e2a..414a0b52e 100644 --- a/diskann-benchmark/src/inputs/mod.rs +++ b/diskann-benchmark/src/inputs/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod disk; pub(crate) mod exhaustive; pub(crate) mod filters; pub(crate) mod graph_index; +pub(crate) mod multi_vector; pub(crate) mod save_and_load; pub(crate) fn register_inputs( @@ -16,6 +17,7 @@ pub(crate) fn register_inputs( exhaustive::register_inputs(registry)?; disk::register_inputs(registry)?; filters::register_inputs(registry)?; + multi_vector::register_inputs(registry)?; Ok(()) } diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs new file mode 100644 index 000000000..150d72a8e --- /dev/null +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -0,0 +1,187 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +use std::num::NonZeroUsize; + +use diskann_benchmark_runner::{ + utils::{datatype::DataType, num::NonNegativeFinite}, + CheckDeserialization, Checker, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::{as_input, Example}; + +////////////// +// Registry // +////////////// + +as_input!(MultiVectorOp); +as_input!(MultiVectorTolerance); + +pub(super) fn register_inputs( + registry: &mut diskann_benchmark_runner::registry::Inputs, +) -> anyhow::Result<()> { + registry.register::()?; + registry.register::()?; + Ok(()) +} + +//////////////// +// Enum types // +//////////////// + +/// Which kernel to benchmark. 
+/// +/// Mirrors `diskann-benchmark-simd`'s `Arch` enum: kebab-case serialization, +/// one variant per supported ISA plus `Reference` (fallback) and `Auto` +/// (host-portable). Marked `#[non_exhaustive]` so experimental kernels can +/// add variants without breaking JSON configs. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +#[non_exhaustive] +pub(crate) enum Arch { + #[serde(rename = "x86-64-v4")] + #[allow(non_camel_case_types)] + X86_64_V4, + #[serde(rename = "x86-64-v3")] + #[allow(non_camel_case_types)] + X86_64_V3, + Neon, + Scalar, + Reference, + /// Auto-dispatch to the host's best supported arch (calls `QueryComputer::new`). + Auto, +} + +impl std::fmt::Display for Arch { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::X86_64_V4 => "x86-64-v4", + Self::X86_64_V3 => "x86-64-v3", + Self::Neon => "neon", + Self::Scalar => "scalar", + Self::Reference => "reference", + Self::Auto => "auto", + }; + write!(f, "{}", st) + } +} + +/// One benchmark configuration: a single shape measurement. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct Run { + pub(crate) num_query_vectors: NonZeroUsize, + pub(crate) num_doc_vectors: NonZeroUsize, + pub(crate) dim: NonZeroUsize, + pub(crate) loops_per_measurement: NonZeroUsize, + pub(crate) num_measurements: NonZeroUsize, +} + +/////////////////////// +// Multi-Vector Op // +/////////////////////// + +/// A complete multi-vector benchmark job. 
+#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct MultiVectorOp { + pub(crate) element_type: DataType, + pub(crate) arch: Arch, + pub(crate) runs: Vec, +} + +impl MultiVectorOp { + pub(crate) const fn tag() -> &'static str { + "multi-vector-op" + } +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorOp { + fn example() -> Self { + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + num_query_vectors: NonZeroUsize::new(32).unwrap(), + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + num_query_vectors: NonZeroUsize::new(64).unwrap(), + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Self { + element_type: DataType::Float32, + arch: Arch::Auto, + runs, + } + } +} + +macro_rules! write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + write_field!(f, "element type", self.element_type)?; + write_field!(f, "arch", self.arch)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +///////////////////////////// +// Multi-Vector Tolerance // +///////////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. 
+/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub(crate) struct MultiVectorTolerance { + pub(crate) min_time_regression: NonNegativeFinite, +} + +impl MultiVectorTolerance { + pub(crate) const fn tag() -> &'static str { + "multi-vector-tolerance" + } +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorTolerance { + fn example() -> Self { + Self { + min_time_regression: NonNegativeFinite::new(0.05) + .expect("0.05 is a valid non-negative finite"), + } + } +} diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index a35d85427..5f641dd9f 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -776,6 +776,92 @@ mod tests { assert!(!output_path.exists()); } + /////////////////// + // Multi-Vector // + /////////////////// + + #[test] + fn multi_vector_integration() { + let path = example_directory().join("multi-vector.json"); + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + assert!(!output_path.exists()); + + let modified_input_path = tempdir.path().join("input.json"); + + let mut raw = value_from_file(&path); + prefix_search_directories(&mut raw, &root_directory()); + save_to_file(&modified_input_path, &raw); + + run_multi_vector_integration(&modified_input_path, &output_path) + } + + #[cfg(feature = "multi-vector")] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + 
cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + // Check that the results file is generated. + assert!(output_path.exists()); + } + + #[cfg(not(feature = "multi-vector"))] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + let err = cli.run(&mut output).unwrap_err(); + println!("err = {:?}", err); + + let output = String::from_utf8(output.into_inner()).unwrap(); + assert!(output.contains("\"multi-vector\" feature")); + println!("output = {}", output); + + // The output file should not have been created because we failed the test. + assert!(!output_path.exists()); + } + + #[test] + #[cfg(feature = "multi-vector")] + fn multi_vector_check_verify() { + let input_path = example_directory().join("multi-vector.json"); + let tolerance_path = project_directory() + .join("perf_test_inputs") + .join("multi-vector-tolerance.json"); + + let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify { + tolerances: tolerance_path, + input_file: input_path, + }); + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + } + #[test] fn quiet_suppresses_check_target_warning() { let cli = Cli::from_commands(Commands::Skeleton, true); diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs index a535c68dc..e6dc8a772 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs @@ -34,7 +34,7 @@ impl where A: Architecture, 
F32Kernel: Kernel, - layouts::BlockTransposed: layouts::ConvertTo as Kernel>::Left> + layouts::BlockTransposedLayout: layouts::ConvertTo as Kernel>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo as Kernel>::Right> + layouts::Layout, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs index a900ea356..602da6324 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs @@ -30,7 +30,7 @@ mod scalar; mod v3; /// Zero-sized kernel type for f32 micro-kernels with block size `GROUP`. -pub(crate) struct F32Kernel; +pub struct F32Kernel; #[inline(never)] #[cold] @@ -66,7 +66,7 @@ pub(super) fn max_ip_kernel( budget: TileBudget, ) where F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: layouts::ConvertTo as Kernel>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo as Kernel>::Right> + layouts::Layout, @@ -117,7 +117,7 @@ impl where A: Architecture, Self: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: layouts::ConvertTo>::Left> + layouts::Layout, layouts::RowMajor: layouts::ConvertTo>::Right> + layouts::Layout, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs index bd8fb1c4a..2a230ca62 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs @@ -27,7 +27,7 @@ diskann_wide::alias!(f32s = ::f32x8); // A_PANEL(8) * k A elements, UNROLL * k B elements, and A_PANEL(8) // scratch elements — all within the bounds guaranteed by `tiled_reduce`. 
unsafe impl Kernel for F32Kernel<8> { - type Left = layouts::BlockTransposed; + type Left = layouts::BlockTransposedLayout; type Right = layouts::RowMajor; const A_PANEL: usize = 8; const B_PANEL: usize = 2; diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs index b05195b1e..319cf1cda 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs @@ -17,7 +17,7 @@ diskann_wide::alias!(f32s = ::f32x8); // A_PANEL(16) * k A elements, UNROLL * k B elements, and A_PANEL(16) // scratch elements — all within the bounds guaranteed by `tiled_reduce`. unsafe impl Kernel for F32Kernel<16> { - type Left = layouts::BlockTransposed; + type Left = layouts::BlockTransposedLayout; type Right = layouts::RowMajor; const A_PANEL: usize = 16; const B_PANEL: usize = 4; diff --git a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs index e1ec8dd36..54962acaa 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs @@ -4,7 +4,7 @@ //! Layout markers and tile-level conversion traits. //! //! - [`Layout`] — marker trait: memory layout + element type. -//! - [`BlockTransposed`] / [`RowMajor`] — zero-sized layout markers. +//! - [`BlockTransposedLayout`] / [`RowMajor`] — zero-sized layout markers. //! - [`DescribeLayout`] — bridges matrix types to layout markers. //! - [`ConvertTo`] — tile-level conversion (blanket identity + f16→f32). @@ -17,7 +17,7 @@ use diskann_wide::arch::Target2; // ── Layout trait ───────────────────────────────────── /// Memory layout and element type marker for tile data. 
-pub(super) trait Layout { +pub trait Layout { type Element: Copy; } @@ -25,28 +25,36 @@ pub(super) trait Layout { /// Block-transposed tile layout: `GROUP` rows per block, `PACK` columns /// interleaved. Matches [`BlockTransposedRef`](crate::multi_vector::BlockTransposedRef). -pub(super) struct BlockTransposed(PhantomData); - -impl BlockTransposed { +/// +/// This is the zero-sized **layout marker** used in [`Kernel::Left`] / +/// [`Kernel::Right`](super::Kernel) associated types. It is distinct +/// from the owning storage type [`BlockTransposed`](crate::multi_vector::BlockTransposed) +/// — the marker carries layout information at the type level; the owning +/// type holds actual data. +pub struct BlockTransposedLayout(PhantomData); + +impl BlockTransposedLayout { pub(super) fn new() -> Self { Self(PhantomData) } } -impl Copy for BlockTransposed {} +impl Copy for BlockTransposedLayout {} -impl Clone for BlockTransposed { +impl Clone for BlockTransposedLayout { fn clone(&self) -> Self { *self } } -impl Layout for BlockTransposed { +impl Layout + for BlockTransposedLayout +{ type Element = T; } /// Dense row-major tile layout. Matches [`MatRef>`](crate::multi_vector::MatRef). -pub(super) struct RowMajor(PhantomData); +pub struct RowMajor(PhantomData); impl RowMajor { pub(super) fn new() -> Self { @@ -70,7 +78,7 @@ impl Layout for RowMajor { /// Bridges a concrete matrix type to its [`Layout`] marker, enabling /// type inference of [`ConvertTo`] parameters at call sites. 
-pub(super) trait DescribeLayout { +pub trait DescribeLayout { type Layout: Layout; fn layout(&self) -> Self::Layout; @@ -79,10 +87,10 @@ pub(super) trait DescribeLayout { impl DescribeLayout for crate::multi_vector::BlockTransposedRef<'_, T, GROUP, PACK> { - type Layout = BlockTransposed; + type Layout = BlockTransposedLayout; fn layout(&self) -> Self::Layout { - BlockTransposed::new() + BlockTransposedLayout::new() } } @@ -108,7 +116,7 @@ impl DescribeLayout for crate::multi_vector::MatRef<'_, crate::multi_ve /// - `convert` reads at most `rows * k` source elements. /// - `convert` writes only within `buf`. /// - The returned pointer is valid until the next `&mut` access to `buf`. -pub(super) unsafe trait ConvertTo: Layout { +pub unsafe trait ConvertTo: Layout { /// Staging buffer for converted tile data (`()` for identity conversions). type Buffer; @@ -162,7 +170,8 @@ unsafe impl ConvertTo for L { // into `rows * k` f32 values in `buf`. The returned pointer is // `buf.as_ptr()`, valid until the next `&mut` access to `buf`. unsafe impl - ConvertTo> for BlockTransposed + ConvertTo> + for BlockTransposedLayout where A: Architecture, SliceCast: for<'a> Target2, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs index bd9121a24..e7fbd16ed 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs @@ -12,27 +12,45 @@ //! - **Query**: Block-transposed (`GROUP` vectors per block, dimensions contiguous //! within each block). The block size is determined by the kernel's `A_PANEL`. //! - **Document**: Row-major (standard [`MatRef`](crate::multi_vector::MatRef) format). +//! +//! The [`Kernel`] trait + [`tiled_reduce`] + [`layouts`] are public so +//! external crates can plug new micro-kernels into the existing orchestrator. 
pub(super) mod f16; pub(super) mod f32; -mod layouts; +pub mod layouts; mod reduce; mod tiled_reduce; +pub use tiled_reduce::tiled_reduce; + // ── Tile budget ────────────────────────────────────────────────── /// Cache budgets fed to the tile planner. /// /// `Default` returns the production budgets derived from hardcoded L1/L2 -/// cache-size estimates and fixed fractions. +/// cache-size estimates and fixed fractions. Researchers benchmarking with +/// non-default cache assumptions can construct a custom [`TileBudget`] via +/// [`TileBudget::new`] and pass it to [`tiled_reduce`]. #[derive(Debug, Clone, Copy)] -struct TileBudget { +pub struct TileBudget { /// L2 budget in bytes reserved for A tiles. l2_a: usize, /// L1 budget in bytes reserved for B tiles (before A-panel subtraction). l1_b: usize, } +impl TileBudget { + /// Construct a [`TileBudget`] with explicit L2 and L1 byte budgets. + /// + /// `l2_a` is the budget the tile planner uses to size A tiles; `l1_b` + /// is the budget for B tiles (one A micro-panel is subtracted at + /// runtime since both must coexist in L1 during the inner loop). + pub fn new(l2_a: usize, l1_b: usize) -> Self { + Self { l2_a, l1_b } + } +} + impl Default for TileBudget { // TODO: Replace hardcoded fallbacks with detected cache sizes // (e.g. via `diskann_platform`, env-var override, or runtime query). @@ -51,7 +69,7 @@ impl Default for TileBudget { // ── Kernel trait ───────────────────────────────────────────────── -/// SIMD micro-kernel for the [`tiled_reduce`](tiled_reduce::tiled_reduce) loop. +/// SIMD micro-kernel for the [`tiled_reduce`] loop. /// /// The kernel only sees already-converted data: storage-layout to /// kernel-layout conversion is handled at tile boundaries by @@ -59,17 +77,32 @@ impl Default for TileBudget { /// pointers reference `::Element` / /// `::Element` directly. 
/// +/// # Invariant +/// +/// When pairing this kernel with the owning storage type +/// [`BlockTransposed`](crate::multi_vector::BlockTransposed) via +/// [`tiled_reduce`], the storage's `GROUP` const must equal this kernel's +/// [`A_PANEL`](Self::A_PANEL). The library's own f32 / f16 paths enforce +/// this with a `const { assert!(...) }` in `max_ip_kernel`; external +/// implementors must uphold it manually. +/// /// # Safety /// /// Implementors must respect the per-method `# Safety` contracts on /// [`full_panel`](Self::full_panel) and [`partial_panel`](Self::partial_panel). -unsafe trait Kernel { +/// Implementations should be validated under Miri: construct arch tokens +/// via `*::new_checked_miri()` in tests and gate Miri-unsupported +/// intrinsics with `#[cfg(not(miri))]`. +pub unsafe trait Kernel { /// Layout consumed by the A (left / query) side of the micro-kernel. type Left: layouts::Layout; /// Layout consumed by the B (right / document) side of the micro-kernel. type Right: layouts::Layout; /// Number of A rows processed per micro-kernel invocation. + /// + /// Callers of [`tiled_reduce`] must guarantee + /// `a_padded_nrows % A_PANEL == 0`. const A_PANEL: usize; /// Number of B rows processed per micro-kernel invocation. const B_PANEL: usize; @@ -79,10 +112,13 @@ unsafe trait Kernel { /// # Safety /// /// * `a` must point to `A_PANEL * k` contiguous elements of - /// `::Element`. + /// `::Element`, properly aligned for that layout. /// * `b` must point to `B_PANEL * k` contiguous elements of - /// `::Element`. + /// `::Element`, properly aligned for that layout. /// * `r` must point to at least `A_PANEL` writable `f32` values. + /// * `k > 0`. + /// * The caller must invoke this from within an `arch.run3` (or + /// equivalent) so that target_feature is active for the entire body. 
unsafe fn full_panel( arch: A, a: *const ::Element, @@ -95,11 +131,8 @@ unsafe trait Kernel { /// /// # Safety /// - /// * `a` must point to `A_PANEL * k` contiguous elements of - /// `::Element`. - /// * `b` must point to `remainder * k` contiguous elements of - /// `::Element`. - /// * `r` must point to at least `A_PANEL` writable `f32` values. + /// Same as [`full_panel`](Self::full_panel) except `b` points to + /// `remainder * k` contiguous elements and `1 <= remainder < B_PANEL`. unsafe fn partial_panel( arch: A, remainder: usize, diff --git a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs index ff873c01f..285d823b6 100644 --- a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs +++ b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs @@ -89,7 +89,7 @@ impl FullReduce { /// * `b_ptr` must be valid for `b_nrows * k` elements of `BElem`. /// * `scratch` must have length ≥ `a_padded_nrows` and be initialized by caller. #[allow(clippy::too_many_arguments)] -pub(super) unsafe fn tiled_reduce( +pub unsafe fn tiled_reduce( arch: A, ca: &LA, cb: &LB, @@ -343,7 +343,7 @@ mod tests { let b = vec![0.0f32; 2 * k]; let mut scratch = vec![f32::MIN; 16]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: pointers and scratch are correctly sized; we expect a panic. @@ -373,7 +373,7 @@ mod tests { let b = Vec::::new(); let mut scratch = vec![f32::MIN; a_rows]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: k == 0 so no elements are read; pointers are never dereferenced. 
@@ -402,7 +402,7 @@ mod tests { let a_rows = 8; let mut scratch = vec![f32::MIN; a_rows]; - let ca = layouts::BlockTransposed::::new(); + let ca = layouts::BlockTransposedLayout::::new(); let cb = layouts::RowMajor::::new(); // SAFETY: k == 0, b_nrows == 0; no elements read. @@ -516,7 +516,7 @@ mod tests { A: Architecture, T: Copy + Default, F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: ConvertTo as Kernel>::Left> + Layout, layouts::RowMajor: ConvertTo as Kernel>::Right> + Layout, @@ -698,7 +698,7 @@ mod tests { A: Architecture, T: Copy + Default, F32Kernel: Kernel, - layouts::BlockTransposed: + layouts::BlockTransposedLayout: ConvertTo as Kernel>::Left> + Layout, layouts::RowMajor: ConvertTo as Kernel>::Right> + Layout, diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs index 853f60753..354a47afe 100644 --- a/diskann-quantization/src/multi_vector/distance/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/mod.rs @@ -50,10 +50,10 @@ //! ``` mod fallback; -mod kernels; +pub mod kernels; mod max_sim; mod query_computer; pub use fallback::QueryMatRef; pub use max_sim::{Chamfer, MaxSim, MaxSimError}; -pub use query_computer::QueryComputer; +pub use query_computer::{DynQueryComputer, QueryComputer}; diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs index 9bb348a6a..cb575c09f 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs @@ -19,6 +19,18 @@ impl QueryComputer { pub fn new(query: MatRef<'_, Standard>) -> Self { diskann_wide::arch::dispatch1_no_features(BuildComputer, query) } + + /// Build an f16 query computer pinned to a specific architecture token. + /// + /// See [`QueryComputer::::from_arch`] for the rationale and usage. 
+ pub fn from_arch(query: MatRef<'_, Standard>, arch: A) -> Self + where + A: Architecture, + BuildComputer: + for<'a> diskann_wide::arch::Target1>>, + { + arch.run1(BuildComputer, query) + } } impl DynQueryComputer @@ -51,8 +63,9 @@ where } } +/// Architecture-dispatch target for `QueryComputer::` construction. #[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; +pub struct BuildComputer; impl diskann_wide::arch::Target1, MatRef<'_, Standard>> for BuildComputer diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs index 9ff16b8b4..f8193ad7e 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs @@ -19,6 +19,20 @@ impl QueryComputer { pub fn new(query: MatRef<'_, Standard>) -> Self { diskann_wide::arch::dispatch1_no_features(BuildComputer, query) } + + /// Build an f32 query computer pinned to a specific architecture token. + /// + /// The caller obtains the token via `Scalar::new()` (always available) + /// or `V3::new_checked()` / `V4::new_checked()` / `Neon::new_checked()` + /// (which check CPU support). Use this constructor to A/B compare kernels + /// across ISAs on the same machine. + pub fn from_arch(query: MatRef<'_, Standard>, arch: A) -> Self + where + A: Architecture, + BuildComputer: for<'a> diskann_wide::arch::Target1>>, + { + arch.run1(BuildComputer, query) + } } impl DynQueryComputer for Prepared> @@ -50,8 +64,9 @@ where } } +/// Architecture-dispatch target for `QueryComputer::` construction. 
#[derive(Debug, Clone, Copy)] -pub(super) struct BuildComputer; +pub struct BuildComputer; impl diskann_wide::arch::Target1, MatRef<'_, Standard>> for BuildComputer diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs index fbe84fcd3..efd552076 100644 --- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs +++ b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs @@ -43,6 +43,16 @@ pub struct QueryComputer { } impl QueryComputer { + /// Wrap any [`DynQueryComputer`] implementation as a `QueryComputer`. + /// + /// This is the public seam for experimental kernels: implement + /// [`DynQueryComputer`] on a custom struct, then wrap it here so the + /// existing [`QueryComputer::max_sim`] / [`QueryComputer::chamfer`] veneer + /// works against it. + pub fn from_dyn(inner: Box>) -> Self { + Self { inner } + } + /// Number of logical (non-padded) query vectors. #[inline] pub fn nrows(&self) -> usize { @@ -88,8 +98,27 @@ impl QueryComputer { } } -trait DynQueryComputer: std::fmt::Debug + Send + Sync { +/// Object-safe interface for "anything that can compute MaxSim for a [`QueryComputer`]". +/// +/// The library's own architecture-dispatched path implements this on the +/// internal `Prepared` carriers. External crates implement it on their +/// own structs and wrap via [`QueryComputer::from_dyn`]. +/// +/// # Contract +/// +/// - [`compute_max_sim`](Self::compute_max_sim) is only invoked by +/// [`QueryComputer::max_sim`], which has already asserted +/// `scores.len() == self.nrows()` and short-circuited the zero-doc case. +/// Implementations may rely on `scores.len() == self.nrows()` and +/// `doc.num_vectors() > 0`. +/// - Implementations must populate all `nrows()` entries of `scores`. +/// [`QueryComputer::chamfer`] sums every entry, so leaving any trailing +/// slot unwritten would silently corrupt the result. 
+pub trait DynQueryComputer: std::fmt::Debug + Send + Sync { + /// Compute MaxSim into `scores`. See trait-level docs for the contract. fn compute_max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); + + /// Number of query rows. fn nrows(&self) -> usize; } @@ -287,4 +316,38 @@ mod tests { test_matches_fallback!(f32, f32, 1e-10, "f32 "); test_matches_fallback!(f16, half::f16, 1e-10, "f16 "); + + // ============================================================ + // from_dyn: wrap a trivial custom DynQueryComputer. + // ============================================================ + #[derive(Debug)] + struct ConstantComputer { + nrows: usize, + value: f32, + } + + impl DynQueryComputer for ConstantComputer { + fn compute_max_sim(&self, _doc: MatRef<'_, Standard>, scores: &mut [f32]) { + for s in scores.iter_mut() { + *s = self.value; + } + } + fn nrows(&self) -> usize { + self.nrows + } + } + + #[test] + fn from_dyn_wraps_custom_impl() { + let computer = QueryComputer::::from_dyn(Box::new(ConstantComputer { + nrows: 3, + value: -1.5, + })); + assert_eq!(computer.nrows(), 3); + + let doc = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2); + let mut scores = vec![0.0f32; 3]; + computer.max_sim(doc, &mut scores); + assert_eq!(scores, vec![-1.5, -1.5, -1.5]); + } } diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index 70629d44c..734cb7247 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -712,6 +712,22 @@ impl Clone for Mat { } impl Mat> { + /// Create a new matrix by invoking `f` once per element in row-major order. 
+ /// + /// # Example + /// ``` + /// use diskann_quantization::multi_vector::{Mat, Standard}; + /// let mut n = 0; + /// let mat = Mat::from_fn(Standard::::new(1, 4).unwrap(), || { n += 1; n }); + /// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]); + /// ``` + pub fn from_fn T>(repr: Standard, mut f: F) -> Self { + let b: Box<[T]> = (0..repr.num_elements()).map(|_| f()).collect(); + + // SAFETY: By construction, `b` has length `repr.num_elements()`. + unsafe { repr.box_to_mat(b) } + } + /// Returns the raw dimension (columns) of the vectors in the matrix. #[inline] pub fn vector_dim(&self) -> usize { @@ -1767,6 +1783,18 @@ mod tests { } } + #[test] + fn test_standard_from_fn() { + let mut counter: i32 = 0; + let m = Mat::from_fn(Standard::::new(2, 3).unwrap(), || { + let v = counter; + counter += 1; + v + }); + + assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]); + } + #[test] fn matref_new_slice_length_error() { let repr = Standard::::new(3, 4).unwrap();