diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml
index bebaf4b8e..ecc3a53dd 100644
--- a/diskann-benchmark/Cargo.toml
+++ b/diskann-benchmark/Cargo.toml
@@ -63,6 +63,9 @@ scalar-quantization = []
 # Enable minmax-quantization based algorithms
 minmax-quantization = []
 
+# Enable multi-vector MaxSim distance benchmarks
+multi-vector = []
+
 # Enable Disk Index benchmarks
 disk-index = [
     "diskann-disk/perf_test",
diff --git a/diskann-benchmark/example/multi-vector.json b/diskann-benchmark/example/multi-vector.json
new file mode 100644
index 000000000..7a4e59539
--- /dev/null
+++ b/diskann-benchmark/example/multi-vector.json
@@ -0,0 +1,47 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
new file mode 100644
index 000000000..8d5997199
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json
@@ -0,0 +1,16 @@
+{
+  "checks": [
+    {
+      "input": {
+        "type": "multi-vector-op",
+        "content": {}
+      },
+      "tolerance": {
+        "type": "multi-vector-tolerance",
+        "content": {
+          "min_time_regression": 0.05
+        }
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/perf_test_inputs/multi-vector.json b/diskann-benchmark/perf_test_inputs/multi-vector.json
new file mode 100644
index 000000000..57922fe10
--- /dev/null
+++ b/diskann-benchmark/perf_test_inputs/multi-vector.json
@@ -0,0 +1,149 @@
+{
+  "search_directories": [],
+  "jobs": [
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "auto",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "scalar",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float32",
+        "arch": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "x86-64-v3",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "x86-64-v4",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    },
+    {
+      "type": "multi-vector-op",
+      "content": {
+        "element_type": "float16",
+        "arch": "reference",
+        "runs": [
+          { "num_query_vectors": 8,  "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 },
+          { "num_query_vectors": 16, "num_doc_vectors": 64,   "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 128,  "dim": 384, "loops_per_measurement": 20,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 16,   "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 264, "loops_per_measurement": 50,  "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10,  "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2,   "num_measurements": 50 },
+          { "num_query_vectors": 64, "num_doc_vectors": 32,   "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 },
+          { "num_query_vectors": 32, "num_doc_vectors": 32,   "dim": 512, "loops_per_measurement": 50,  "num_measurements": 50 }
+        ]
+      }
+    }
+  ]
+}
diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs
index 24fe91d7e..0d1c61345 100644
--- a/diskann-benchmark/src/backend/mod.rs
+++ b/diskann-benchmark/src/backend/mod.rs
@@ -7,10 +7,12 @@ mod disk_index;
 mod exhaustive;
 mod filters;
 mod index;
+mod multi_vector;
 
 pub(crate) fn register_benchmarks(registry: &mut diskann_benchmark_runner::registry::Benchmarks) {
     exhaustive::register_benchmarks(registry);
     disk_index::register_benchmarks(registry);
     index::register_benchmarks(registry);
     filters::register_benchmarks(registry);
+    multi_vector::register_benchmarks(registry);
 }
diff --git a/diskann-benchmark/src/backend/multi_vector/driver.rs b/diskann-benchmark/src/backend/multi_vector/driver.rs
new file mode 100644
index 000000000..2f83eb22f
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/driver.rs
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Shared benchmark infrastructure for multi-vector kernels.
+//!
+//! Houses the timing harness ([`run_loops`]), data fixtures ([`Data`]), result
+//! types ([`RunResult`], [`Comparison`], [`CheckResult`]), and the trait-object
+//! [`Distance<T>`] boundary that both library and experimental kernels go
+//! through. None of the contents are kernel-aware.
+
+use diskann_benchmark_runner::utils::{
+    fmt::Table, num::relative_change, percentiles, MicroSeconds,
+};
+use diskann_quantization::multi_vector::distance::QueryMatRef;
+use diskann_quantization::multi_vector::{Mat, MatRef, MaxSim, QueryComputer, Standard};
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::{DistanceFunctionMut, PureDistanceFunction};
+use rand::{
+    distr::{Distribution, StandardUniform},
+    rngs::StdRng,
+    SeedableRng,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::multi_vector::{MultiVectorTolerance, Run};
+
+///////////////////
+// Data fixtures //
+///////////////////
+
+/// Randomly filled query / doc matrices for a single benchmark run.
+pub(super) struct Data<T: Copy> {
+    pub(super) queries: Mat<Standard<T>>, // `Standard::new(num_query_vectors, dim)`
+    pub(super) docs: Mat<Standard<T>>, // `Standard::new(num_doc_vectors, dim)`
+}
+
+impl<T: Copy> Data<T>
+where
+    StandardUniform: Distribution<T>,
+{
+    pub(super) fn new(run: &Run) -> Self {
+        let mut rng = StdRng::seed_from_u64(0x12345); // fixed seed, not derived from `run`
+        let queries = Mat::from_fn(
+            Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(),
+            || StandardUniform.sample(&mut rng),
+        );
+        let docs = Mat::from_fn(
+            Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(),
+            || StandardUniform.sample(&mut rng), // same RNG, continuing after `queries`
+        );
+        Self { queries, docs }
+    }
+}
+
+//////////////////////
+// Distance kernels //
+//////////////////////
+
+/// Object-safe abstraction over a per-shape distance executor.
+///
+/// `max_sim` takes a document matrix and writes its results into `scores`.
+/// `OptimizedDistance` wraps any [`QueryComputer<T>`] (library arch-pinned
+/// ones via `from_arch`, experimental ones via `from_dyn`), so the driver's
+/// hot loop dispatches through one vtable hop regardless of which kernel
+/// produced the computer. `ReferenceDistance` is the only implementor that
+/// bypasses `QueryComputer` and uses the [`MaxSim`] fallback directly.
+pub(super) trait Distance<T: Copy> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+}
+
+/// Distance executor wrapping a [`QueryComputer<T>`] — covers all arch-pinned,
+/// auto-dispatched, and experimental kernels.
+pub(super) struct OptimizedDistance<T: Copy>(pub(super) QueryComputer<T>);
+
+impl<T: Copy> Distance<T> for OptimizedDistance<T> {
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        self.0.max_sim(doc, scores); // pure forwarding to the wrapped computer
+    }
+}
+
+/// Distance executor driving the [`MaxSim`] fallback path over a borrowed query matrix.
+pub(super) struct ReferenceDistance<'a, T: Copy>(pub(super) QueryMatRef<'a, Standard<T>>);
+
+impl<T: Copy> Distance<T> for ReferenceDistance<'_, T>
+where
+    InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>,
+{
+    fn max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]) {
+        // `MaxSim::new` is a non-empty check + pointer wrap, free per iteration.
+        let mut max_sim = MaxSim::new(scores).unwrap();
+        let _ = max_sim.evaluate(self.0, doc); // NOTE(review): result is discarded — confirm
+    }
+}
+
+//////////////////////
+// Timing harness   //
+//////////////////////
+
+/// Times `body`: records `num_measurements` samples, each the elapsed time of
+/// `loops_per_measurement` back-to-back calls, then summarizes the samples
+/// with `compute_percentiles`.
+fn run_loops<F: FnMut()>(run: &Run, mut body: F) -> RunResult {
+    let mut latencies = Vec::with_capacity(run.num_measurements.get());
+
+    for _ in 0..run.num_measurements.get() {
+        let timer = std::time::Instant::now();
+        for _ in 0..run.loops_per_measurement.get() {
+            body();
+        }
+        latencies.push(timer.elapsed().into());
+    }
+
+    let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap();
+    RunResult {
+        run: run.clone(),
+        latencies,
+        percentiles,
+    }
+}
+
+/// Shared loop nest. The trait-object dispatch happens once per `max_sim` call
+/// (i.e. once per inner-loop iteration of `run_loops`); each call does
+/// O(Q·D·dim) work, so the vtable hop is in the noise.
+pub(super) fn run_with_distance<T: Copy>(
+    run: &Run,
+    doc: MatRef<'_, Standard<T>>,
+    dist: &dyn Distance<T>,
+) -> RunResult {
+    let mut scores = vec![0.0f32; run.num_query_vectors.get()]; // one slot per query vector
+    run_loops(run, || {
+        dist.max_sim(doc, &mut scores);
+        std::hint::black_box(&mut scores); // keeps the written scores observable
+    })
+}
+
+//////////////////////
+// Result types     //
+//////////////////////
+
+#[derive(Debug, Clone, Copy)]
+pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); // newtype for `Display` impls
+
+impl<T: ?Sized> std::ops::Deref for DisplayWrapper<'_, T> {
+    type Target = T;
+    fn deref(&self) -> &T {
+        self.0 // expose the wrapped value's own methods through the wrapper
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub(super) struct RunResult {
+    /// The configuration for this run.
+    pub(super) run: Run,
+    /// One latency sample per measurement; each spans `loops_per_measurement` calls.
+    pub(super) latencies: Vec<MicroSeconds>,
+    /// Latency summary (percentiles plus the `mean` used by the report table).
+    pub(super) percentiles: percentiles::Percentiles<MicroSeconds>,
+}
+
+impl RunResult {
+    /// Number of (query vector, doc vector) pairs evaluated per latency sample.
+    pub(super) fn computations_per_latency(&self) -> usize {
+        let pairs = self.run.num_query_vectors.get() * self.run.num_doc_vectors.get();
+        pairs * self.run.loops_per_measurement.get()
+    }
+}
+
+impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.is_empty() {
+            return Ok(()); // no legend and no table for an empty result set
+        }
+
+        writeln!(
+            f,
+            "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)"
+        )?;
+
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Time (ns/IP @ Dim)",
+            "Mean Time (ns/IP @ Dim)",
+            "Loops",
+            "Measurements",
+        ];
+
+        let mut table = Table::new(header, self.len());
+
+        self.iter().enumerate().for_each(|(row, r)| {
+            let mut row = table.row(row);
+
+            let min_latency = r
+                .latencies
+                .iter()
+                .min()
+                .copied()
+                .unwrap_or(MicroSeconds::new(u64::MAX)); // sentinel when there are no samples
+            let mean_latency = r.percentiles.mean;
+            // NOTE(review): `* 1000.0` presumably rescales µs to the ns shown in the header.
+            let computations_per_latency = r.computations_per_latency() as f64;
+            let min_time = min_latency.as_f64() / computations_per_latency * 1000.0;
+            let mean_time = mean_latency / computations_per_latency * 1000.0;
+
+            row.insert(r.run.num_query_vectors, 0);
+            row.insert(r.run.num_doc_vectors, 1);
+            row.insert(r.run.dim, 2);
+            row.insert(format!("{:.3}", min_time), 3);
+            row.insert(format!("{:.3}", mean_time), 4);
+            row.insert(r.run.loops_per_measurement, 5);
+            row.insert(r.run.num_measurements, 6);
+        });
+
+        table.fmt(f)
+    }
+}
+
+//////////////////////
+// Regression Check //
+//////////////////////
+
+/// Per-run comparison of the minimum time measured before and after a change.
+#[derive(Debug, Serialize)]
+pub(super) struct Comparison {
+    pub(super) run: Run,
+    pub(super) tolerance: MultiVectorTolerance,
+    pub(super) before_min: f64, // rendered as "Min Before (ns/IP @ Dim)" by `CheckResult`
+    pub(super) after_min: f64, // rendered as "Min After (ns/IP @ Dim)" by `CheckResult`
+}
+
+/// Aggregated result of the regression check across all runs.
+#[derive(Debug, Serialize)]
+pub(super) struct CheckResult {
+    pub(super) checks: Vec<Comparison>, // one table row per entry when displayed
+}
+
+impl std::fmt::Display for CheckResult {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let header = [
+            "Q",
+            "D",
+            "Dim",
+            "Min Before (ns/IP @ Dim)",
+            "Min After (ns/IP @ Dim)",
+            "Change (%)",
+            "Remark",
+        ];
+
+        let mut table = Table::new(header, self.checks.len());
+
+        for (i, c) in self.checks.iter().enumerate() {
+            let mut row = table.row(i);
+            let change = relative_change(c.before_min, c.after_min); // fraction; shown as % below
+
+            row.insert(c.run.num_query_vectors, 0);
+            row.insert(c.run.num_doc_vectors, 1);
+            row.insert(c.run.dim, 2);
+            row.insert(format!("{:.3}", c.before_min), 3);
+            row.insert(format!("{:.3}", c.after_min), 4);
+            match change {
+                Ok(change) => {
+                    row.insert(format!("{:.3} %", change * 100.0), 5);
+                    if change > c.tolerance.min_time_regression.get() {
+                        row.insert("FAIL", 6); // only failing rows get a remark
+                    }
+                }
+                Err(err) => {
+                    row.insert("invalid", 5); // `relative_change` returned an error
+                    row.insert(err, 6);
+                }
+            }
+        }
+
+        table.fmt(f)
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs
new file mode 100644
index 000000000..b0e106fc4
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/experimental/mod.rs
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Researcher-authored experimental multi-vector kernels.
+//!
+//! See [`template`] for the full kernel-author workflow (writing a `Kernel<A>`
+//! impl, adapting it via `DynQueryComputer<T>`, wiring up dispatch and
+//! registration, and validating under Miri).
+//!
+//! New experimental kernels live in their own module file in this directory.
+//! Their registration goes in [`register`] below.
+
+use diskann_benchmark_runner::registry::Benchmarks;
+
+mod template;
+
+pub(super) fn register(_benchmarks: &mut Benchmarks) {
+    // No experimental kernels registered by default.
+    // To add one, rename `_benchmarks` to `benchmarks` and call
+    // `benchmarks.register_regression(...)` here.
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/experimental/template.rs b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
new file mode 100644
index 000000000..64eeb3e00
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/experimental/template.rs
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! **Template for an experimental multi-vector kernel.**
+//!
+//! Copy this file (e.g. to `v4_wide.rs`), rename `Template*`, change the
+//! `Kernel<A>` impl to your target ISA, and add an `Arch` variant + a
+//! `register_regression` call to wire it up.
+//!
+//! # The 5-step workflow
+//!
+//! 1. **Add an [`Arch`](crate::inputs::multi_vector::Arch) variant** for your
+//!    experimental kernel (e.g. `X86_64_V4_Wide`). The `#[non_exhaustive]`
+//!    attribute on `Arch` makes this a non-breaking addition.
+//! 2. **Author the micro-kernel.** Implement
+//!    [`Kernel<A>`](diskann_quantization::multi_vector::distance::kernels::Kernel)
+//!    on your struct (`unsafe impl Kernel<V4>` etc.), filling in
+//!    `full_panel` and `partial_panel` with your SIMD intrinsics.
+//! 3. **Author the adapter.** Implement
+//!    [`DynQueryComputer<T>`](diskann_quantization::multi_vector::distance::DynQueryComputer)
+//!    on a struct that owns the prepared query data; in `compute_max_sim`,
+//!    call
+//!    [`tiled_reduce`](diskann_quantization::multi_vector::distance::kernels::tiled_reduce)
+//!    with your kernel.
+//! 4. **Add a marker + `DispatchRule<Arch>`.** Mirror the pattern in
+//!    `library_kernels.rs` (e.g. `match_arch_x86_64!`) for your new variant.
+//! 5. **Add a `RunBenchmark<Marker>` impl + `register_regression` call.** Use
+//!    `Kernel::<Marker, T>::new()` as the registered benchmark entry.
+//!
+//! Then validate under Miri before treating the kernel as correct — see the
+//! section below.
+//!
+//! # Validating under Miri (REQUIRED)
+//!
+//! Experimental kernels rely on `unsafe fn full_panel` / `partial_panel`
+//! with raw-pointer arithmetic. Pointer provenance, alignment, and
+//! out-of-bounds bugs are easy to introduce and hard to catch by
+//! inspection. **Run your kernel under Miri before assuming it's correct.**
+//!
+//! Rules:
+//!
+//! - Inside your `#[cfg(test)]` module, construct arch tokens via the
+//!   Miri-friendly variants: `Scalar::new()` (always Miri-safe) or
+//!   `V4::new_checked_miri()` (returns a token unconditionally under
+//!   `cfg(miri)` using AVX-512 emulation, so tests run even when Miri
+//!   can't do real CPU detection). `V3` and `Neon` only expose
+//!   `new_checked()` today — if you need them under Miri, follow
+//!   `V4::new_checked_miri()`'s pattern in `diskann-wide`.
+//! - Any SIMD intrinsic Miri doesn't support must have a scalar fallback
+//!   gated by `#[cfg(miri)]`.
+//! - Add at least one small-shape correctness test that runs your kernel
+//!   against a naive reference and is Miri-friendly.
+//! - Run: `cargo +nightly miri test -p diskann-benchmark --features multi-vector
+//!   backend::multi_vector::experimental::<your_kernel>`. Reduce
+//!   test-sweep size under Miri with `if cfg!(miri) { small } else { full }`
+//!   (see this file's test for the pattern).
+//!
+//! Miri won't catch performance bugs, but it'll catch UB — and UB in an
+//! experimental kernel breaks the benchmark binary, not the kernel you're
+//! trying to measure.
+//!
+//! # This template
+//!
+//! This file defines `TemplateKernel: Kernel<Scalar>` (uses `Scalar` so the
+//! template is host-portable + Miri-friendly) and a `TemplateComputer`
+//! adapter that pipes it through `tiled_reduce`. It is **not registered** as
+//! a benchmark entry — see step 5 in the workflow. The included
+//! `#[cfg(test)]` `template_matches_pinned_scalar` test exercises the API
+//! surface end-to-end so this file catches public-API drift even though it
+//! isn't wired into the benchmark dispatcher.
+
+#![allow(dead_code)]
+
+use diskann_quantization::multi_vector::distance::{
+    kernels::{layouts, tiled_reduce, Kernel, TileBudget},
+    DynQueryComputer,
+};
+use diskann_quantization::multi_vector::{BlockTransposed, BlockTransposedRef, MatRef, Standard};
+use diskann_wide::arch::Scalar;
+
+/// Step 2: the micro-kernel struct. Rename and implement for your target arch.
+pub(super) struct TemplateKernel;
+
+// SAFETY: `full_panel` / `partial_panel` only access `A_PANEL * k` /
+// `B_PANEL * k` source elements and read/update `A_PANEL` destination f32s,
+// matching `Kernel<Scalar>`'s safety contract. The simple scalar
+// computation here is Miri-clean.
+unsafe impl Kernel<Scalar> for TemplateKernel {
+    type Left = layouts::BlockTransposedLayout<f32, 8>;
+    type Right = layouts::RowMajor<f32>;
+    const A_PANEL: usize = 8; // same 8 as `Left`'s block size and `panel::<8, _>` below
+    const B_PANEL: usize = 2; // a full panel is handled by `panel::<8, 2>`
+
+    unsafe fn full_panel(_arch: Scalar, a: *const f32, b: *const f32, k: usize, r: *mut f32) {
+        // SAFETY: a covers A_PANEL * k contiguous block-transposed f32s,
+        // b covers B_PANEL * k contiguous row-major f32s, r covers A_PANEL f32s.
+        unsafe { panel::<8, 2>(a, b, k, r) }
+    }
+
+    unsafe fn partial_panel(
+        _arch: Scalar,
+        remainder: usize,
+        a: *const f32,
+        b: *const f32,
+        k: usize,
+        r: *mut f32,
+    ) {
+        debug_assert!(remainder == 1); // B_PANEL = 2, so a partial panel has exactly one row
+        // SAFETY: as full_panel but with `b` covering `remainder * k` (= `k`) f32s.
+        unsafe { panel::<8, 1>(a, b, k, r) }
+    }
+}
+
+/// Replace this with your SIMD intrinsics. The block-transposed A layout
+/// stores `A_ROWS` contiguous f32s per dimension index `i`, so the q-th
+/// query row at dimension i lives at `a[i * A_ROWS + q]`. The row-major B
+/// layout stores doc d's i-th element at `b[d * k + i]`, where `k` is the
+/// vector dimension. `r` accumulates the max IP per query row; this file's
+/// `compute_max_sim` negates the values when copying them into `scores`.
+///
+/// # Safety
+/// - `a` covers `A_ROWS * k` block-transposed f32s.
+/// - `b` covers `B_ROWS * k` row-major f32s.
+/// - `r` covers `A_ROWS` initialized, writable f32s (each is read before being written).
+unsafe fn panel<const A_ROWS: usize, const B_ROWS: usize>(
+    a: *const f32,
+    b: *const f32,
+    k: usize,
+    r: *mut f32,
+) {
+    for q in 0..A_ROWS {
+        // SAFETY: q < A_ROWS.
+        let mut best = unsafe { *r.add(q) }; // seed with the value already stored in `r`
+
+        for d in 0..B_ROWS {
+            let mut ip: f32 = 0.0;
+            for i in 0..k {
+                // SAFETY: i < k, q < A_ROWS.
+                let a_val = unsafe { *a.add(i * A_ROWS + q) };
+                // SAFETY: d < B_ROWS, b covers B_ROWS rows of k f32s each.
+                let b_val = unsafe { *b.add(d * k + i) };
+                ip += a_val * b_val;
+            }
+            best = best.max(ip);
+        }
+
+        // SAFETY: q < A_ROWS.
+        unsafe { *r.add(q) = best };
+    }
+}
+
+/// Step 3: the `DynQueryComputer<T>` adapter. Owns the prepared query data
+/// and routes `compute_max_sim` through `tiled_reduce` with the kernel.
+#[derive(Debug)]
+pub(super) struct TemplateComputer {
+    arch: Scalar, // token forwarded to `tiled_reduce`
+    prepared: BlockTransposed<f32, 8>, // built by `from_matrix_view` in `new`
+}
+
+impl TemplateComputer {
+    /// Block-transposes `query` (block size 8) and pairs it with a `Scalar` arch token.
+    pub(super) fn new(query: MatRef<'_, Standard<f32>>) -> Self {
+        let view = query.as_matrix_view();
+        let prepared = BlockTransposed::<f32, 8>::from_matrix_view(view);
+        let arch = Scalar::new();
+        Self { arch, prepared }
+    }
+}
+
+impl DynQueryComputer<f32> for TemplateComputer {
+    fn nrows(&self) -> usize {
+        self.prepared.nrows() // row count without padding (cf. `padded_nrows` below)
+    }
+
+    fn compute_max_sim(&self, doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+        let mut scratch = vec![f32::MIN; self.prepared.padded_nrows()]; // seeds the running max
+        let prepared_ref: BlockTransposedRef<'_, f32, 8> = self.prepared.as_view();
+        let ca = <BlockTransposedRef<'_, f32, 8> as layouts::DescribeLayout>::layout(&prepared_ref);
+        let cb = <MatRef<'_, Standard<f32>> as layouts::DescribeLayout>::layout(&doc);
+
+        // SAFETY: prepared.as_ptr() covers padded_nrows * ncols block-transposed
+        // f32s; doc.as_slice() covers num_vectors * vector_dim row-major f32s;
+        // scratch length == padded_nrows; padded_nrows is a multiple of
+        // A_PANEL=8 by BlockTransposed construction.
+        unsafe {
+            tiled_reduce::<Scalar, TemplateKernel, _, _>(
+                self.arch,
+                &ca,
+                &cb,
+                self.prepared.as_ptr(),
+                self.prepared.padded_nrows(),
+                doc.as_slice().as_ptr(),
+                doc.num_vectors(),
+                doc.vector_dim(),
+                &mut scratch,
+                TileBudget::default(),
+            );
+        }
+        // Copy out only the first `nrows` entries, negating each accumulated max.
+        for (dst, &src) in scores.iter_mut().zip(&scratch[..self.prepared.nrows()]) {
+            *dst = -src;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    //! Miri-friendly correctness test for the template kernel.
+    //!
+    //! Validates that the template's adapter machinery produces the same
+    //! per-row scores as `QueryComputer::from_arch(Scalar)`. Iteration
+    //! count is reduced under Miri so `cargo +nightly miri test` finishes
+    //! in seconds, not minutes.
+    use super::*;
+    use diskann_quantization::multi_vector::QueryComputer;
+
+    fn make_data(nrows: usize, ncols: usize, shift: usize) -> Vec<f32> {
+        (0..nrows * ncols)
+            .map(|v| ((v + shift) % ncols) as f32) // integers in `0..ncols`, shifted by `shift`
+            .collect()
+    }
+
+    #[test]
+    fn template_matches_pinned_scalar() {
+        let cases: &[(usize, usize, usize)] = if cfg!(miri) {
+            // Single small case under Miri to keep runtime reasonable.
+            &[(3, 4, 8)]
+        } else {
+            &[(1, 1, 4), (3, 5, 8), (8, 4, 16), (10, 6, 32)]
+        };
+
+        for &(nq, nd, dim) in cases {
+            let qd = make_data(nq, dim, dim / 2);
+            let dd = make_data(nd, dim, dim);
+            let query = MatRef::new(Standard::<f32>::new(nq, dim).unwrap(), &qd).unwrap();
+            let doc = MatRef::new(Standard::<f32>::new(nd, dim).unwrap(), &dd).unwrap();
+
+            let pinned = QueryComputer::<f32>::from_arch(query, Scalar::new());
+            let template = QueryComputer::<f32>::from_dyn(Box::new(TemplateComputer::new(query)));
+
+            let mut pinned_scores = vec![0.0f32; nq];
+            let mut template_scores = vec![0.0f32; nq];
+            pinned.max_sim(doc, &mut pinned_scores);
+            template.max_sim(doc, &mut template_scores);
+
+            for (i, (p, t)) in pinned_scores.iter().zip(template_scores.iter()).enumerate() {
+                assert!(
+                    (p - t).abs() < 1e-10, // inputs are small integers, so sums are exact
+                    "shape ({nq},{nd},{dim}) row {i}: pinned={p} template={t}",
+                );
+            }
+        }
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/library_kernels.rs b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs
new file mode 100644
index 000000000..56d6c0db1
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/library_kernels.rs
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Library kernel registrations and arch-dispatch machinery.
+//!
+//! Mirrors the structure of `diskann-benchmark-simd`: a `Kernel<A, T>`
+//! PhantomData carrier carries the (arch × element type) pair through the
+//! benchmark registry, [`DispatchRule<Arch>`] maps the JSON-facing `Arch`
+//! enum to a concrete arch token, and the `stamp!` / `match_arch_x86_64!` macros
+//! generate the repetitive `RunBenchmark<A>` / `DispatchRule` impls.
+//!
+//! Library kernels registered here:
+//! - `multi-vector-op-{f32,f16}-auto` — `QueryComputer::new` (auto-dispatch)
+//! - `multi-vector-op-{f32,f16}-scalar` — `from_arch(Scalar)`
+//! - `multi-vector-op-{f32,f16}-x86_64_V3` — `from_arch(V3)` (x86_64 only)
+//! - `multi-vector-op-{f32,f16}-x86_64_V4` — `from_arch(V4)` (x86_64 only)
+//! - `multi-vector-op-{f32,f16}-aarch64_neon` — `from_arch(Neon)` (aarch64 only)
+//! - `multi-vector-op-{f32,f16}-reference` — `MaxSim` fallback
+
+use std::io::Write;
+use std::marker::PhantomData;
+
+use diskann_benchmark_runner::{
+    benchmark::{PassFail, Regression},
+    dispatcher::{Description, DispatchRule, FailureScore, MatchScore},
+    utils::{datatype, num::relative_change},
+    Benchmark, Checkpoint, Output,
+};
+use diskann_quantization::multi_vector::{MatRef, QueryComputer, Standard};
+use diskann_vector::distance::InnerProduct;
+use diskann_vector::PureDistanceFunction;
+#[cfg(target_arch = "aarch64")]
+use diskann_wide::arch::aarch64::Neon;
+#[cfg(target_arch = "x86_64")]
+use diskann_wide::arch::x86_64::{V3, V4};
+use diskann_wide::arch::Scalar;
+use diskann_wide::Architecture;
+use rand::distr::{Distribution, StandardUniform};
+
+use super::driver::{
+    run_with_distance, CheckResult, Comparison, Data, DisplayWrapper, OptimizedDistance,
+    ReferenceDistance, RunResult,
+};
+use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance};
+
+/// Zero-sized `PhantomData` carrier for one (arch, element-type) entry in the benchmark
+/// registry. The arch parameter `A` is either a real arch token (`Scalar`,
+/// `V3`, `V4`, `Neon`) or one of the marker types [`Auto`] / [`Reference`].
+pub(super) struct Kernel<A, T> {
+    _type: PhantomData<(A, T)>, // the only field: no runtime data is stored
+}
+
+impl<A, T> Kernel<A, T> {
+    pub(super) fn new() -> Self {
+        Self { _type: PhantomData }
+    }
+}
+
+/// Marker for the auto-dispatched (CPU-detected) kernel — `QueryComputer::new`.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct Auto;
+
+/// Marker for the reference (`MaxSim` fallback) kernel.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct Reference;
+
+/// Wrapper around an arch token (real or marker) that implements
+/// [`DispatchRule<Arch>`] for the JSON-facing [`Arch`] enum.
+pub(super) struct Identity<A>(pub(super) A);
+
+/// Error returned by `Identity::<A>::convert` when `new_checked()` reports that the
+/// host CPU lacks the requested ISA; wraps the requested [`Arch`] for the message.
+#[derive(Debug, Clone, Copy)]
+pub(super) struct ArchNotSupported(pub(super) Arch);
+
+impl std::fmt::Display for ArchNotSupported {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} not supported on this CPU", self.0)
+    }
+}
+
+impl std::error::Error for ArchNotSupported {}
+
+//////////////////////
+// Dispatch rules   //
+//////////////////////
+
+/// Generates a [`DispatchRule<Arch>`] for an x86_64 arch token. `try_match` returns:
+/// - `Ok(MatchScore(0))` when the input names this arch AND the host CPU supports it
+/// - `Err(FailureScore(0))` when the input names this arch but the CPU doesn't support it
+///   (this surfaces in the dispatcher's near-miss diagnostic)
+/// - `Err(FailureScore(1))` when the input names a different arch
+macro_rules! match_arch_x86_64 {
+    ($arch:path, $enum:ident) => {
+        #[cfg(target_arch = "x86_64")]
+        impl DispatchRule<Arch> for Identity<$arch> {
+            type Error = ArchNotSupported;
+            fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+                if *from == Arch::$enum {
+                    match <$arch>::new_checked() {
+                        Some(_) => Ok(MatchScore(0)),
+                        None => Err(FailureScore(0)),
+                    }
+                } else {
+                    Err(FailureScore(1))
+                }
+            }
+            fn convert(from: Arch) -> Result<Self, Self::Error> {
+                // Re-check the CPU here; `try_match` only scores the request.
+                let token = <$arch>::new_checked().ok_or(ArchNotSupported(from))?;
+                Ok(Identity(token))
+            }
+        }
+    };
+}
+
+match_arch_x86_64!(V3, X86_64_V3);
+match_arch_x86_64!(V4, X86_64_V4);
+
+#[cfg(target_arch = "aarch64")]
+impl DispatchRule<Arch> for Identity<Neon> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        if *from == Arch::Neon {
+            match Neon::new_checked() {
+                Some(_) => Ok(MatchScore(0)),
+                None => Err(FailureScore(0)),
+            }
+        } else {
+            Err(FailureScore(1))
+        }
+    }
+    fn convert(from: Arch) -> Result<Self, Self::Error> {
+        // `try_match` only scores the request, so the CPU is probed again here.
+        let token = Neon::new_checked().ok_or(ArchNotSupported(from))?;
+        Ok(Identity(token))
+    }
+}
+
+// `Scalar::new()` is infallible, so neither method probes the CPU.
+impl DispatchRule<Arch> for Identity<Scalar> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        match from {
+            Arch::Scalar => Ok(MatchScore(0)),
+            // Any other request belongs to a different kernel.
+            _ => Err(FailureScore(1)),
+        }
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Scalar::new()))
+    }
+}
+
+impl DispatchRule<Arch> for Identity<Auto> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        match from {
+            Arch::Auto => Ok(MatchScore(0)),
+            // Explicitly pinned arches are handled by their own kernels.
+            _ => Err(FailureScore(1)),
+        }
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Auto))
+    }
+}
+
+impl DispatchRule<Arch> for Identity<Reference> {
+    type Error = ArchNotSupported;
+    fn try_match(from: &Arch) -> Result<MatchScore, FailureScore> {
+        // The marker carries no CPU requirement; only the name has to match.
+        if !matches!(from, Arch::Reference) {
+            return Err(FailureScore(1));
+        }
+        Ok(MatchScore(0))
+    }
+    fn convert(_from: Arch) -> Result<Self, Self::Error> {
+        Ok(Identity(Reference))
+    }
+}
+
+//////////////////////
+// Benchmark trait  //
+//////////////////////
+
+/// Per-arch run trait: the impls below return one [`RunResult`] per entry of `input.runs`.
+/// `stamp!` generates the impls for real arch tokens; `Auto` / `Reference` are hand-written.
+pub(super) trait RunBenchmark<A> {
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>>;
+}
+
+impl<A, T> Benchmark for Kernel<A, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    Identity<A>: DispatchRule<Arch, Error = ArchNotSupported>,
+    Kernel<A, T>: RunBenchmark<A>,
+    A: 'static,
+    T: 'static,
+{
+    type Input = MultiVectorOp;
+    type Output = Vec<RunResult>;
+
+    /// Scores `from` against this kernel: `Ok` only when element type and arch both match.
+    fn try_match(&self, from: &MultiVectorOp) -> Result<MatchScore, FailureScore> {
+        let type_penalty = match datatype::Type::<T>::try_match(&from.element_type) {
+            Ok(_) => None,
+            Err(_) => Some(10u32), // fixed penalty for the wrong element type
+        };
+        let arch_penalty = match Identity::<A>::try_match(&from.arch) {
+            Ok(_) => None,
+            Err(FailureScore(score)) => Some(score),
+        };
+        match (type_penalty, arch_penalty) {
+            (None, None) => Ok(MatchScore(0)),
+            (type_miss, arch_miss) => {
+                Err(FailureScore(type_miss.unwrap_or(0) + arch_miss.unwrap_or(0)))
+            }
+        }
+    }
+
+    fn run(
+        &self,
+        input: &MultiVectorOp,
+        _: Checkpoint<'_>,
+        mut output: &mut dyn Output,
+    ) -> anyhow::Result<Self::Output> {
+        writeln!(output, "{}", input)?;
+        let results = self.run_benchmark(input)?;
+        writeln!(output, "\n\n{}", DisplayWrapper(results.as_slice()))?;
+        Ok(results)
+    }
+
+    fn description(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        input: Option<&MultiVectorOp>,
+    ) -> std::fmt::Result {
+        if let Some(input) = input {
+            // Input given: report only the reasons this kernel rejects it.
+            if let Err(err) = datatype::Type::<T>::try_match_verbose(&input.element_type) {
+                writeln!(f, "\n    - Mismatched element type: {}", err)?;
+            }
+            if Identity::<A>::try_match(&input.arch).is_err() {
+                writeln!(f, "\n    - Wrong or unsupported arch: {}", input.arch)?;
+            }
+        } else {
+            // No input: describe what this kernel accepts.
+            writeln!(
+                f,
+                "- Element Type: {}",
+                Description::<datatype::DataType, datatype::Type<T>>::new()
+            )?;
+            writeln!(f, "- Arch: {}", Description::<Arch, Identity<A>>::new())?;
+        }
+        Ok(())
+    }
+}
+
+impl<A, T> Regression for Kernel<A, T>
+where
+    datatype::Type<T>: DispatchRule<datatype::DataType>,
+    Identity<A>: DispatchRule<Arch, Error = ArchNotSupported>,
+    Kernel<A, T>: RunBenchmark<A>,
+    A: 'static,
+    T: 'static,
+{
+    type Tolerances = MultiVectorTolerance;
+    type Pass = CheckResult;
+    type Fail = CheckResult;
+
+    /// Compares the per-run minimum latencies of `before` and `after`.
+    ///
+    /// Fails when any run's relative change exceeds
+    /// `tolerance.min_time_regression` or cannot be computed; errors when the
+    /// two result sets differ in length or in their `run` descriptions.
+    fn check(
+        &self,
+        tolerance: &MultiVectorTolerance,
+        _input: &MultiVectorOp,
+        before: &Vec<RunResult>,
+        after: &Vec<RunResult>,
+    ) -> anyhow::Result<PassFail<CheckResult, CheckResult>> {
+        anyhow::ensure!(
+            before.len() == after.len(),
+            "before has {} runs but after has {}",
+            before.len(),
+            after.len(),
+        );
+
+        let limit = tolerance.min_time_regression.get();
+        let mut regressed = false;
+        let mut checks = Vec::with_capacity(before.len());
+        for (i, (b, a)) in before.iter().zip(after).enumerate() {
+            anyhow::ensure!(b.run == a.run, "run {i} mismatched");
+
+            // Both minima are scaled by the `before` run's computation count.
+            let computations_per_latency = b.computations_per_latency() as f64;
+            let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+            let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency;
+
+            // An uncomputable change counts as a failure.
+            regressed |= match relative_change(before_min, after_min) {
+                Ok(change) => change > limit,
+                Err(_) => true,
+            };
+
+            checks.push(Comparison {
+                run: b.run.clone(),
+                tolerance: *tolerance,
+                before_min,
+                after_min,
+            });
+        }
+
+        let check = CheckResult { checks };
+        Ok(if regressed {
+            PassFail::Fail(check)
+        } else {
+            PassFail::Pass(check)
+        })
+    }
+}
+
+//////////////////////
+// RunBenchmark impls
+//////////////////////
+
+/// Element-type-generic constructor for [`QueryComputer<T>`]. `QueryComputer`'s
+/// `new` / `from_arch` are inherent methods on the concrete `QueryComputer<f32>`
+/// and `QueryComputer<half::f16>` types, so generic code needs this shim.
+pub(super) trait BuildArchQc<T: Copy> {
+    /// Build a `QueryComputer<T>` through `QueryComputer::<T>::new` (auto-dispatch).
+    fn build_auto(query: MatRef<'_, Standard<T>>) -> QueryComputer<T>;
+}
+
+impl BuildArchQc<f32> for f32 {
+    fn build_auto(query: MatRef<'_, Standard<f32>>) -> QueryComputer<f32> {
+        QueryComputer::<f32>::new(query)
+    }
+}
+
+impl BuildArchQc<half::f16> for half::f16 {
+    fn build_auto(query: MatRef<'_, Standard<half::f16>>) -> QueryComputer<half::f16> {
+        QueryComputer::<half::f16>::new(query)
+    }
+}
+
+/// Per-(arch, T) constructor for `QueryComputer::from_arch`. Same idea as
+/// [`BuildArchQc::build_auto`] but pinned to a specific arch token.
+pub(super) trait BuildPinnedQc<A: Architecture, T: Copy> {
+    fn build_pinned(query: MatRef<'_, Standard<T>>, arch: A) -> QueryComputer<T>;
+}
+// Emits one `BuildPinnedQc` impl that forwards to `QueryComputer::<T>::from_arch`.
+macro_rules! impl_build_pinned {
+    ($arch:path, $T:ty) => {
+        impl BuildPinnedQc<$arch, $T> for $T {
+            fn build_pinned(query: MatRef<'_, Standard<$T>>, arch: $arch) -> QueryComputer<$T> {
+                QueryComputer::<$T>::from_arch(query, arch)
+            }
+        }
+    };
+}
+
+impl_build_pinned!(Scalar, f32);
+impl_build_pinned!(Scalar, half::f16);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V3, f32);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V3, half::f16);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V4, f32);
+#[cfg(target_arch = "x86_64")]
+impl_build_pinned!(V4, half::f16);
+#[cfg(target_arch = "aarch64")]
+impl_build_pinned!(Neon, f32);
+#[cfg(target_arch = "aarch64")]
+impl_build_pinned!(Neon, half::f16);
+
+/// Stamp out `RunBenchmark<$arch>` for `Kernel<$arch, $T>` using
+/// `QueryComputer::<T>::from_arch($arch_token)`.
+///
+/// The three-argument form wraps the same impl in `#[cfg(target_arch = ...)]`.
+macro_rules! stamp {
+    ($arch:path, $T:ty) => {
+        impl RunBenchmark<$arch> for Kernel<$arch, $T>
+        where
+            StandardUniform: Distribution<$T>,
+            $T: BuildPinnedQc<$arch, $T>,
+        {
+            fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+                // Fails with `ArchNotSupported` when `convert` rejects the request.
+                let Identity(arch) = Identity::<$arch>::convert(input.arch)?;
+                let results = input.runs.iter().map(|run| {
+                    let data = Data::<$T>::new(run);
+                    // Query-side precomputation in `QueryComputer` is meant to be
+                    // amortized over many `max_sim` calls, so it is built once per
+                    // shape and stays out of the timed loop.
+                    let queries = data.queries.as_view();
+                    let qc = <$T as BuildPinnedQc<$arch, $T>>::build_pinned(queries, arch);
+                    run_with_distance(run, data.docs.as_view(), &OptimizedDistance(qc))
+                });
+                Ok(results.collect())
+            }
+        }
+    };
+    // Same impl, compiled only when `target_arch` matches the literal.
+    ($target_arch:literal, $arch:path, $T:ty) => {
+        #[cfg(target_arch = $target_arch)]
+        stamp!($arch, $T);
+    };
+}
+
+stamp!(Scalar, f32);
+stamp!(Scalar, half::f16);
+stamp!("x86_64", V3, f32);
+stamp!("x86_64", V3, half::f16);
+stamp!("x86_64", V4, f32);
+stamp!("x86_64", V4, half::f16);
+stamp!("aarch64", Neon, f32);
+stamp!("aarch64", Neon, half::f16);
+
+// Auto and Reference get hand-written impls (different construction paths).
+
+impl<T> RunBenchmark<Auto> for Kernel<Auto, T>
+where
+    T: Copy + 'static + BuildArchQc<T>,
+    StandardUniform: Distribution<T>,
+{
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        // `input.arch` is not consulted here; `build_auto` takes only the query.
+        let results = input.runs.iter().map(|run| {
+            let data = Data::<T>::new(run);
+            // Built once per shape, outside `run_with_distance`.
+            let qc = <T as BuildArchQc<T>>::build_auto(data.queries.as_view());
+            run_with_distance(run, data.docs.as_view(), &OptimizedDistance(qc))
+        });
+        Ok(results.collect())
+    }
+}
+
+impl<T> RunBenchmark<Reference> for Kernel<Reference, T>
+where
+    T: Copy + 'static,
+    StandardUniform: Distribution<T>,
+    InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>,
+    for<'a> ReferenceDistance<'a, T>: super::driver::Distance<T>,
+{
+    fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result<Vec<RunResult>> {
+        // No `QueryComputer` here: the query view is wrapped directly.
+        let results = input.runs.iter().map(|run| {
+            let data = Data::<T>::new(run);
+            let dist = ReferenceDistance(data.queries.as_view().into());
+            run_with_distance(run, data.docs.as_view(), &dist)
+        });
+        Ok(results.collect())
+    }
+}
+
+//////////////////////
+// Registration     //
+//////////////////////
+
+pub(super) fn register(benchmarks: &mut diskann_benchmark_runner::registry::Benchmarks) {
+    benchmarks.register_regression("multi-vector-op-f32-auto", Kernel::<Auto, f32>::new());
+    benchmarks.register_regression("multi-vector-op-f16-auto", Kernel::<Auto, half::f16>::new());
+    // `Scalar` kernels are registered on every target.
+    benchmarks.register_regression("multi-vector-op-f32-scalar", Kernel::<Scalar, f32>::new());
+    benchmarks.register_regression(
+        "multi-vector-op-f16-scalar",
+        Kernel::<Scalar, half::f16>::new(),
+    );
+    // `Reference` kernels use the `ReferenceDistance` path instead of `QueryComputer`.
+    benchmarks.register_regression(
+        "multi-vector-op-f32-reference",
+        Kernel::<Reference, f32>::new(),
+    );
+    benchmarks.register_regression(
+        "multi-vector-op-f16-reference",
+        Kernel::<Reference, half::f16>::new(),
+    );
+    // `V3` / `V4` kernels are compiled only for x86_64.
+    #[cfg(target_arch = "x86_64")]
+    {
+        benchmarks.register_regression("multi-vector-op-f32-x86_64_V3", Kernel::<V3, f32>::new());
+        benchmarks.register_regression(
+            "multi-vector-op-f16-x86_64_V3",
+            Kernel::<V3, half::f16>::new(),
+        );
+        benchmarks.register_regression("multi-vector-op-f32-x86_64_V4", Kernel::<V4, f32>::new());
+        benchmarks.register_regression(
+            "multi-vector-op-f16-x86_64_V4",
+            Kernel::<V4, half::f16>::new(),
+        );
+    }
+    // `Neon` kernels are compiled only for aarch64.
+    #[cfg(target_arch = "aarch64")]
+    {
+        benchmarks.register_regression(
+            "multi-vector-op-f32-aarch64_neon",
+            Kernel::<Neon, f32>::new(),
+        );
+        benchmarks.register_regression(
+            "multi-vector-op-f16-aarch64_neon",
+            Kernel::<Neon, half::f16>::new(),
+        );
+    }
+}
diff --git a/diskann-benchmark/src/backend/multi_vector/mod.rs b/diskann-benchmark/src/backend/multi_vector/mod.rs
new file mode 100644
index 000000000..90426e571
--- /dev/null
+++ b/diskann-benchmark/src/backend/multi_vector/mod.rs
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+//! Multi-vector MaxSim distance benchmarks with regression detection.
+//!
+//! This module is a **kernel-research substrate**, not just a benchmark. It
+//! supports two distinct use cases:
+//!
+//! 1. **Head-to-head ISA (instruction set architecture) comparison.** Library
+//!    kernels are registered per arch (`scalar`, `x86-64-v3`, `x86-64-v4`,
+//!    `neon`) plus `auto` (CPU-detected) and `reference` (fallback).
+//!    Pinning to a specific ISA lets you compare e.g. AVX2 vs AVX512 on the
+//!    same AVX512 host.
+//!
+//! 2. **Experimental kernel authoring.** External crates and the
+//!    `experimental/` submodule can author new SIMD micro-kernels by
+//!    implementing the public `Kernel<A>` trait in
+//!    `diskann-quantization::multi_vector::distance::kernels`, plug them
+//!    into the existing cache-aware tile orchestrator (`tiled_reduce`),
+//!    and slot them into the benchmark via
+//!    `QueryComputer::from_dyn(Box::new(...))`.
+//!
+//! # Adding a new experimental kernel
+//!
+//! See `experimental/template.rs` for the full step-by-step workflow with
+//! a worked example. Summary:
+//!
+//! 1. Add a variant to [`crate::inputs::multi_vector::Arch`].
+//! 2. Implement `Kernel<A>` for your micro-kernel.
+//! 3. Implement `DynQueryComputer<T>` for your adapter, calling
+//!    `tiled_reduce` with your kernel.
+//! 4. Add a marker type + `DispatchRule<Arch>` impl so the new variant
+//!    routes to your kernel.
+//! 5. Add a `RunBenchmark<Marker>` impl + `register_regression(...)` call
+//!    in `experimental::register`.
+//!
+//! **Validate experimental kernels under Miri:**
+//! - Construct arch tokens via `Scalar::new()` (Miri-safe) or
+//!   `V4::new_checked_miri()` (Miri-safe AVX-512 emulation). `V3::new_checked()`
+//!   and `Neon::new_checked()` don't have `_miri` variants today; if you need
+//!   them under Miri, follow `V4::new_checked_miri()`'s pattern.
+//! - Gate Miri-unsupported intrinsics with `#[cfg(not(miri))]`.
+//! - Reduce test-sweep size under `cfg(miri)` to keep runtimes reasonable.
+
+use diskann_benchmark_runner::registry::Benchmarks;
+
+cfg_if::cfg_if! {
+    if #[cfg(feature = "multi-vector")] {
+        mod driver;
+        mod experimental;
+        mod library_kernels;
+        // Feature on: library kernels register first, then the experimental ones.
+        pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
+            library_kernels::register(benchmarks);
+            experimental::register(benchmarks);
+        }
+    } else {
+        crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp);
+        // Feature off: only `imp::register` runs (`imp` presumably comes from `stub_impl!`).
+        pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) {
+            imp::register("multi-vector-op", benchmarks);
+        }
+    }
+}
+
+#[cfg(all(test, feature = "multi-vector"))]
+mod tests {
+    use std::num::NonZeroUsize;
+
+    use diskann_benchmark_runner::{
+        benchmark::{PassFail, Regression},
+        utils::{
+            datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles,
+            MicroSeconds,
+        },
+    };
+
+    use super::driver::{CheckResult, Comparison, RunResult};
+    use super::library_kernels::{Auto, Kernel};
+    use crate::inputs::multi_vector::{Arch, MultiVectorOp, MultiVectorTolerance, Run};
+
+    fn tiny_run() -> Run {
+        Run {
+            num_query_vectors: NonZeroUsize::new(2).unwrap(),
+            num_doc_vectors: NonZeroUsize::new(2).unwrap(),
+            dim: NonZeroUsize::new(4).unwrap(),
+            loops_per_measurement: NonZeroUsize::new(1).unwrap(),
+            num_measurements: NonZeroUsize::new(1).unwrap(),
+        }
+    }
+
+    fn tiny_op() -> MultiVectorOp {
+        MultiVectorOp {
+            element_type: DataType::Float32,
+            arch: Arch::Auto,
+            runs: vec![tiny_run()],
+        }
+    }
+
+    fn tiny_result(minimum: u64) -> RunResult {
+        let mut latencies = vec![MicroSeconds::new(minimum)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        RunResult {
+            run: tiny_run(),
+            latencies,
+            percentiles,
+        }
+    }
+
+    fn tolerance(limit: f64) -> MultiVectorTolerance {
+        MultiVectorTolerance {
+            min_time_regression: NonNegativeFinite::new(limit).unwrap(),
+        }
+    }
+
+    #[test]
+    fn check_rejects_mismatched_runs() {
+        let kernel = Kernel::<Auto, f32>::new();
+
+        // Build a result whose `run` diverges from `tiny_run()` so the
+        // regression check's `b.run == a.run` invariant fires.
+        let mut latencies = vec![MicroSeconds::new(100)];
+        let percentiles = compute_percentiles(&mut latencies).unwrap();
+        let mismatched_result = RunResult {
+            run: Run {
+                num_query_vectors: NonZeroUsize::new(4).unwrap(),
+                ..tiny_run()
+            },
+            latencies,
+            percentiles,
+        };
+        // Both sides hold one result, so only the per-run comparison can reject the pair.
+        let err = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![mismatched_result],
+            )
+            .unwrap_err();
+
+        assert_eq!(err.to_string(), "run 0 mismatched");
+    }
+
+    #[test]
+    fn check_allows_negative_relative_change() {
+        let kernel = Kernel::<Auto, f32>::new();
+        // 100 -> 95 is a speed-up; it must pass even with zero tolerance.
+        let result = kernel
+            .check(
+                &tolerance(0.0),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(95)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_passes_on_tolerance_boundary() {
+        let kernel = Kernel::<Auto, f32>::new();
+        // 100 -> 105 is a 5% increase: exactly the limit, which is still allowed.
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(105)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Pass(_)));
+    }
+
+    #[test]
+    fn check_fails_above_tolerance_boundary() {
+        let kernel = Kernel::<Auto, f32>::new();
+        // 100 -> 106 is a 6% increase, just past the 5% limit.
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(100)],
+                &vec![tiny_result(106)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+
+    #[test]
+    fn check_result_display_includes_failure_details() {
+        let check = CheckResult {
+            checks: vec![Comparison {
+                run: tiny_run(),
+                tolerance: tolerance(0.05),
+                before_min: 100.0,
+                after_min: 106.0,
+            }],
+        };
+        // 100 -> 106 against a 5% tolerance: expect both minima, the 6% change, and FAIL.
+        let rendered = check.to_string();
+        assert!(rendered.contains("Q"), "rendered = {rendered}");
+        assert!(rendered.contains("Dim"), "rendered = {rendered}");
+        assert!(rendered.contains("100.000"), "rendered = {rendered}");
+        assert!(rendered.contains("106.000"), "rendered = {rendered}");
+        assert!(rendered.contains("6.000 %"), "rendered = {rendered}");
+        assert!(rendered.contains("FAIL"), "rendered = {rendered}");
+    }
+
+    /// A "before" value of 0 means the measurement was too fast to obtain a
+    /// reliable signal, so we *could* be letting a regression through. We
+    /// require at least a non-zero value.
+    #[test]
+    fn zero_values_rejected() {
+        let kernel = Kernel::<Auto, f32>::new();
+        // Zero minima on both sides: the check must report `Fail`, not `Pass`.
+        let result = kernel
+            .check(
+                &tolerance(0.05),
+                &tiny_op(),
+                &vec![tiny_result(0)],
+                &vec![tiny_result(0)],
+            )
+            .unwrap();
+
+        assert!(matches!(result, PassFail::Fail(_)));
+    }
+}
diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs
index 856412e2a..414a0b52e 100644
--- a/diskann-benchmark/src/inputs/mod.rs
+++ b/diskann-benchmark/src/inputs/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod disk;
 pub(crate) mod exhaustive;
 pub(crate) mod filters;
 pub(crate) mod graph_index;
+pub(crate) mod multi_vector;
 pub(crate) mod save_and_load;
 
 pub(crate) fn register_inputs(
@@ -16,6 +17,7 @@ pub(crate) fn register_inputs(
     exhaustive::register_inputs(registry)?;
     disk::register_inputs(registry)?;
     filters::register_inputs(registry)?;
+    multi_vector::register_inputs(registry)?;
     Ok(())
 }
 
diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs
new file mode 100644
index 000000000..150d72a8e
--- /dev/null
+++ b/diskann-benchmark/src/inputs/multi_vector.rs
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT license.
+ */
+
+use std::num::NonZeroUsize;
+
+use diskann_benchmark_runner::{
+    utils::{datatype::DataType, num::NonNegativeFinite},
+    CheckDeserialization, Checker,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::inputs::{as_input, Example};
+
+//////////////
+// Registry //
+//////////////
+
+as_input!(MultiVectorOp);
+as_input!(MultiVectorTolerance);
+
+pub(super) fn register_inputs(
+    registry: &mut diskann_benchmark_runner::registry::Inputs,
+) -> anyhow::Result<()> {
+    registry.register::<MultiVectorOp>()?;
+    registry.register::<MultiVectorTolerance>()?;
+    Ok(())
+}
+
+////////////////
+// Enum types //
+////////////////
+
+/// Which kernel to benchmark.
+///
+/// Mirrors `diskann-benchmark-simd`'s `Arch` enum: kebab-case serialization,
+/// one variant per supported ISA plus `Reference` (fallback) and `Auto`
+/// (host-portable). Marked `#[non_exhaustive]` so experimental kernels can
+/// add variants without breaking JSON configs.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+#[non_exhaustive]
+pub(crate) enum Arch {
+    #[serde(rename = "x86-64-v4")]
+    #[allow(non_camel_case_types)]
+    X86_64_V4,
+    #[serde(rename = "x86-64-v3")]
+    #[allow(non_camel_case_types)]
+    X86_64_V3,
+    Neon,
+    Scalar,
+    Reference,
+    /// Auto-dispatch to the host's best supported arch (calls `QueryComputer::new`).
+    Auto,
+}
+
+impl std::fmt::Display for Arch {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // These spellings mirror the serde names accepted in JSON configs.
+        f.write_str(match self {
+            Self::X86_64_V4 => "x86-64-v4",
+            Self::X86_64_V3 => "x86-64-v3",
+            Self::Neon => "neon",
+            Self::Scalar => "scalar",
+            Self::Reference => "reference",
+            Self::Auto => "auto",
+        })
+    }
+}
+
+/// One benchmark shape; the regression check pairs before/after results by `Run` equality.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub(crate) struct Run {
+    pub(crate) num_query_vectors: NonZeroUsize,
+    pub(crate) num_doc_vectors: NonZeroUsize,
+    pub(crate) dim: NonZeroUsize,
+    pub(crate) loops_per_measurement: NonZeroUsize,
+    pub(crate) num_measurements: NonZeroUsize,
+}
+
+///////////////////////
+// Multi-Vector Op   //
+///////////////////////
+
+/// A complete multi-vector benchmark job: element type, kernel arch, and the runs to time.
+#[derive(Debug, Serialize, Deserialize)]
+pub(crate) struct MultiVectorOp {
+    pub(crate) element_type: DataType,
+    pub(crate) arch: Arch,
+    pub(crate) runs: Vec<Run>,
+}
+
+impl MultiVectorOp {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-op"
+    }
+}
+
+impl CheckDeserialization for MultiVectorOp {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Example for MultiVectorOp {
+    /// Float32 / auto-arch job with two runs (32 and 64 query vectors)
+    /// that share document count, dimension, loop count, and
+    /// measurement count.
+    fn example() -> Self {
+        const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
+        const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap();
+        const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap();
+        const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap();
+
+        // The example runs differ only in how many query vectors they use;
+        // everything else comes from the shared constants above.
+        let query_counts = [32usize, 64];
+        let runs = query_counts
+            .iter()
+            .map(|&count| Run {
+                num_query_vectors: NonZeroUsize::new(count).unwrap(),
+                num_doc_vectors: NUM_DOC_VECTORS,
+                dim: DIM,
+                loops_per_measurement: LOOPS_PER_MEASUREMENT,
+                num_measurements: NUM_MEASUREMENTS,
+            })
+            .collect();
+
+        Self {
+            element_type: DataType::Float32,
+            arch: Arch::Auto,
+            runs,
+        }
+    }
+}
+
+macro_rules! write_field {
+    ($f:ident, $field:tt, $($expr:tt)*) => {
+        writeln!($f, "{:>18}: {}", $field, $($expr)*)
+    }
+}
+
+impl std::fmt::Display for MultiVectorOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Multi-Vector Operation\n")?;
+        write_field!(f, "tag", Self::tag())?;
+        write_field!(f, "element type", self.element_type)?;
+        write_field!(f, "arch", self.arch)?;
+        write_field!(f, "number of runs", self.runs.len())?;
+        Ok(())
+    }
+}
+
+/////////////////////////////
+// Multi-Vector Tolerance  //
+/////////////////////////////
+
+/// Tolerance thresholds for multi-vector benchmark regression detection.
+///
+/// `min_time_regression` is the maximum tolerated relative increase in a run's minimum
+/// latency. For example, a value of `0.05` means a 5% increase is tolerated.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub(crate) struct MultiVectorTolerance {
+    pub(crate) min_time_regression: NonNegativeFinite,
+}
+
+impl MultiVectorTolerance {
+    pub(crate) const fn tag() -> &'static str {
+        "multi-vector-tolerance"
+    }
+}
+
+impl CheckDeserialization for MultiVectorTolerance {
+    fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> {
+        Ok(())
+    }
+}
+
+impl Example for MultiVectorTolerance {
+    fn example() -> Self {
+        Self {
+            min_time_regression: NonNegativeFinite::new(0.05)
+                .expect("0.05 is a valid non-negative finite"),
+        }
+    }
+}
diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs
index a35d85427..5f641dd9f 100644
--- a/diskann-benchmark/src/main.rs
+++ b/diskann-benchmark/src/main.rs
@@ -776,6 +776,92 @@ mod tests {
         assert!(!output_path.exists());
     }
 
+    ///////////////////
+    // Multi-Vector  //
+    ///////////////////
+
+    #[test]
+    fn multi_vector_integration() {
+        let path = example_directory().join("multi-vector.json");
+        let tempdir = tempfile::tempdir().unwrap();
+        let output_path = tempdir.path().join("output.json");
+        assert!(!output_path.exists()); // sanity check: nothing has written the output yet
+
+        let modified_input_path = tempdir.path().join("input.json");
+
+        let mut raw = value_from_file(&path);
+        prefix_search_directories(&mut raw, &root_directory());
+        save_to_file(&modified_input_path, &raw); // the run consumes this rewritten copy
+
+        run_multi_vector_integration(&modified_input_path, &output_path) // body chosen by cfg
+    }
+
+    #[cfg(feature = "multi-vector")]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false, // actually execute the benchmarks
+            allow_debug: true,
+        };
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new(); // in-memory buffer handed to `cli.run`
+
+        cli.run(&mut output).unwrap(); // with the feature enabled the run must succeed
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+
+        // Check that the results file is generated.
+        assert!(output_path.exists());
+    }
+
+    #[cfg(not(feature = "multi-vector"))]
+    fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) {
+        let command = Commands::Run {
+            input_file: input_path.to_owned(),
+            output_file: output_path.to_owned(),
+            dry_run: false,
+            allow_debug: true,
+        };
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new(); // in-memory buffer handed to `cli.run`
+
+        let err = cli.run(&mut output).unwrap_err(); // without the feature the run must fail
+        println!("err = {:?}", err);
+
+        let output = String::from_utf8(output.into_inner()).unwrap();
+        assert!(output.contains("\"multi-vector\" feature")); // message must name the feature
+        println!("output = {}", output);
+
+        // The output file should not have been created because the run failed.
+        assert!(!output_path.exists());
+    }
+
+    #[test]
+    #[cfg(feature = "multi-vector")]
+    fn multi_vector_check_verify() {
+        let input_path = example_directory().join("multi-vector.json");
+        let tolerance_path = project_directory()
+            .join("perf_test_inputs")
+            .join("multi-vector-tolerance.json");
+
+        let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify {
+            tolerances: tolerance_path,
+            input_file: input_path, // the checked-in example is used as-is, not a rewritten copy
+        });
+
+        let cli = Cli::from_commands(command, true);
+        let mut output = Memory::new(); // in-memory buffer handed to `cli.run`
+        cli.run(&mut output).unwrap(); // verify must succeed for this input/tolerance pair
+        println!(
+            "output = {}",
+            String::from_utf8(output.into_inner()).unwrap()
+        );
+    }
+
     #[test]
     fn quiet_suppresses_check_target_warning() {
         let cli = Cli::from_commands(Commands::Skeleton, true);
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
index a535c68dc..e6dc8a772 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f16.rs
@@ -34,7 +34,7 @@ impl<A, const GROUP: usize>
 where
     A: Architecture,
     F32Kernel<GROUP>: Kernel<A>,
-    layouts::BlockTransposed<half::f16, GROUP>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left>
+    layouts::BlockTransposedLayout<half::f16, GROUP>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left>
         + layouts::Layout<Element = half::f16>,
     layouts::RowMajor<half::f16>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right>
         + layouts::Layout<Element = half::f16>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
index a900ea356..602da6324 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/mod.rs
@@ -30,7 +30,7 @@ mod scalar;
 mod v3;
 
 /// Zero-sized kernel type for f32 micro-kernels with block size `GROUP`.
-pub(crate) struct F32Kernel<const GROUP: usize>;
+pub struct F32Kernel<const GROUP: usize>;
 
 #[inline(never)]
 #[cold]
@@ -66,7 +66,7 @@ pub(super) fn max_ip_kernel<A: Architecture, T: Copy, const GROUP: usize>(
     budget: TileBudget,
 ) where
     F32Kernel<GROUP>: Kernel<A>,
-    layouts::BlockTransposed<T, GROUP>:
+    layouts::BlockTransposedLayout<T, GROUP>:
         layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + layouts::Layout<Element = T>,
     layouts::RowMajor<T>: layouts::ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right>
         + layouts::Layout<Element = T>,
@@ -117,7 +117,7 @@ impl<A, const GROUP: usize>
 where
     A: Architecture,
     Self: Kernel<A>,
-    layouts::BlockTransposed<f32, GROUP>:
+    layouts::BlockTransposedLayout<f32, GROUP>:
         layouts::ConvertTo<A, <Self as Kernel<A>>::Left> + layouts::Layout<Element = f32>,
     layouts::RowMajor<f32>:
         layouts::ConvertTo<A, <Self as Kernel<A>>::Right> + layouts::Layout<Element = f32>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
index bd8fb1c4a..2a230ca62 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/scalar.rs
@@ -27,7 +27,7 @@ diskann_wide::alias!(f32s = <Scalar>::f32x8);
 // A_PANEL(8) * k A elements, UNROLL * k B elements, and A_PANEL(8)
 // scratch elements — all within the bounds guaranteed by `tiled_reduce`.
 unsafe impl Kernel<Scalar> for F32Kernel<8> {
-    type Left = layouts::BlockTransposed<f32, 8>;
+    type Left = layouts::BlockTransposedLayout<f32, 8>;
     type Right = layouts::RowMajor<f32>;
     const A_PANEL: usize = 8;
     const B_PANEL: usize = 2;
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
index b05195b1e..319cf1cda 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/f32/v3.rs
@@ -17,7 +17,7 @@ diskann_wide::alias!(f32s = <V3>::f32x8);
 // A_PANEL(16) * k A elements, UNROLL * k B elements, and A_PANEL(16)
 // scratch elements — all within the bounds guaranteed by `tiled_reduce`.
 unsafe impl Kernel<V3> for F32Kernel<16> {
-    type Left = layouts::BlockTransposed<f32, 16>;
+    type Left = layouts::BlockTransposedLayout<f32, 16>;
     type Right = layouts::RowMajor<f32>;
     const A_PANEL: usize = 16;
     const B_PANEL: usize = 4;
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
index e1ec8dd36..54962acaa 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/layouts.rs
@@ -4,7 +4,7 @@
 //! Layout markers and tile-level conversion traits.
 //!
 //! - [`Layout`] — marker trait: memory layout + element type.
-//! - [`BlockTransposed`] / [`RowMajor`] — zero-sized layout markers.
+//! - [`BlockTransposedLayout`] / [`RowMajor`] — zero-sized layout markers.
 //! - [`DescribeLayout`] — bridges matrix types to layout markers.
 //! - [`ConvertTo`] — tile-level conversion (blanket identity + f16→f32).
 
@@ -17,7 +17,7 @@ use diskann_wide::arch::Target2;
 // ── Layout trait ─────────────────────────────────────
 
 /// Memory layout and element type marker for tile data.
-pub(super) trait Layout {
+pub trait Layout {
     type Element: Copy;
 }
 
@@ -25,28 +25,36 @@ pub(super) trait Layout {
 
 /// Block-transposed tile layout: `GROUP` rows per block, `PACK` columns
 /// interleaved. Matches [`BlockTransposedRef`](crate::multi_vector::BlockTransposedRef).
-pub(super) struct BlockTransposed<T, const GROUP: usize, const PACK: usize = 1>(PhantomData<T>);
-
-impl<T, const GROUP: usize, const PACK: usize> BlockTransposed<T, GROUP, PACK> {
+///
+/// This is the zero-sized **layout marker** used in [`Kernel<A>::Left`](super::Kernel::Left) /
+/// [`Kernel<A>::Right`](super::Kernel::Right) associated types. It is distinct
+/// from the owning storage type [`BlockTransposed`](crate::multi_vector::BlockTransposed)
+/// — the marker carries layout information at the type level; the owning
+/// type holds actual data.
+pub struct BlockTransposedLayout<T, const GROUP: usize, const PACK: usize = 1>(PhantomData<T>);
+
+impl<T, const GROUP: usize, const PACK: usize> BlockTransposedLayout<T, GROUP, PACK> {
     pub(super) fn new() -> Self {
         Self(PhantomData)
     }
 }
 
-impl<T, const GROUP: usize, const PACK: usize> Copy for BlockTransposed<T, GROUP, PACK> {}
+impl<T, const GROUP: usize, const PACK: usize> Copy for BlockTransposedLayout<T, GROUP, PACK> {}
 
-impl<T, const GROUP: usize, const PACK: usize> Clone for BlockTransposed<T, GROUP, PACK> {
+impl<T, const GROUP: usize, const PACK: usize> Clone for BlockTransposedLayout<T, GROUP, PACK> {
     fn clone(&self) -> Self {
         *self
     }
 }
 
-impl<T: Copy, const GROUP: usize, const PACK: usize> Layout for BlockTransposed<T, GROUP, PACK> {
+impl<T: Copy, const GROUP: usize, const PACK: usize> Layout
+    for BlockTransposedLayout<T, GROUP, PACK>
+{
     type Element = T;
 }
 
 /// Dense row-major tile layout. Matches [`MatRef<Standard<T>>`](crate::multi_vector::MatRef).
-pub(super) struct RowMajor<T>(PhantomData<T>);
+pub struct RowMajor<T>(PhantomData<T>);
 
 impl<T> RowMajor<T> {
     pub(super) fn new() -> Self {
@@ -70,7 +78,7 @@ impl<T: Copy> Layout for RowMajor<T> {
 
 /// Bridges a concrete matrix type to its [`Layout`] marker, enabling
 /// type inference of [`ConvertTo`] parameters at call sites.
-pub(super) trait DescribeLayout {
+pub trait DescribeLayout {
     type Layout: Layout;
 
     fn layout(&self) -> Self::Layout;
@@ -79,10 +87,10 @@ pub(super) trait DescribeLayout {
 impl<T: Copy, const GROUP: usize, const PACK: usize> DescribeLayout
     for crate::multi_vector::BlockTransposedRef<'_, T, GROUP, PACK>
 {
-    type Layout = BlockTransposed<T, GROUP, PACK>;
+    type Layout = BlockTransposedLayout<T, GROUP, PACK>;
 
     fn layout(&self) -> Self::Layout {
-        BlockTransposed::new()
+        BlockTransposedLayout::new()
     }
 }
 
@@ -108,7 +116,7 @@ impl<T: Copy> DescribeLayout for crate::multi_vector::MatRef<'_, crate::multi_ve
 /// - `convert` reads at most `rows * k` source elements.
 /// - `convert` writes only within `buf`.
 /// - The returned pointer is valid until the next `&mut` access to `buf`.
-pub(super) unsafe trait ConvertTo<A: Architecture, To: Layout>: Layout {
+pub unsafe trait ConvertTo<A: Architecture, To: Layout>: Layout {
     /// Staging buffer for converted tile data (`()` for identity conversions).
     type Buffer;
 
@@ -162,7 +170,8 @@ unsafe impl<A: Architecture, L: Layout> ConvertTo<A, L> for L {
 // into `rows * k` f32 values in `buf`. The returned pointer is
 // `buf.as_ptr()`, valid until the next `&mut` access to `buf`.
 unsafe impl<A, const GROUP: usize, const PACK: usize>
-    ConvertTo<A, BlockTransposed<f32, GROUP, PACK>> for BlockTransposed<half::f16, GROUP, PACK>
+    ConvertTo<A, BlockTransposedLayout<f32, GROUP, PACK>>
+    for BlockTransposedLayout<half::f16, GROUP, PACK>
 where
     A: Architecture,
     SliceCast<f32, half::f16>: for<'a> Target2<A, (), &'a mut [f32], &'a [half::f16]>,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
index bd9121a24..e7fbd16ed 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/mod.rs
@@ -12,27 +12,45 @@
 //! - **Query**: Block-transposed (`GROUP` vectors per block, dimensions contiguous
 //!   within each block). The block size is determined by the kernel's `A_PANEL`.
 //! - **Document**: Row-major (standard [`MatRef`](crate::multi_vector::MatRef) format).
+//!
+//! The [`Kernel<A>`] trait + [`tiled_reduce`] + [`layouts`] are public so
+//! external crates can plug new micro-kernels into the existing orchestrator.
 
 pub(super) mod f16;
 pub(super) mod f32;
-mod layouts;
+pub mod layouts;
 mod reduce;
 mod tiled_reduce;
 
+pub use tiled_reduce::tiled_reduce;
+
 // ── Tile budget ──────────────────────────────────────────────────
 
 /// Cache budgets fed to the tile planner.
 ///
 /// `Default` returns the production budgets derived from hardcoded L1/L2
-/// cache-size estimates and fixed fractions.
+/// cache-size estimates and fixed fractions. Researchers benchmarking with
+/// non-default cache assumptions can construct a custom [`TileBudget`] via
+/// [`TileBudget::new`] and pass it to [`tiled_reduce`].
 #[derive(Debug, Clone, Copy)]
-struct TileBudget {
+pub struct TileBudget {
     /// L2 budget in bytes reserved for A tiles.
     l2_a: usize,
     /// L1 budget in bytes reserved for B tiles (before A-panel subtraction).
     l1_b: usize,
 }
 
+impl TileBudget {
+    /// Build a [`TileBudget`] from caller-chosen L2 and L1 budgets, both in bytes.
+    ///
+    /// * `l2_a` — budget the tile planner uses to size A tiles.
+    /// * `l1_b` — budget for B tiles; one A micro-panel is subtracted at
+    ///   runtime since both must coexist in L1 during the inner loop.
+    pub fn new(l2_a: usize, l1_b: usize) -> Self {
+        TileBudget { l2_a, l1_b }
+    }
+}
+
 impl Default for TileBudget {
     // TODO: Replace hardcoded fallbacks with detected cache sizes
     // (e.g. via `diskann_platform`, env-var override, or runtime query).
@@ -51,7 +69,7 @@ impl Default for TileBudget {
 
 // ── Kernel trait ─────────────────────────────────────────────────
 
-/// SIMD micro-kernel for the [`tiled_reduce`](tiled_reduce::tiled_reduce) loop.
+/// SIMD micro-kernel for the [`tiled_reduce`] loop.
 ///
 /// The kernel only sees already-converted data: storage-layout to
 /// kernel-layout conversion is handled at tile boundaries by
@@ -59,17 +77,32 @@ impl Default for TileBudget {
 /// pointers reference `<Self::Left as Layout>::Element` /
 /// `<Self::Right as Layout>::Element` directly.
 ///
+/// # Invariant
+///
+/// When pairing this kernel with the owning storage type
+/// [`BlockTransposed<T, GROUP>`](crate::multi_vector::BlockTransposed) via
+/// [`tiled_reduce`], the storage's `GROUP` const must equal this kernel's
+/// [`A_PANEL`](Self::A_PANEL). The library's own f32 / f16 paths enforce
+/// this with a `const { assert!(...) }` in `max_ip_kernel`; external
+/// implementors must uphold it manually.
+///
 /// # Safety
 ///
 /// Implementors must respect the per-method `# Safety` contracts on
 /// [`full_panel`](Self::full_panel) and [`partial_panel`](Self::partial_panel).
-unsafe trait Kernel<A: diskann_wide::Architecture> {
+/// Implementations should be validated under Miri: construct arch tokens
+/// via `*::new_checked_miri()` in tests and gate Miri-unsupported
+/// intrinsics with `#[cfg(not(miri))]`.
+pub unsafe trait Kernel<A: diskann_wide::Architecture> {
     /// Layout consumed by the A (left / query) side of the micro-kernel.
     type Left: layouts::Layout;
     /// Layout consumed by the B (right / document) side of the micro-kernel.
     type Right: layouts::Layout;
 
     /// Number of A rows processed per micro-kernel invocation.
+    ///
+    /// Callers of [`tiled_reduce`] must guarantee
+    /// `a_padded_nrows % A_PANEL == 0`.
     const A_PANEL: usize;
     /// Number of B rows processed per micro-kernel invocation.
     const B_PANEL: usize;
@@ -79,10 +112,13 @@ unsafe trait Kernel<A: diskann_wide::Architecture> {
     /// # Safety
     ///
     /// * `a` must point to `A_PANEL * k` contiguous elements of
-    ///   `<Self::Left as Layout>::Element`.
+    ///   `<Self::Left as Layout>::Element`, properly aligned for that layout.
     /// * `b` must point to `B_PANEL * k` contiguous elements of
-    ///   `<Self::Right as Layout>::Element`.
+    ///   `<Self::Right as Layout>::Element`, properly aligned for that layout.
     /// * `r` must point to at least `A_PANEL` writable `f32` values.
+    /// * `k > 0`.
+    /// * The caller must invoke this from within an `arch.run3` (or
+    ///   equivalent) so that target_feature is active for the entire body.
     unsafe fn full_panel(
         arch: A,
         a: *const <Self::Left as layouts::Layout>::Element,
@@ -95,11 +131,8 @@ unsafe trait Kernel<A: diskann_wide::Architecture> {
     ///
     /// # Safety
     ///
-    /// * `a` must point to `A_PANEL * k` contiguous elements of
-    ///   `<Self::Left as Layout>::Element`.
-    /// * `b` must point to `remainder * k` contiguous elements of
-    ///   `<Self::Right as Layout>::Element`.
-    /// * `r` must point to at least `A_PANEL` writable `f32` values.
+    /// Same as [`full_panel`](Self::full_panel) except `b` points to
+    /// `remainder * k` contiguous elements and `1 <= remainder < B_PANEL`.
     unsafe fn partial_panel(
         arch: A,
         remainder: usize,
diff --git a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
index ff873c01f..285d823b6 100644
--- a/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
+++ b/diskann-quantization/src/multi_vector/distance/kernels/tiled_reduce.rs
@@ -89,7 +89,7 @@ impl FullReduce {
 /// * `b_ptr` must be valid for `b_nrows * k` elements of `BElem`.
 /// * `scratch` must have length ≥ `a_padded_nrows` and be initialized by caller.
 #[allow(clippy::too_many_arguments)]
-pub(super) unsafe fn tiled_reduce<A, K, LA, LB>(
+pub unsafe fn tiled_reduce<A, K, LA, LB>(
     arch: A,
     ca: &LA,
     cb: &LB,
@@ -343,7 +343,7 @@ mod tests {
         let b = vec![0.0f32; 2 * k];
         let mut scratch = vec![f32::MIN; 16];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: pointers and scratch are correctly sized; we expect a panic.
@@ -373,7 +373,7 @@ mod tests {
         let b = Vec::<f32>::new();
         let mut scratch = vec![f32::MIN; a_rows];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: k == 0 so no elements are read; pointers are never dereferenced.
@@ -402,7 +402,7 @@ mod tests {
         let a_rows = 8;
         let mut scratch = vec![f32::MIN; a_rows];
 
-        let ca = layouts::BlockTransposed::<f32, 8>::new();
+        let ca = layouts::BlockTransposedLayout::<f32, 8>::new();
         let cb = layouts::RowMajor::<f32>::new();
 
         // SAFETY: k == 0, b_nrows == 0; no elements read.
@@ -516,7 +516,7 @@ mod tests {
         A: Architecture,
         T: Copy + Default,
         F32Kernel<GROUP>: Kernel<A>,
-        layouts::BlockTransposed<T, GROUP>:
+        layouts::BlockTransposedLayout<T, GROUP>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + Layout<Element = T>,
         layouts::RowMajor<T>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right> + Layout<Element = T>,
@@ -698,7 +698,7 @@ mod tests {
         A: Architecture,
         T: Copy + Default,
         F32Kernel<GROUP>: Kernel<A>,
-        layouts::BlockTransposed<T, GROUP>:
+        layouts::BlockTransposedLayout<T, GROUP>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Left> + Layout<Element = T>,
         layouts::RowMajor<T>:
             ConvertTo<A, <F32Kernel<GROUP> as Kernel<A>>::Right> + Layout<Element = T>,
diff --git a/diskann-quantization/src/multi_vector/distance/mod.rs b/diskann-quantization/src/multi_vector/distance/mod.rs
index 853f60753..354a47afe 100644
--- a/diskann-quantization/src/multi_vector/distance/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/mod.rs
@@ -50,10 +50,10 @@
 //! ```
 
 mod fallback;
-mod kernels;
+pub mod kernels;
 mod max_sim;
 mod query_computer;
 
 pub use fallback::QueryMatRef;
 pub use max_sim::{Chamfer, MaxSim, MaxSimError};
-pub use query_computer::QueryComputer;
+pub use query_computer::{DynQueryComputer, QueryComputer};
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
index 9bb348a6a..cb575c09f 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/f16.rs
@@ -19,6 +19,18 @@ impl QueryComputer<half::f16> {
     pub fn new(query: MatRef<'_, Standard<half::f16>>) -> Self {
         diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
     }
+
+    /// Build an f16 query computer pinned to a specific architecture token.
+    ///
+    /// Runs the builder on `arch`; see [`QueryComputer::<f32>::from_arch`] for rationale and usage.
+    pub fn from_arch<A>(query: MatRef<'_, Standard<half::f16>>, arch: A) -> Self
+    where
+        A: Architecture,
+        BuildComputer:
+            for<'a> diskann_wide::arch::Target1<A, Self, MatRef<'a, Standard<half::f16>>>,
+    {
+        arch.run1(BuildComputer, query) // same builder as `new`, run on the supplied token
+    }
 }
 
 impl<A, const GROUP: usize> DynQueryComputer<half::f16>
@@ -51,8 +63,9 @@ where
     }
 }
 
+/// Architecture-dispatch target for `QueryComputer::<half::f16>` construction.
 #[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
+pub struct BuildComputer;
 
 impl diskann_wide::arch::Target1<Scalar, QueryComputer<half::f16>, MatRef<'_, Standard<half::f16>>>
     for BuildComputer
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
index 9ff16b8b4..f8193ad7e 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/f32.rs
@@ -19,6 +19,20 @@ impl QueryComputer<f32> {
     pub fn new(query: MatRef<'_, Standard<f32>>) -> Self {
         diskann_wide::arch::dispatch1_no_features(BuildComputer, query)
     }
+
+    /// Build an f32 query computer pinned to a specific architecture token.
+    ///
+    /// The caller obtains the token via `Scalar::new()` (always available)
+    /// or `V3::new_checked()` / `V4::new_checked()` / `Neon::new_checked()`
+    /// (which check CPU support). Use this constructor to A/B compare kernels
+    /// across ISAs on the same machine.
+    pub fn from_arch<A>(query: MatRef<'_, Standard<f32>>, arch: A) -> Self
+    where
+        A: Architecture,
+        BuildComputer: for<'a> diskann_wide::arch::Target1<A, Self, MatRef<'a, Standard<f32>>>,
+    {
+        arch.run1(BuildComputer, query) // same builder as `new`, run on the supplied token
+    }
 }
 
 impl<A, const GROUP: usize> DynQueryComputer<f32> for Prepared<A, BlockTransposed<f32, GROUP>>
@@ -50,8 +64,9 @@ where
     }
 }
 
+/// Architecture-dispatch target for `QueryComputer::<f32>` construction.
 #[derive(Debug, Clone, Copy)]
-pub(super) struct BuildComputer;
+pub struct BuildComputer;
 
 impl diskann_wide::arch::Target1<Scalar, QueryComputer<f32>, MatRef<'_, Standard<f32>>>
     for BuildComputer
diff --git a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
index fbe84fcd3..efd552076 100644
--- a/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
+++ b/diskann-quantization/src/multi_vector/distance/query_computer/mod.rs
@@ -43,6 +43,16 @@ pub struct QueryComputer<T: Copy> {
 }
 
 impl<T: Copy> QueryComputer<T> {
+    /// Wrap any [`DynQueryComputer<T>`] implementation as a `QueryComputer<T>`.
+    ///
+    /// This is the public seam for experimental kernels: implement
+    /// [`DynQueryComputer<T>`] on a custom struct, then wrap it here so the
+    /// existing [`QueryComputer::max_sim`] / [`QueryComputer::chamfer`] veneer
+    /// works against it.
+    pub fn from_dyn(inner: Box<dyn DynQueryComputer<T>>) -> Self {
+        Self { inner } // stored as-is; no validation of `inner` happens here
+    }
+
     /// Number of logical (non-padded) query vectors.
     #[inline]
     pub fn nrows(&self) -> usize {
@@ -88,8 +98,27 @@ impl<T: Copy> QueryComputer<T> {
     }
 }
 
-trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
+/// Object-safe interface for "anything that can compute MaxSim for a [`QueryComputer<T>`]".
+///
+/// The library's own architecture-dispatched path implements this on the
+/// internal `Prepared<A, Q>` carriers. External crates implement it on their
+/// own structs and wrap via [`QueryComputer::from_dyn`].
+///
+/// # Contract
+///
+/// - [`compute_max_sim`](Self::compute_max_sim) is only invoked by
+///   [`QueryComputer::max_sim`], which has already asserted
+///   `scores.len() == self.nrows()` and short-circuited the zero-doc case.
+///   Implementations may rely on `scores.len() == self.nrows()` and
+///   `doc.num_vectors() > 0`.
+/// - Implementations must populate all `nrows()` entries of `scores`.
+///   [`QueryComputer::chamfer`] sums every entry, so leaving any trailing
+///   slot unwritten would silently corrupt the result.
+pub trait DynQueryComputer<T: Copy>: std::fmt::Debug + Send + Sync {
+    /// Compute MaxSim into `scores`. See trait-level docs for the contract.
     fn compute_max_sim(&self, doc: MatRef<'_, Standard<T>>, scores: &mut [f32]);
+
+    /// Number of query rows, i.e. the `scores` length `compute_max_sim` is called with.
     fn nrows(&self) -> usize;
 }
 
@@ -287,4 +316,38 @@ mod tests {
 
     test_matches_fallback!(f32, f32, 1e-10, "f32 ");
     test_matches_fallback!(f16, half::f16, 1e-10, "f16 ");
+
+    // ============================================================
+    // from_dyn: wrap a trivial custom DynQueryComputer.
+    // ============================================================
+    #[derive(Debug)]
+    struct ConstantComputer {
+        nrows: usize, // value reported by `nrows()`
+        value: f32, // score written into every slot by `compute_max_sim`
+    }
+
+    impl DynQueryComputer<f32> for ConstantComputer {
+        fn compute_max_sim(&self, _doc: MatRef<'_, Standard<f32>>, scores: &mut [f32]) {
+            // The document is ignored: every slot receives the same constant.
+            let constant = self.value;
+            scores.fill(constant);
+        }
+        fn nrows(&self) -> usize {
+            self.nrows
+        }
+    }
+
+    #[test]
+    fn from_dyn_wraps_custom_impl() {
+        let inner = ConstantComputer {
+            nrows: 3,
+            value: -1.5,
+        };
+        let wrapped = QueryComputer::<f32>::from_dyn(Box::new(inner));
+        assert_eq!(wrapped.nrows(), 3); // forwarded from the wrapped implementation
+        let doc = make_mat(&[1.0f32, 0.0, 0.0, 1.0], 2, 2);
+        let mut scores = vec![0.0f32; 3];
+        wrapped.max_sim(doc, &mut scores);
+        assert_eq!(scores, vec![-1.5; 3]); // every slot overwritten with the constant
+    }
 }
diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs
index 70629d44c..734cb7247 100644
--- a/diskann-quantization/src/multi_vector/matrix.rs
+++ b/diskann-quantization/src/multi_vector/matrix.rs
@@ -712,6 +712,22 @@ impl<T: NewCloned> Clone for Mat<T> {
 }
 
 impl<T: Copy> Mat<Standard<T>> {
+    /// Build a matrix by calling `f` once per element, filling in row-major order.
+    ///
+    /// # Example
+    /// ```
+    /// use diskann_quantization::multi_vector::{Mat, Standard};
+    /// let mut n = 0;
+    /// let mat = Mat::from_fn(Standard::<i32>::new(1, 4).unwrap(), || { n += 1; n });
+    /// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]);
+    /// ```
+    pub fn from_fn<F: FnMut() -> T>(repr: Standard<T>, mut f: F) -> Self {
+        let len = repr.num_elements();
+        let data: Box<[T]> = std::iter::repeat_with(&mut f).take(len).collect();
+        // SAFETY: `take(len)` yields exactly `len == repr.num_elements()` items.
+        unsafe { repr.box_to_mat(data) }
+    }
+
     /// Returns the raw dimension (columns) of the vectors in the matrix.
     #[inline]
     pub fn vector_dim(&self) -> usize {
@@ -1767,6 +1783,18 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_standard_from_fn() {
+        let mut next: i32 = 0;
+        let mat = Mat::from_fn(Standard::<i32>::new(2, 3).unwrap(), || {
+            // Post-increment: hand out the current value, then advance.
+            next += 1;
+            next - 1
+        });
+
+        assert_eq!(mat.as_slice(), &[0, 1, 2, 3, 4, 5]);
+    }
+
     #[test]
     fn matref_new_slice_length_error() {
         let repr = Standard::<u32>::new(3, 4).unwrap();