Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ env:
CARGO_TERM_COLOR: always
# The features we want to explicitly test. For example, the `flatbuffers-build` feature
# of `diskann-quantization` requires additional setup and so must not be included by default.
DISKANN_FEATURES: "virtual_storage,bf_tree,spherical-quantization,product-quantization,tracing,experimental_diversity_search,disk-index,flatbuffers,linalg,codegen"
DISKANN_FEATURES: "virtual_storage,spherical-quantization,product-quantization,tracing,experimental_diversity_search,disk-index,flatbuffers,linalg,codegen"

# Intel SDE version used for baseline and AVX-512 emulation jobs.
SDE_VERSION: "sde-external-10.7.0-2026-02-18-lin"
Expand Down
24 changes: 22 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ members = [
"diskann-benchmark",
"diskann-tools",
"vectorset",
"diskann-bftree",
]

default-members = [
Expand Down
43 changes: 43 additions & 0 deletions diskann-bftree/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
[package]
name = "diskann-bftree"
version.workspace = true
description.workspace = true
authors.workspace = true
documentation.workspace = true
license.workspace = true
edition.workspace = true
Comment thread
JordanMaples marked this conversation as resolved.
publish = false

[dependencies]
bf-tree.workspace = true
bytemuck = { workspace = true, features = ["must_cast"] }
diskann.workspace = true
diskann-providers.workspace = true
diskann-quantization = { workspace = true, features = ["flatbuffers"] }
diskann-utils.workspace = true
diskann-vector.workspace = true
half = { workspace = true, features = ["bytemuck", "num-traits"] }
futures-util.workspace = true
rand.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }

[dev-dependencies]
diskann = { workspace = true, features = ["testing"] }
diskann-providers = { workspace = true, features = ["testing"] }
diskann-utils = { workspace = true, features = ["testing"] }
rstest.workspace = true
tempfile.workspace = true
tokio = { workspace = true, features = ["full"] }

[features]
default = []
experimental_diversity_search = ["diskann/experimental_diversity_search"]

[lints]
workspace = true

172 changes: 172 additions & 0 deletions diskann-bftree/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT license.
*/

//! BfTree-based data provider for DiskANN async indexes.
//!
//! This crate provides a [`BfTree`](bf_tree::BfTree)-backed implementation of the DiskANN
//! [`DataProvider`](diskann::provider::DataProvider) trait, enabling indexes that can
//! transparently spill to disk for datasets larger than available memory.

pub mod neighbors;
pub mod provider;
pub mod quant;
pub mod vectors;

// Accessors
pub use provider::{
AsVectorDtype, BfTreePaths, BfTreeProvider, BfTreeProviderParameters, CreateQuantProvider,
FullAccessor, GraphParams, Hidden, QuantAccessor, StartPoint, VectorDtype,
};

pub use bf_tree::Config;

use diskann::{
error::{RankedError, TransientError},
ANNError,
};

/// Unit marker type carrying no data.
///
/// NOTE(review): nothing in this file uses it; presumably it is a type-level
/// placeholder meaning "no store configured" — confirm against its users.
#[derive(Debug, Clone, Copy)]
pub struct NoStore;

/// Wrapper around [`bf_tree::ConfigError`] that implements [`std::error::Error`].
///
/// The wrapped value is exposed as the public tuple field `.0`, so callers can
/// still inspect the original BfTree error.
#[derive(Debug, Clone)]
pub struct ConfigError(pub bf_tree::ConfigError);

impl std::fmt::Display for ConfigError {
    /// Writes a fixed prefix followed by the `Debug` rendering of the wrapped
    /// [`bf_tree::ConfigError`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The inner error is rendered with `{:?}`.
        f.write_fmt(format_args!("BfTree configuration error: {:?}", self.0))
    }
}

// No methods are overridden: `source()` keeps its default, so the wrapped
// BfTree error is only visible through the `Display`/`Debug` output and is not
// reported as an error-chain source.
impl std::error::Error for ConfigError {}

impl From<ConfigError> for ANNError {
    /// Wraps a BfTree configuration failure in an [`ANNError`] of kind
    /// [`IndexError`](diskann::ANNErrorKind::IndexError).
    // NOTE(review): `#[track_caller]` + `#[inline(never)]` presumably let
    // `ANNError::new` record the location of the conversion site — confirm.
    #[track_caller]
    #[inline(never)]
    fn from(config_error: ConfigError) -> Self {
        let kind = diskann::ANNErrorKind::IndexError;
        Self::new(kind, config_error)
    }
}

/// Types that can be borrowed as the byte-string key handed to the BfTree.
trait AsKey {
    /// Borrows `self` as its key bytes.
    fn as_key(&self) -> &[u8];
}

impl AsKey for usize {
    /// Returns the in-memory bytes of the integer, without copying.
    ///
    /// NOTE(review): the bytes follow the platform's `usize` width and
    /// endianness, so keys are presumably not portable across architectures —
    /// confirm this is acceptable for any data that reaches disk.
    fn as_key(&self) -> &[u8] {
        bytemuck::bytes_of::<usize>(self)
    }
}

////////////
// Errors //
////////////
/// Reason a vector could not be returned.
#[derive(Debug)]
pub enum VectorError {
    /// The vector has been explicitly deleted.
    Deleted,
    /// The key was not found.
    NotFound,
}

/// A failed vector lookup: which vector it was, and why it was unavailable.
#[derive(Debug)]
pub struct VectorUnavailable {
    /// Id of the vector the failure refers to.
    pub id: usize,
    /// Why the vector was unavailable.
    pub err: VectorError,
}

impl std::fmt::Display for VectorUnavailable {
    /// Renders a short, human-readable message naming the vector id and the
    /// reason it was unavailable.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive on purpose: a new `VectorError` variant must get its own message.
        match &self.err {
            VectorError::Deleted => {
                f.write_fmt(format_args!("vector {} was deleted", self.id))
            }
            VectorError::NotFound => {
                f.write_fmt(format_args!("vector {} not found", self.id))
            }
        }
    }
}

impl TransientError<ANNError> for VectorUnavailable {
    /// Consumes the error without doing anything; `_why` is ignored.
    fn acknowledge<D: std::fmt::Display>(self, _why: D) {
        // no-op: we are expecting transient deletion errors during traversal
    }

    /// Converts the error into an [`ANNError`] whose message is this error's
    /// `Display` text followed by the escalation reason `why`.
    fn escalate<D: std::fmt::Display>(self, why: D) -> ANNError {
        let message = format!("{self}, escalated: {why}");
        ANNError::log_index_error(message)
    }
}

/// Ranked error that uses [`VectorUnavailable`] as its transient error type and
/// [`ANNError`] as its full error type.
// NOTE(review): the exact semantics come from `diskann::error::RankedError` — confirm there.
pub type AccessError = RankedError<VectorUnavailable, ANNError>;

/// Metrics recorded by [`DefaultContext`](diskann::provider::DefaultContext).
///
/// `serde::Serialize` / `serde::Deserialize` are derived only in test builds.
#[derive(Debug, Clone)]
#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize))]
pub struct ContextMetrics {
// NOTE(review): presumably the number of tasks spawned through the context — confirm.
pub spawns: usize,
// NOTE(review): presumably the number of times the context was cloned — confirm.
pub clones: usize,
}

/// An atomic call counter used for test instrumentation.
///
/// This definition is compiled only under `#[cfg(test)]`, where it is a real
/// atomic counter. Production builds use a separate definition whose methods
/// are all no-ops that the compiler can eliminate entirely.
#[cfg(test)]
pub(crate) struct TestCallCount {
    /// Number of `increment` calls observed so far.
    count: std::sync::atomic::AtomicUsize,
}

#[cfg(test)]
impl TestCallCount {
    /// Creates a counter starting at zero.
    pub fn new() -> Self {
        let count = std::sync::atomic::AtomicUsize::default();
        Self { count }
    }

    /// Reports whether counting is active; always `true` for this definition.
    pub fn enabled() -> bool {
        true
    }

    /// Adds one to the counter.
    pub fn increment(&self) {
        use std::sync::atomic::Ordering;

        // The previous value returned by `fetch_add` is not needed.
        self.count.fetch_add(1, Ordering::Relaxed);
    }

    /// Returns the current count.
    pub fn get(&self) -> usize {
        use std::sync::atomic::Ordering;

        self.count.load(Ordering::Relaxed)
    }
}

/// Field-less stand-in for the test-only call counter, compiled into non-test builds.
// NOTE(review): `dead_code` is presumably allowed because non-test builds may
// never touch the counter — confirm.
#[cfg(not(test))]
#[allow(dead_code)]
pub(crate) struct TestCallCount {}
Comment thread
JordanMaples marked this conversation as resolved.

#[cfg(not(test))]
#[allow(dead_code)]
impl TestCallCount {
pub fn new() -> Self {
Self {}
}

pub fn enabled() -> bool {
false
}

pub fn increment(&self) {}

pub fn get(&self) -> usize {
0
}
}

impl Default for TestCallCount {
fn default() -> Self {
Self::new()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@

use std::marker::PhantomData;

use crate::AsKey;
use bf_tree::{BfTree, Config};
use bytemuck::{bytes_of, cast_slice, cast_slice_mut};
use diskann::{
ANNError, ANNResult,
graph::AdjacencyList,
provider::HasId,
utils::{IntoUsize, TryIntoVectorId, VectorId},
ANNError, ANNResult,
};

use super::super::common::TestCallCount;
use super::ConfigError;
use crate::TestCallCount;

pub struct NeighborProvider<I: VectorId> {
adjacency_list_index: BfTree,
dim: usize, // Max number of neighbors in a neighbor list + 1 for the neighbor count
pub num_get_calls: TestCallCount,
#[allow(dead_code)]
pub(crate) num_get_calls: TestCallCount,
_phantom: PhantomData<I>,
}

Expand Down Expand Up @@ -81,7 +83,7 @@ impl<I: VectorId> NeighborProvider<I> {

// Serialize the key, vector_id, into a byte string, &[u8]
let i = vector_id.into_usize();
let key = bytes_of::<usize>(&i);
let key = i.as_key();

// Search and retrieve the corresponding neighbor list data as a byte string, &[u8], in the format of
// |VectorId|VectorId|...|Invalid|Invalid|VectorId (list length)|
Expand Down Expand Up @@ -166,7 +168,7 @@ impl<I: VectorId> NeighborProvider<I> {

// Serialize the key, vector_id, into a byte string, &[u8]
let i = vector_id.into_usize();
let key = bytes_of::<usize>(&i);
let key = i.as_key();

// Serialize the value, neighbor list, into a byte string, &[u8]
let neighbor_list_edges_in_byte = cast_slice::<I, u8>(neighbors);
Expand Down Expand Up @@ -218,7 +220,7 @@ impl<I: VectorId> NeighborProvider<I> {
// We avoid one data copy by directly writing to bf-tree instead of invoking set_neighbor()
// Also avoid a bunch of unnecessary checks
let i = vector_id.into_usize();
let key = bytes_of::<usize>(&i);
let key = i.as_key();
let value = cast_slice::<I, u8>(&neighbor_list);
self.adjacency_list_index.insert(key, value);
}
Expand All @@ -229,7 +231,7 @@ impl<I: VectorId> NeighborProvider<I> {
pub fn delete_vector(&self, vector_id: I) -> ANNResult<()> {
// Serialize the key, vector_id, into a byte string, &[u8]
let i = vector_id.into_usize();
let key = bytes_of::<usize>(&i);
let key = i.as_key();

self.adjacency_list_index.delete(key);
Ok(())
Expand Down
Loading
Loading