Skip to content

Commit 3e95c46

Browse files
authored
Merge pull request #75 from github/sc-20250725-required-features
Testing features
2 parents 7c58e40 + 9e717e7 commit 3e95c46

7 files changed

Lines changed: 91 additions & 39 deletions

File tree

crates/geo_filters/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ bench = false
1414

1515
[features]
1616
default = []
17+
test-support = ["dep:rand", "dep:rand_chacha"]
18+
serde = ["dep:serde"]
1719
evaluation = [
1820
"dep:clap",
1921
"dep:hyperloglogplus",
@@ -31,6 +33,8 @@ once_cell = "1.18"
3133
rand = { version = "0.9", optional = true }
3234
rayon = { version = "1.7", optional = true }
3335
regex = { version = "1", optional = true }
36+
serde = { version = "1.0", default-features = false, optional = true }
37+
rand_chacha = { version = "0.9", optional = true }
3438

3539
[dev-dependencies]
3640
criterion = "0.7"

crates/geo_filters/evaluation/performance.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
use criterion::{black_box, criterion_group, criterion_main, Criterion};
1+
use std::hint::black_box;
2+
3+
use criterion::{criterion_group, criterion_main, Criterion};
24
use geo_filters::build_hasher::UnstableDefaultBuildHasher;
35
use geo_filters::config::VariableConfig;
46
use geo_filters::diff_count::{GeoDiffCount, GeoDiffCount13};

crates/geo_filters/src/config.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,13 +353,14 @@ pub(crate) fn take_ref<I: Iterator>(iter: &mut I, n: usize) -> impl Iterator<Ite
353353

354354
#[cfg(test)]
355355
pub(crate) mod tests {
356-
use rand::{rngs::StdRng, RngCore};
356+
use rand::RngCore;
357+
use rand_chacha::ChaCha12Rng;
357358

358359
use crate::{Count, Method};
359360

360361
/// Runs estimation trials and returns the average precision and variance.
361362
pub(crate) fn test_estimate<M: Method, C: Count<M>>(
362-
rnd: &mut StdRng,
363+
rnd: &mut ChaCha12Rng,
363364
f: impl Fn() -> C,
364365
) -> (f32, f32) {
365366
let cnt = 10000usize;

crates/geo_filters/src/config/lookup.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ impl HashToBucketLookup {
4545

4646
#[cfg(test)]
4747
mod tests {
48-
use rand::{rngs::StdRng, RngCore};
48+
use rand::RngCore;
49+
use rand_chacha::ChaCha12Rng;
4950

5051
use crate::{
5152
config::{hash_to_bucket, phi_f64},
@@ -70,7 +71,7 @@ mod tests {
7071
});
7172
}
7273

73-
fn lookup_random_hashes_variance<const B: usize>(rnd: &mut StdRng, n: u64) -> f64 {
74+
fn lookup_random_hashes_variance<const B: usize>(rnd: &mut ChaCha12Rng, n: u64) -> f64 {
7475
let phi = phi_f64(B);
7576
let buckets = HashToBucketLookup::new(B);
7677

crates/geo_filters/src/diff_count.rs

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ mod sim_hash;
1818

1919
use bitvec::*;
2020
pub use config::{GeoDiffConfig13, GeoDiffConfig7};
21+
pub use sim_hash::{SimHash, SIM_BUCKETS, SIM_BUCKET_SIZE};
2122

2223
/// Diff count filter with a relative error standard deviation of ~0.125.
2324
pub type GeoDiffCount7<'a> = GeoDiffCount<'a, GeoDiffConfig7>;
@@ -302,7 +303,7 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
302303

303304
/// Create a new [`GeoDiffCount`] from a slice of bytes
304305
#[cfg(target_endian = "little")]
305-
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
306+
pub fn from_bytes_with_config(c: C, buf: &'a [u8]) -> Self {
306307
if buf.is_empty() {
307308
return Self::new(c);
308309
}
@@ -338,6 +339,53 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
338339
bytes_written += self.lsb.write(writer)?;
339340
Ok(bytes_written)
340341
}
342+
343+
#[cfg(any(test, feature = "test-support"))]
/// Builds a filter from `config` and calls `xor_bit` once for every bucket
/// yielded by `ones`.
///
/// Only compiled under `cfg(test)` or when the `test-support` feature is
/// enabled.
// NOTE(review): because bits are applied with `xor_bit`, a bucket yielded
// twice is presumably toggled back off — confirm against `xor_bit`.
344+
pub fn from_ones_with_config(config: C, ones: impl IntoIterator<Item = C::BucketType>) -> Self {
345+
let mut result = Self::new(config);
346+
for one in ones {
347+
result.xor_bit(one);
348+
}
349+
result
350+
}
351+
352+
#[cfg(any(test, feature = "test-support"))]
/// Returns an iterator over the buckets produced by the free `iter_ones`
/// helper applied to this filter's `bit_chunks()`, with each index converted
/// via `C::BucketType::from_usize`.
///
/// Only compiled under `cfg(test)` or when the `test-support` feature is
/// enabled.
353+
pub fn iter_ones(&self) -> impl Iterator<Item = C::BucketType> + '_ {
354+
iter_ones(self.bit_chunks().peekable()).map(C::BucketType::from_usize)
355+
}
356+
357+
/// Generate a pseudo-random filter. The RNG used to build the filter
358+
/// is seeded with the number of items, so for a given config and number of
359+
/// items the resulting geofilter should always be the same.
///
/// `items` hashes are drawn from the RNG and pushed with `push_hash`.
/// Only compiled under `cfg(test)` or when the `test-support` feature is
/// enabled.
360+
#[cfg(any(test, feature = "test-support"))]
361+
pub fn pseudorandom_filter_with_config(config: C, items: usize) -> Self {
362+
use rand::RngCore;
363+
use rand_chacha::rand_core::SeedableRng;
364+
365+
// Seed from the item count so repeated calls with the same count replay
// the same sequence of hashes.
let mut rng = rand_chacha::ChaCha12Rng::seed_from_u64(items as u64);
366+
let mut filter = Self::new(config);
367+
for _ in 0..items {
368+
filter.push_hash(rng.next_u64());
369+
}
370+
filter
371+
}
372+
}
373+
374+
impl<'a, C: GeoConfig<Diff> + Default> GeoDiffCount<'a, C> {
// Convenience constructors for configs that implement `Default`: each one
// forwards to its `*_with_config` counterpart with `C::default()`.
375+
#[cfg(target_endian = "little")]
/// Create a new [`GeoDiffCount`] from a slice of bytes, using
/// `C::default()` as the configuration.
///
/// Only available on little-endian targets.
376+
pub fn from_bytes(buf: &'a [u8]) -> Self {
377+
Self::from_bytes_with_config(C::default(), buf)
378+
}
379+
380+
#[cfg(any(test, feature = "test-support"))]
/// Builds a filter with `C::default()` by forwarding `ones` to
/// `from_ones_with_config`.
///
/// Only compiled under `cfg(test)` or with the `test-support` feature.
381+
pub fn from_ones(ones: impl IntoIterator<Item = C::BucketType>) -> Self {
382+
Self::from_ones_with_config(C::default(), ones)
383+
}
384+
385+
#[cfg(any(test, feature = "test-support"))]
/// Builds a pseudo-random filter with `C::default()` by forwarding `items`
/// to `pseudorandom_filter_with_config`.
///
/// Only compiled under `cfg(test)` or with the `test-support` feature.
386+
pub fn pseudorandom_filter(items: usize) -> Self {
387+
Self::pseudorandom_filter_with_config(C::default(), items)
388+
}
341389
}
342390

343391
/// Applies a repeated bit mask to the underlying filter.
@@ -419,11 +467,12 @@ mod tests {
419467
use std::io::Write;
420468

421469
use itertools::Itertools;
422-
use rand::{rngs::StdRng, seq::IteratorRandom, RngCore};
470+
use rand::{seq::IteratorRandom, RngCore};
471+
use rand_chacha::ChaCha12Rng;
423472

424473
use crate::{
425474
build_hasher::UnstableDefaultBuildHasher,
426-
config::{iter_ones, tests::test_estimate, FixedConfig},
475+
config::{tests::test_estimate, FixedConfig},
427476
test_rng::prng_test_harness,
428477
};
429478

@@ -458,8 +507,8 @@ mod tests {
458507

459508
#[test]
460509
fn test_xor() {
461-
let a = GeoDiffCount7::from_ones(Default::default(), 0..1000);
462-
let b = GeoDiffCount7::from_ones(Default::default(), 10..1010);
510+
let a = GeoDiffCount7::from_ones(0..1000);
511+
let b = GeoDiffCount7::from_ones(10..1010);
463512
let c = xor(&a, &b);
464513
let d = xor(&a, &b);
465514
assert_eq!(a.iter_ones().count(), 1000);
@@ -479,7 +528,7 @@ mod tests {
479528
m.xor_bit(10);
480529
assert!(m.iter_ones().collect_vec().is_empty());
481530

482-
let mut m = GeoDiffCount7::from_ones(Default::default(), 0..100);
531+
let mut m = GeoDiffCount7::from_ones(0..100);
483532
assert_eq!(m.iter_ones().count(), 100);
484533
m.xor_bit(10);
485534
assert_eq!(m.iter_ones().count(), 99);
@@ -561,20 +610,19 @@ mod tests {
561610
// masked bitset : 010000 100100 000000
562611
// after compression : 01 0 10 1 00 0
563612
// bitset of the returned filter : 010 101000
564-
let m = GeoDiffCount7::from_ones(Default::default(), [16, 15, 13, 11, 9, 8, 6, 3, 1]);
613+
let m = GeoDiffCount7::from_ones([16, 15, 13, 11, 9, 8, 6, 3, 1]);
565614
let n = masked(&m, 0b110100, 6);
566615
assert_eq!(n.iter_ones().collect_vec(), vec![16, 11, 8]);
567616

568617
for i in 0..100 {
569-
let m = GeoDiffCount7::from_ones(Default::default(), (0..i).collect_vec());
618+
let m = GeoDiffCount7::from_ones((0..i).collect_vec());
570619
let n = masked(&m, 0b111, 3);
571620
assert_eq!(m, n);
572621
}
573622

574623
for i in 0..300 {
575-
let m = GeoDiffCount7::from_ones(Default::default(), (0..i).collect_vec());
576-
let slow =
577-
GeoDiffCount::from_ones(Default::default(), masked(&m, 0b110, 3).iter_ones());
624+
let m = GeoDiffCount7::from_ones((0..i).collect_vec());
625+
let slow = GeoDiffCount::from_ones(masked(&m, 0b110, 3).iter_ones());
578626
let n = masked(&m, 0b110, 3);
579627
assert_eq!(slow, n, "in iteration: {i}");
580628
}
@@ -626,20 +674,6 @@ mod tests {
626674
assert_eq!(vec![17, 11, 7], a.msb.iter().copied().collect_vec());
627675
}
628676

629-
impl<C: GeoConfig<Diff>> GeoDiffCount<'_, C> {
630-
fn from_ones(config: C, ones: impl IntoIterator<Item = C::BucketType>) -> Self {
631-
let mut result = Self::new(config);
632-
for one in ones {
633-
result.xor_bit(one);
634-
}
635-
result
636-
}
637-
638-
fn iter_ones(&self) -> impl Iterator<Item = C::BucketType> + '_ {
639-
iter_ones(self.bit_chunks().peekable()).map(C::BucketType::from_usize)
640-
}
641-
}
642-
643677
#[test]
644678
fn test_serialization_empty() {
645679
let before = GeoDiffCount7::default();
@@ -649,15 +683,15 @@ mod tests {
649683

650684
assert_eq!(writer.len(), 0);
651685

652-
let after = GeoDiffCount7::from_bytes(before.config.clone(), &writer);
686+
let after = GeoDiffCount7::from_bytes_with_config(before.config.clone(), &writer);
653687

654688
assert_eq!(before, after);
655689
}
656690

657691
// This helper exists in order to easily test serializing types with different
658692
// bucket types in the MSB sparse bit field representation. See tests below.
659693
#[cfg(target_endian = "little")]
660-
fn serialization_round_trip<C: GeoConfig<Diff> + Default>(rnd: &mut StdRng) {
694+
fn serialization_round_trip<C: GeoConfig<Diff> + Default>(rnd: &mut ChaCha12Rng) {
661695
// Run 100 simulations of random values being put into
662696
// a diff counter. "Serializing" to a vector to emulate
663697
// writing to a disk, and then deserializing and asserting
@@ -676,7 +710,10 @@ mod tests {
676710
let pad_amount = (0..8).choose(rnd).unwrap();
677711
writer.write_all(&padding[..pad_amount]).unwrap();
678712
before.write(&mut writer).unwrap();
679-
let after = GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad_amount..]);
713+
let after = GeoDiffCount::<'_, C>::from_bytes_with_config(
714+
before.config.clone(),
715+
&writer[pad_amount..],
716+
);
680717
assert_eq!(before, after);
681718
}
682719

crates/geo_filters/src/diff_count/sim_hash.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,22 @@ use crate::Diff;
1111

1212
use super::BitVec;
1313

14+
// TODO: migrate these constants so they are defined in the configuration.
15+
// The current values are only really appropriate for smaller
16+
// configurations.
17+
1418
/// Number of bits covered by each SimHash bucket.
15-
pub(crate) const SIM_BUCKET_SIZE: usize = 6;
19+
pub const SIM_BUCKET_SIZE: usize = 6;
1620
/// Number of consecutive SimHash buckets used for searching.
17-
pub(crate) const SIM_BUCKETS: usize = 20;
21+
pub const SIM_BUCKETS: usize = 20;
1822

1923
pub type BucketId = usize;
2024

2125
/// SimHash is a hash computed over a continuous range of bits from a GeoDiffCount.
2226
/// It is used to quickly find similar sets with a reverse index.
2327
#[derive(Copy, Clone, Default, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
28+
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
29+
#[cfg_attr(feature = "serde", serde(transparent))]
2430
pub struct SimHash(pub u64);
2531

2632
impl SimHash {

crates/geo_filters/src/test_rng.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::panic::{catch_unwind, resume_unwind, AssertUnwindSafe};
22

3-
use rand::{rngs::StdRng, SeedableRng as _};
3+
use rand::SeedableRng as _;
4+
use rand_chacha::ChaCha12Rng;
45

56
/// Provides a seeded random number generator to tests which require some
67
/// degree of randomization. If the test panics the harness will print the
@@ -12,7 +13,7 @@ use rand::{rngs::StdRng, SeedableRng as _};
1213
/// is only run once with this seed.
1314
pub fn prng_test_harness<F>(iterations: usize, mut test_fn: F)
1415
where
15-
F: FnMut(&mut StdRng),
16+
F: FnMut(&mut ChaCha12Rng),
1617
{
1718
let maybe_manual_seed = std::env::var("TEST_SEED")
1819
.map(|s| s.parse::<u64>().expect("Parse TEST_SEED to u64"))
@@ -21,12 +22,12 @@ where
2122
let maybe_panic = catch_unwind(AssertUnwindSafe(|| {
2223
if let Some(manual_seed) = maybe_manual_seed {
2324
seed = manual_seed;
24-
let mut rng = StdRng::seed_from_u64(seed);
25+
let mut rng = ChaCha12Rng::seed_from_u64(seed);
2526
test_fn(&mut rng);
2627
} else {
2728
for _ in 0..iterations {
2829
seed = rand::random();
29-
let mut rng = StdRng::seed_from_u64(seed);
30+
let mut rng = ChaCha12Rng::seed_from_u64(seed);
3031
test_fn(&mut rng);
3132
}
3233
}

0 commit comments

Comments
 (0)