Skip to content

Commit 3fc9ff3

Browse files
committed
Reworks for geofilter serialization work
- Remove serde module. Put code into diff_count.rs. - Remove bonus comments - Remove panic-y big endian serialization stuff. - Squish together lines
1 parent a80c3e6 commit 3fc9ff3

3 files changed

Lines changed: 63 additions & 99 deletions

File tree

crates/geo_filters/src/diff_count.rs

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use std::borrow::Cow;
44
use std::cmp::Ordering;
55
use std::hash::BuildHasher as _;
66
use std::mem::{size_of, size_of_val};
7+
use std::ops::Deref as _;
78

89
use crate::config::{
910
count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones,
@@ -13,7 +14,6 @@ use crate::{Count, Diff};
1314

1415
mod bitvec;
1516
mod config;
16-
mod serde;
1717
mod sim_hash;
1818

1919
use bitvec::*;
@@ -216,16 +216,11 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
216216
let msb = self.msb.to_mut();
217217
match msb.binary_search_by(|k| bucket.cmp(k)) {
218218
Ok(idx) => {
219-
// The bit is already set in the MSB sparse bitset, remove it (XOR)
220219
msb.remove(idx);
221-
222-
// We have removed a value from our MSB, move a value in the
223-
// LSB into the MSB
224220
let (first, second) = {
225221
let mut lsb = iter_ones(self.lsb.bit_chunks().peekable());
226222
(lsb.next(), lsb.next())
227223
};
228-
229224
let new_smallest = if let Some(smallest) = first {
230225
msb.push(C::BucketType::from_usize(smallest));
231226
second.map(|_| smallest).unwrap_or(0)
@@ -243,12 +238,10 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
243238
.pop()
244239
.expect("we should have at least one element!")
245240
.into_usize();
246-
247241
let new_smallest = msb
248242
.last()
249243
.expect("should have at least one element")
250244
.into_usize();
251-
252245
// ensure LSB bit vector has the space for `smallest`
253246
self.lsb.resize(new_smallest);
254247
self.lsb.toggle(smallest);
@@ -293,6 +286,57 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
293286
self.lsb.num_bits(),
294287
);
295288
}
289+
290+
// Serialization:
291+
//
292+
// Since most of our target platforms are little endian there are more optimised approaches
293+
// for little endian platforms, just splatting the bytes into the writer. This is contrary
294+
// to the usual "network endian" approach where big endian is the default, but most of our
295+
// consumers are little endian so it makes sense for this to be the optimal approach.
296+
//
297+
// For now we do not support big endian platforms. In the future we might add a big endian
298+
// platform specific implementation which is able to read the little endian serialized
299+
// representation. For now, if you attempt to serialize a filter on a big endian platform
300+
// you get a panic.
301+
302+
/// Create a new [`GeoDiffCount`] from a slice of bytes
303+
#[cfg(target_endian = "little")]
304+
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
305+
if buf.is_empty() {
306+
return Self::new(c);
307+
}
308+
// The number of most significant bits stores in the MSB sparse repr
309+
let msb_len = (buf.len() / size_of::<C::BucketType>()).min(c.max_msb_len());
310+
let msb = unsafe {
311+
std::mem::transmute::<&[u8], &[C::BucketType]>(std::slice::from_raw_parts(
312+
buf.as_ptr(),
313+
msb_len,
314+
))
315+
};
316+
// The number of bytes representing the MSB - this is how many bytes we need to
317+
// skip over to reach the LSB
318+
let msb_bytes_len = msb_len * size_of::<C::BucketType>();
319+
Self {
320+
config: c,
321+
msb: Cow::Borrowed(msb),
322+
lsb: BitVec::from_bytes(&buf[msb_bytes_len..]),
323+
}
324+
}
325+
326+
#[cfg(target_endian = "little")]
327+
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
328+
if self.msb.is_empty() {
329+
return Ok(0);
330+
}
331+
let msb_buckets = self.msb.deref();
332+
let msb_bytes = unsafe {
333+
std::slice::from_raw_parts(msb_buckets.as_ptr() as *const u8, size_of_val(msb_buckets))
334+
};
335+
writer.write_all(msb_bytes)?;
336+
let mut bytes_written = msb_bytes.len();
337+
bytes_written += self.lsb.write(writer)?;
338+
Ok(bytes_written)
339+
}
296340
}
297341

298342
/// Applies a repeated bit mask to the underlying filter.
@@ -611,50 +655,45 @@ mod tests {
611655

612656
// This helper exists in order to easily test serializing types with different
613657
// bucket types in the MSB sparse bit field representation. See tests below.
658+
#[cfg(target_endian = "little")]
614659
fn serialization_round_trip<C: GeoConfig<Diff> + Default>() {
615660
let mut rnd = rand::rngs::StdRng::from_os_rng();
616-
617661
// Run 100 simulations of random values being put into
618662
// a diff counter. "Serializing" to a vector to emulate
619663
// writing to a disk, and then deserializing and asserting
620664
// the filters are equal.
621665
for _ in 0..100 {
622666
let mut before = GeoDiffCount::<'_, C>::default();
623-
624-
// Select a random number of items to insert
667+
// Select a random number of items to insert.
625668
let items = (1..1000).choose(&mut rnd).unwrap();
626-
627669
for _ in 0..items {
628670
before.push_hash(rnd.next_u64());
629671
}
630-
631672
let mut writer = vec![];
632-
633-
// Insert some padding to emulate alignment issues with the slices
673+
// Insert some padding to emulate alignment issues with the slices.
634674
// A previous version of this test never panicked even though we were
635-
// violating the alignment preconditions for the `from_raw_parts` function
675+
// violating the alignment preconditions for the `from_raw_parts` function.
636676
let padding = [0_u8; 8];
637677
let pad_amount = (0..8).choose(&mut rnd).unwrap();
638678
writer.write_all(&padding[..pad_amount]).unwrap();
639-
640679
before.write(&mut writer).unwrap();
641-
642680
let after =
643681
GeoDiffCount::<'_, C>::from_bytes(before.config.clone(), &writer[pad_amount..]);
644-
645682
assert_eq!(before, after);
646683
}
647684
}
648685

649686
#[test]
687+
#[cfg(target_endian = "little")]
650688
fn test_serialization_round_trip_7() {
651-
// Uses a u16 for MSB buckets
689+
// Uses a u16 for MSB buckets.
652690
serialization_round_trip::<GeoDiffConfig7>();
653691
}
654692

655693
#[test]
694+
#[cfg(target_endian = "little")]
656695
fn test_serialization_round_trip_13() {
657-
// Uses a u32 for MSB buckets
696+
// Uses a u32 for MSB buckets.
658697
serialization_round_trip::<GeoDiffConfig13>();
659698
}
660699
}

crates/geo_filters/src/diff_count/bitvec.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,14 @@ impl BitVec<'_> {
147147
size_of_val(num_bits) + blocks.len() * size_of::<u64>()
148148
}
149149

150+
#[cfg(target_endian = "little")]
150151
pub fn from_bytes(mut buf: &[u8]) -> Self {
151152
if buf.is_empty() {
152153
return Self::default();
153154
}
154155

155156
// The first byte of the serialized BitVec is used to indicate how many
156-
// of the bits in the left-most byte are *unoccupied*.
157+
// of the bits in the left-most u64 block are *unoccupied*.
157158
// See [`BitVec::write`] implementation for how this is done.
158159
assert!(
159160
buf[0] < 64,
@@ -182,6 +183,7 @@ impl BitVec<'_> {
182183
Self { num_bits, blocks }
183184
}
184185

186+
#[cfg(target_endian = "little")]
185187
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
186188
if self.is_empty() {
187189
return Ok(0);

crates/geo_filters/src/diff_count/serde.rs

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments
 (0)