@@ -4,6 +4,7 @@ use std::borrow::Cow;
44use std:: cmp:: Ordering ;
55use std:: hash:: BuildHasher as _;
66use std:: mem:: { size_of, size_of_val} ;
7+ use std:: ops:: Deref as _;
78
89use crate :: config:: {
910 count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones,
@@ -13,7 +14,6 @@ use crate::{Count, Diff};
1314
1415mod bitvec;
1516mod config;
16- mod serde;
1717mod sim_hash;
1818
1919use bitvec:: * ;
@@ -216,16 +216,11 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
216216 let msb = self . msb . to_mut ( ) ;
217217 match msb. binary_search_by ( |k| bucket. cmp ( k) ) {
218218 Ok ( idx) => {
219- // The bit is already set in the MSB sparse bitset, remove it (XOR)
220219 msb. remove ( idx) ;
221-
222- // We have removed a value from our MSB, move a value in the
223- // LSB into the MSB
224220 let ( first, second) = {
225221 let mut lsb = iter_ones ( self . lsb . bit_chunks ( ) . peekable ( ) ) ;
226222 ( lsb. next ( ) , lsb. next ( ) )
227223 } ;
228-
229224 let new_smallest = if let Some ( smallest) = first {
230225 msb. push ( C :: BucketType :: from_usize ( smallest) ) ;
231226 second. map ( |_| smallest) . unwrap_or ( 0 )
@@ -243,12 +238,10 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
243238 . pop ( )
244239 . expect ( "we should have at least one element!" )
245240 . into_usize ( ) ;
246-
247241 let new_smallest = msb
248242 . last ( )
249243 . expect ( "should have at least one element" )
250244 . into_usize ( ) ;
251-
252245 // ensure LSB bit vector has the space for `smallest`
253246 self . lsb . resize ( new_smallest) ;
254247 self . lsb . toggle ( smallest) ;
@@ -293,6 +286,57 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
293286 self . lsb. num_bits( ) ,
294287 ) ;
295288 }
289+
290+ // Serialization:
291+ //
292+ // Since most of our target platforms are little endian there are more optimised approaches
293+ // for little endian platforms, just splatting the bytes into the writer. This is contrary
294+ // to the usual "network endian" approach where big endian is the default, but most of our
295+ // consumers are little endian so it makes sense for this to be the optimal approach.
296+ //
297+ // For now we do not support big endian platforms. In the future we might add a big endian
298+ // platform specific implementation which is able to read the little endian serialized
299+ // representation. For now, if you attempt to serialize a filter on a big endian platform
300+ // you get a panic.
301+
302+ /// Create a new [`GeoDiffCount`] from a slice of bytes
303+ #[ cfg( target_endian = "little" ) ]
304+ pub fn from_bytes ( c : C , buf : & ' a [ u8 ] ) -> Self {
305+ if buf. is_empty ( ) {
306+ return Self :: new ( c) ;
307+ }
308+ // The number of most significant bits stores in the MSB sparse repr
309+ let msb_len = ( buf. len ( ) / size_of :: < C :: BucketType > ( ) ) . min ( c. max_msb_len ( ) ) ;
310+ let msb = unsafe {
311+ std:: mem:: transmute :: < & [ u8 ] , & [ C :: BucketType ] > ( std:: slice:: from_raw_parts (
312+ buf. as_ptr ( ) ,
313+ msb_len,
314+ ) )
315+ } ;
316+ // The number of bytes representing the MSB - this is how many bytes we need to
317+ // skip over to reach the LSB
318+ let msb_bytes_len = msb_len * size_of :: < C :: BucketType > ( ) ;
319+ Self {
320+ config : c,
321+ msb : Cow :: Borrowed ( msb) ,
322+ lsb : BitVec :: from_bytes ( & buf[ msb_bytes_len..] ) ,
323+ }
324+ }
325+
326+ #[ cfg( target_endian = "little" ) ]
327+ pub fn write < W : std:: io:: Write > ( & self , writer : & mut W ) -> std:: io:: Result < usize > {
328+ if self . msb . is_empty ( ) {
329+ return Ok ( 0 ) ;
330+ }
331+ let msb_buckets = self . msb . deref ( ) ;
332+ let msb_bytes = unsafe {
333+ std:: slice:: from_raw_parts ( msb_buckets. as_ptr ( ) as * const u8 , size_of_val ( msb_buckets) )
334+ } ;
335+ writer. write_all ( msb_bytes) ?;
336+ let mut bytes_written = msb_bytes. len ( ) ;
337+ bytes_written += self . lsb . write ( writer) ?;
338+ Ok ( bytes_written)
339+ }
296340}
297341
298342/// Applies a repeated bit mask to the underlying filter.
@@ -611,50 +655,45 @@ mod tests {
611655
612656 // This helper exists in order to easily test serializing types with different
613657 // bucket types in the MSB sparse bit field representation. See tests below.
658+ #[ cfg( target_endian = "little" ) ]
614659 fn serialization_round_trip < C : GeoConfig < Diff > + Default > ( ) {
615660 let mut rnd = rand:: rngs:: StdRng :: from_os_rng ( ) ;
616-
617661 // Run 100 simulations of random values being put into
618662 // a diff counter. "Serializing" to a vector to emulate
619663 // writing to a disk, and then deserializing and asserting
620664 // the filters are equal.
621665 for _ in 0 ..100 {
622666 let mut before = GeoDiffCount :: < ' _ , C > :: default ( ) ;
623-
624- // Select a random number of items to insert
667+ // Select a random number of items to insert.
625668 let items = ( 1 ..1000 ) . choose ( & mut rnd) . unwrap ( ) ;
626-
627669 for _ in 0 ..items {
628670 before. push_hash ( rnd. next_u64 ( ) ) ;
629671 }
630-
631672 let mut writer = vec ! [ ] ;
632-
633- // Insert some padding to emulate alignment issues with the slices
673+ // Insert some padding to emulate alignment issues with the slices.
634674 // A previous version of this test never panicked even though we were
635- // violating the alignment preconditions for the `from_raw_parts` function
675+ // violating the alignment preconditions for the `from_raw_parts` function.
636676 let padding = [ 0_u8 ; 8 ] ;
637677 let pad_amount = ( 0 ..8 ) . choose ( & mut rnd) . unwrap ( ) ;
638678 writer. write_all ( & padding[ ..pad_amount] ) . unwrap ( ) ;
639-
640679 before. write ( & mut writer) . unwrap ( ) ;
641-
642680 let after =
643681 GeoDiffCount :: < ' _ , C > :: from_bytes ( before. config . clone ( ) , & writer[ pad_amount..] ) ;
644-
645682 assert_eq ! ( before, after) ;
646683 }
647684 }
648685
649686 #[ test]
687+ #[ cfg( target_endian = "little" ) ]
650688 fn test_serialization_round_trip_7 ( ) {
651- // Uses a u16 for MSB buckets
689+ // Uses a u16 for MSB buckets.
652690 serialization_round_trip :: < GeoDiffConfig7 > ( ) ;
653691 }
654692
655693 #[ test]
694+ #[ cfg( target_endian = "little" ) ]
656695 fn test_serialization_round_trip_13 ( ) {
657- // Uses a u32 for MSB buckets
696+ // Uses a u32 for MSB buckets.
658697 serialization_round_trip :: < GeoDiffConfig13 > ( ) ;
659698 }
660699}
0 commit comments