Skip to content

Commit c143f4f

Browse files
committed
Move serialization code to dedicated module
The changes split out serialization-related code from diff_count.rs and bitvec.rs into a new serde.rs module. This improves code organization by separating concerns.
1 parent 3e33988 commit c143f4f

3 files changed

Lines changed: 128 additions & 100 deletions

File tree

crates/geo_filters/src/diff_count.rs

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ use std::borrow::Cow;
44
use std::cmp::Ordering;
55
use std::hash::BuildHasher as _;
66
use std::mem::{size_of, size_of_val};
7-
use std::ops::Deref;
87

98
use crate::config::{
109
count_ones_from_bitchunks, count_ones_from_msb_and_lsb, iter_bit_chunks, iter_ones,
@@ -14,6 +13,7 @@ use crate::{Count, Diff};
1413

1514
mod bitvec;
1615
mod config;
16+
mod serde;
1717
mod sim_hash;
1818

1919
use bitvec::*;
@@ -87,49 +87,6 @@ impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
8787
}
8888
}
8989

90-
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
91-
if buf.is_empty() {
92-
return Self::new(c);
93-
}
94-
95-
// The number of most significant bits stored in the MSB sparse repr
96-
let msb_len = (buf.len() / size_of::<C::BucketType>()).min(c.max_msb_len());
97-
98-
let msb =
99-
unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) };
100-
101-
// The number of bytes representing the MSB - this is how many bytes we need to
102-
// skip over to reach the LSB
103-
let msb_bytes_len = msb_len * size_of::<C::BucketType>();
104-
105-
Self {
106-
config: c,
107-
msb: Cow::Borrowed(msb),
108-
lsb: BitVec::from_bytes(&buf[msb_bytes_len..]),
109-
}
110-
}
111-
112-
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
113-
if self.msb.is_empty() {
114-
return Ok(0);
115-
}
116-
117-
let msb_buckets = self.msb.deref();
118-
let msb_bytes = unsafe {
119-
std::slice::from_raw_parts(
120-
msb_buckets.as_ptr() as *const u8,
121-
msb_buckets.len() * size_of::<C::BucketType>(),
122-
)
123-
};
124-
writer.write_all(msb_bytes)?;
125-
126-
let mut bytes_written = msb_bytes.len();
127-
128-
bytes_written += self.lsb.write(writer)?;
129-
130-
Ok(bytes_written)
131-
}
132-
13390
/// `BitChunk`s can be processed much more efficiently than individual one bits!
13491
/// This function makes it possible to construct a GeoDiffCount instance directly from
13592
/// `BitChunk`s. It will extract the most significant bits first and then put the remainder

crates/geo_filters/src/diff_count/bitvec.rs

Lines changed: 55 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -34,62 +34,6 @@ impl PartialOrd for BitVec<'_> {
3434
}
3535

3636
impl BitVec<'_> {
37-
pub fn from_bytes(mut buf: &[u8]) -> Self {
38-
if buf.is_empty() {
39-
return Self::default();
40-
}
41-
42-
// The first byte of the serialized BitVec is used to indicate how many
43-
// of the bits in the last 64-bit block are *unoccupied*.
44-
// See [`BitVec::write`] implementation for how this is done.
45-
assert!(
46-
buf[0] < 64,
47-
"Number of unoccupied bits should be <64, got {}",
48-
buf[0]
49-
);
50-
51-
let num_bits = (buf.len() - 1) * 8 - buf[0] as usize;
52-
buf = &buf[1..];
53-
54-
assert_eq!(
55-
buf.len() % BYTES_PER_BLOCK,
56-
0,
57-
"buffer should be a multiple of 8 bytes, got {}",
58-
buf.len()
59-
);
60-
61-
let blocks = unsafe {
62-
std::mem::transmute(std::slice::from_raw_parts(
63-
buf.as_ptr(),
64-
buf.len() / BYTES_PER_BLOCK,
65-
))
66-
};
67-
let blocks = Cow::Borrowed(blocks);
68-
69-
Self { num_bits, blocks }
70-
}
71-
72-
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
73-
if self.is_empty() {
74-
return Ok(0);
75-
}
76-
77-
// First serialize the number of unoccupied bits in the last block as one byte.
78-
let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8;
79-
80-
writer.write_all(&[unoccupied_bits])?;
81-
82-
let blocks = self.blocks.deref();
83-
84-
let block_bytes = unsafe {
85-
std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK)
86-
};
87-
88-
writer.write_all(block_bytes)?;
89-
90-
Ok(block_bytes.len() + 1)
91-
}
92-
9337
/// Takes an iterator of `BitChunk` items as input and returns the corresponding `BitVec`.
9438
/// The order of `BitChunk`s doesn't matter for this function and `BitChunk` may be hitting
9539
/// the same block. In this case, the function will simply xor them together.
@@ -202,6 +146,61 @@ impl BitVec<'_> {
202146
let Self { num_bits, blocks } = self;
203147
size_of_val(num_bits) + blocks.len() * size_of::<u64>()
204148
}
149+
150+
pub fn from_bytes(mut buf: &[u8]) -> Self {
151+
if buf.is_empty() {
152+
return Self::default();
153+
}
154+
155+
// The first byte of the serialized BitVec is used to indicate how many
156+
// of the bits in the last 64-bit block are *unoccupied*.
157+
// See [`BitVec::write`] implementation for how this is done.
158+
assert!(
159+
buf[0] < 64,
160+
"Number of unoccupied bits should be <64, got {}",
161+
buf[0]
162+
);
163+
164+
let num_bits = (buf.len() - 1) * 8 - buf[0] as usize;
165+
buf = &buf[1..];
166+
167+
assert_eq!(
168+
buf.len() % BYTES_PER_BLOCK,
169+
0,
170+
"buffer should be a multiple of 8 bytes, got {}",
171+
buf.len()
172+
);
173+
174+
let blocks = unsafe {
175+
std::mem::transmute(std::slice::from_raw_parts(
176+
buf.as_ptr(),
177+
buf.len() / BYTES_PER_BLOCK,
178+
))
179+
};
180+
let blocks = Cow::Borrowed(blocks);
181+
182+
Self { num_bits, blocks }
183+
}
184+
185+
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
186+
if self.is_empty() {
187+
return Ok(0);
188+
}
189+
190+
// First serialize the number of unoccupied bits in the last block as one byte.
191+
let unoccupied_bits = 63 - ((self.num_bits - 1) % 64) as u8;
192+
193+
writer.write_all(&[unoccupied_bits])?;
194+
195+
let blocks = self.blocks.deref();
196+
let block_bytes = unsafe {
197+
std::slice::from_raw_parts(blocks.as_ptr() as *const u8, blocks.len() * BYTES_PER_BLOCK)
198+
};
199+
200+
writer.write_all(block_bytes)?;
201+
202+
Ok(block_bytes.len() + 1)
203+
}
205204
}
206205

207206
impl Index<usize> for BitVec<'_> {
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
//! Convert a [`GeoDiffCount`] to and from byte arrays.
2+
//!
3+
//! Since most of our target platforms are little endian, we take the cheapest approach
4+
//! for little endian platforms: just splatting the bytes into the writer. This is contrary
5+
//! to the usual "network endian" approach where big endian is the default, but most of our
6+
//! consumers are little endian so it makes sense for this to be the optimal approach.
7+
//!
8+
//! We still need to support big endian platforms, but that path is not implemented yet and currently panics.
9+
use std::{borrow::Cow, ops::Deref as _};
10+
11+
use crate::{config::GeoConfig, Diff};
12+
13+
use super::{bitvec::BitVec, GeoDiffCount};
14+
15+
impl<'a, C: GeoConfig<Diff>> GeoDiffCount<'a, C> {
16+
/// Create a new [`GeoDiffCount`] from a slice of bytes
17+
#[cfg(target_endian = "little")]
18+
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
19+
if buf.is_empty() {
20+
return Self::new(c);
21+
}
22+
23+
// The number of most significant bits stored in the MSB sparse repr
24+
let msb_len = (buf.len() / size_of::<C::BucketType>()).min(c.max_msb_len());
25+
26+
let msb =
27+
unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const C::BucketType, msb_len) };
28+
29+
// The number of bytes representing the MSB - this is how many bytes we need to
30+
// skip over to reach the LSB
31+
let msb_bytes_len = msb_len * size_of::<C::BucketType>();
32+
33+
Self {
34+
config: c,
35+
msb: Cow::Borrowed(msb),
36+
lsb: BitVec::from_bytes(&buf[msb_bytes_len..]),
37+
}
38+
}
39+
40+
/// Create a new [`GeoDiffCount`] from a slice of bytes
41+
#[cfg(target_endian = "big")]
42+
pub fn from_bytes(c: C, buf: &'a [u8]) -> Self {
43+
unimplemented!("not supported on big endian platforms")
44+
}
45+
46+
#[cfg(target_endian = "little")]
47+
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
48+
if self.msb.is_empty() {
49+
return Ok(0);
50+
}
51+
52+
let msb_buckets = self.msb.deref();
53+
let msb_bytes = unsafe {
54+
std::slice::from_raw_parts(
55+
msb_buckets.as_ptr() as *const u8,
56+
msb_buckets.len() * size_of::<C::BucketType>(),
57+
)
58+
};
59+
writer.write_all(msb_bytes)?;
60+
61+
let mut bytes_written = msb_bytes.len();
62+
63+
bytes_written += self.lsb.write(writer)?;
64+
65+
Ok(bytes_written)
66+
}
67+
68+
#[cfg(target_endian = "big")]
69+
pub fn write<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<usize> {
70+
unimplemented!("not supported on big endian platforms")
71+
}
72+
}

0 commit comments

Comments
 (0)