linter and comments

aneubeck · aneubeck · commit 897314383ef0 · 2025-03-26T08:55:28.000+01:00
diff --git a/crates/string-offsets/benchmarks/Cargo.toml.bak b/crates/string-offsets/benchmarks/Cargo.toml.bak
diff --git a/crates/string-offsets/benchmarks/performance.rs b/crates/string-offsets/benchmarks/performance.rs
@@ -3,18 +3,12 @@ use rand::{rng, Rng};
 use string_offsets::StringOffsets;
 
 fn construction_benchmark(c: &mut Criterion) {
-    // Generate random input string
     let mut group = c.benchmark_group("construction");
     for size in [1000, 10000, 100000] {
         let mut rng = rng();
-        let random_input: String = (0..size).map(|_| rng.random::<u8>() as char).collect();
-
-        // Create benchmark group for throughput measurement
-
-        // Set throughput based on input size in bytes
+        // Generate random ascii input.
+        let random_input: String = (0..size).map(|_| rng.random_range(32u8..128) as char).collect();
         group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
-
-        // Run the benchmark
         group.bench_with_input(
             BenchmarkId::from_parameter(size),
             &random_input,
diff --git a/crates/string-offsets/src/lib.rs b/crates/string-offsets/src/lib.rs
@@ -95,12 +95,14 @@ pub struct StringOffsets {
     /// the byte belongs.
     utf8_to_line: BitRank,
 
-    /// Encoded bitrank where the rank of a byte position corresponds to the char position to which
+    /// Encoded bitrank where the start of a utf8 code point is marked with a 1 bit.
+    /// The rank of a byte position + 1 corresponds to the char position + 1 to which
     /// the byte belongs.
     utf8_to_char: BitRank,
 
-    /// Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word
-    /// position to which the byte belongs.
+    /// Encoded bitrank where a multi word utf16 code point is marked with a 1 bit.
+    /// Converting a byte position into a utf16 word position is achieved by combining utf8_to_char
+    /// and utf8_to_utf16 rank information.
     utf8_to_utf16: BitRank,
 
     /// Marks, for every line, whether it consists only of whitespace characters.
@@ -149,6 +151,11 @@ impl StringOffsets {
         self.line_begins.last().copied().unwrap_or(0) as usize
     }
 
+    /// Returns whether there are no bytes in the string, i.e. the last line boundary is 0.
+    pub fn is_empty(&self) -> bool {
+        self.line_begins.last().copied().unwrap_or(0) == 0
+    }
+
     /// Create a new converter to work with offsets into the given byte-string.
     ///
     /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
@@ -368,11 +375,8 @@ fn new_converter(content: &[u8]) -> StringOffsets {
     let mut line_begins = vec![0];
     let mut whitespace_only = vec![];
     let mut only_whitespaces = true; // true if all characters in the current line are whitespaces.
-    for i in 0..content.len() {
-        // In case of invalid utf8, we might get a utf8_len of 0.
-        // In this case, we just treat the single byte character.
-        // In principle, a single incorrect byte can break the whole decoding...
-        let c = content[i];
+    for (i, &c) in content.into_iter().enumerate() {
+        // Note: We expect here proper utf8 encoded strings! Otherwise, the conversion will have undefined behaviour.
         if is_char_boundary(c) {
             utf8_builder.push(i);
         }