Skip to content

Commit 8973143

Browse files
committed
linter and comments
1 parent 2005354 commit 8973143

3 files changed

Lines changed: 14 additions & 30 deletions

File tree

crates/string-offsets/benchmarks/Cargo.toml.bak

Lines changed: 0 additions & 14 deletions
This file was deleted.

crates/string-offsets/benchmarks/performance.rs

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,12 @@ use rand::{rng, Rng};
33
use string_offsets::StringOffsets;
44

55
fn construction_benchmark(c: &mut Criterion) {
6-
// Generate random input string
76
let mut group = c.benchmark_group("construction");
87
for size in [1000, 10000, 100000] {
98
let mut rng = rng();
10-
let random_input: String = (0..size).map(|_| rng.random::<u8>() as char).collect();
11-
12-
// Create benchmark group for throughput measurement
13-
14-
// Set throughput based on input size in bytes
9+
// Generate random ASCII input (byte values in 32..128), so every char is a single UTF-8 byte.
10+
let random_input: String = (0..size).map(|_| rng.random_range(32u8..128) as char).collect();
1511
group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
16-
17-
// Run the benchmark
1812
group.bench_with_input(
1913
BenchmarkId::from_parameter(size),
2014
&random_input,

crates/string-offsets/src/lib.rs

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,14 @@ pub struct StringOffsets {
9595
/// the byte belongs.
9696
utf8_to_line: BitRank,
9797

98-
/// Encoded bitrank where the rank of a byte position corresponds to the char position to which
98+
/// Encoded bitrank where the first byte of every UTF-8 encoded code point is marked with a 1 bit.
99+
/// The rank of (byte position + 1) equals (char position + 1) of the char to which
99100
/// the byte belongs.
100101
utf8_to_char: BitRank,
101102

102-
/// Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word
103-
/// position to which the byte belongs.
103+
/// Encoded bitrank where every code point that needs two UTF-16 words (a surrogate pair) is marked with a 1 bit.
104+
/// Converting a byte position into a utf16 word position is achieved by combining utf8_to_char
105+
/// and utf8_to_utf16 rank information.
104106
utf8_to_utf16: BitRank,
105107

106108
/// Marks, for every line, whether it consists only of whitespace characters.
@@ -149,6 +151,11 @@ impl StringOffsets {
149151
self.line_begins.last().copied().unwrap_or(0) as usize
150152
}
151153

154+
/// Returns whether there are no bytes in the string.
155+
pub fn is_empty(&self) -> bool {
156+
self.line_begins.is_empty()
157+
}
158+
152159
/// Create a new converter to work with offsets into the given byte-string.
153160
///
154161
/// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
@@ -368,11 +375,8 @@ fn new_converter(content: &[u8]) -> StringOffsets {
368375
let mut line_begins = vec![0];
369376
let mut whitespace_only = vec![];
370377
let mut only_whitespaces = true; // true if all characters in the current line are whitespaces.
371-
for i in 0..content.len() {
372-
// In case of invalid utf8, we might get a utf8_len of 0.
373-
// In this case, we just treat the single byte character.
374-
// In principle, a single incorrect byte can break the whole decoding...
375-
let c = content[i];
378+
for (i, &c) in content.into_iter().enumerate() {
379+
// Note: We expect properly UTF-8 encoded input here! For invalid UTF-8, the results of the conversion are unspecified.
376380
if is_char_boundary(c) {
377381
utf8_builder.push(i);
378382
}

0 commit comments

Comments
 (0)