@@ -95,12 +95,14 @@ pub struct StringOffsets {
9595 /// the byte belongs.
9696 utf8_to_line : BitRank ,
9797
98- /// Encoded bitrank where the rank of a byte position corresponds to the char position to which
98+ /// Encoded bitrank where the start of a utf8 code point is marked with a 1 bit.
99+ /// The rank of a byte position + 1 corresponds to the char position + 1 to which
99100 /// the byte belongs.
100101 utf8_to_char : BitRank ,
101102
102- /// Encoded bitrank where the rank of a byte position corresponds to the UTF-16 encoded word
103- /// position to which the byte belongs.
103+ /// Encoded bitrank where a multi word utf16 code point is marked with a 1 bit.
104+ /// Converting a byte position into a utf16 word position is achieved by combining utf8_to_char
105+ /// and utf8_to_utf16 rank information.
104106 utf8_to_utf16 : BitRank ,
105107
106108 /// Marks, for every line, whether it consists only of whitespace characters.
@@ -149,6 +151,11 @@ impl StringOffsets {
149151 self . line_begins . last ( ) . copied ( ) . unwrap_or ( 0 ) as usize
150152 }
151153
154+ /// Returns whether there are no bytes in the string.
155+ pub fn is_empty ( & self ) -> bool {
156+ self . line_begins . is_empty ( )
157+ }
158+
152159 /// Create a new converter to work with offsets into the given byte-string.
153160 ///
154161 /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
@@ -368,11 +375,8 @@ fn new_converter(content: &[u8]) -> StringOffsets {
368375 let mut line_begins = vec ! [ 0 ] ;
369376 let mut whitespace_only = vec ! [ ] ;
370377 let mut only_whitespaces = true ; // true if all characters in the current line are whitespaces.
371- for i in 0 ..content. len ( ) {
372- // In case of invalid utf8, we might get a utf8_len of 0.
373- // In this case, we just treat the single byte character.
374- // In principle, a single incorrect byte can break the whole decoding...
375- let c = content[ i] ;
378+ for ( i, & c) in content. into_iter ( ) . enumerate ( ) {
379+ // Note: We expect here proper utf8 encoded strings! Otherwise, the conversion will have undefined behaviour.
376380 if is_char_boundary ( c) {
377381 utf8_builder. push ( i) ;
378382 }
0 commit comments