1616// under the License.
1717
1818//! Regex expressions
19+ use memchr:: memchr;
20+
1921use arrow:: array:: ArrayDataBuilder ;
2022use arrow:: array:: BufferBuilder ;
2123use arrow:: array:: GenericStringArray ;
@@ -199,6 +201,22 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
199201 . into_owned ( )
200202}
201203
204+ /// For anchored patterns like `^...(capture)....*$` where the replacement
205+ /// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
206+ /// `captures_read` with `CaptureLocations` for direct extraction — no
207+ /// `expand()`, no `String` allocation.
208+ fn try_build_short_extract_regex ( pattern : & str , replacement : & str ) -> Option < Regex > {
209+ if replacement != "${1}" || !pattern. starts_with ( '^' ) || !pattern. ends_with ( ".*$" ) {
210+ return None ;
211+ }
212+ let short = & pattern[ ..pattern. len ( ) - 3 ] ;
213+ let re = Regex :: new ( short) . ok ( ) ?;
214+ if re. captures_len ( ) != 2 {
215+ return None ;
216+ }
217+ Some ( re)
218+ }
219+
202220/// Replaces substring(s) matching a PCRE-like regular expression.
203221///
204222/// The full list of supported features and syntax can be found at
@@ -457,6 +475,14 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
457475 // with rust ones.
458476 let replacement = regex_replace_posix_groups ( replacement) ;
459477
478+ // For anchored patterns like ^...(capture)....*$, build a shorter
479+ // regex and use captures_read for direct extraction.
480+ let short_re = if limit == 1 {
481+ try_build_short_extract_regex ( & pattern, & replacement)
482+ } else {
483+ None
484+ } ;
485+
460486 let string_array_type = args[ 0 ] . data_type ( ) ;
461487 match string_array_type {
462488 DataType :: Utf8 | DataType :: LargeUtf8 => {
@@ -473,13 +499,37 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
473499 let mut new_offsets = BufferBuilder :: < T > :: new ( string_array. len ( ) + 1 ) ;
474500 new_offsets. append ( T :: zero ( ) ) ;
475501
476- string_array. iter ( ) . for_each ( |val| {
477- if let Some ( val) = val {
478- let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
479- vals. append_slice ( result. as_bytes ( ) ) ;
480- }
481- new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
482- } ) ;
502+ if let Some ( ref short_re) = short_re {
503+ let mut locs = short_re. capture_locations ( ) ;
504+ string_array. iter ( ) . for_each ( |val| {
505+ if let Some ( val) = val {
506+ if short_re. captures_read ( & mut locs, val) . is_some ( ) {
507+ let match_end = locs. get ( 0 ) . unwrap ( ) . 1 ;
508+ if memchr ( b'\n' , val[ match_end..] . as_bytes ( ) ) . is_none ( ) {
509+ if let Some ( ( start, end) ) = locs. get ( 1 ) {
510+ vals. append_slice ( & val. as_bytes ( ) [ start..end] ) ;
511+ }
512+ } else {
513+ // Newline in remainder: .*$ wouldn't match without 's' flag
514+ let result =
515+ re. replacen ( val, limit, replacement. as_str ( ) ) ;
516+ vals. append_slice ( result. as_bytes ( ) ) ;
517+ }
518+ } else {
519+ vals. append_slice ( val. as_bytes ( ) ) ;
520+ }
521+ }
522+ new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
523+ } ) ;
524+ } else {
525+ string_array. iter ( ) . for_each ( |val| {
526+ if let Some ( val) = val {
527+ let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
528+ vals. append_slice ( result. as_bytes ( ) ) ;
529+ }
530+ new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
531+ } ) ;
532+ }
483533
484534 let data = ArrayDataBuilder :: new ( GenericStringArray :: < T > :: DATA_TYPE )
485535 . len ( string_array. len ( ) )
@@ -494,12 +544,39 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
494544
495545 let mut builder = StringViewBuilder :: with_capacity ( string_view_array. len ( ) ) ;
496546
497- for val in string_view_array. iter ( ) {
498- if let Some ( val) = val {
499- let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
500- builder. append_value ( result) ;
501- } else {
502- builder. append_null ( ) ;
547+ if let Some ( ref short_re) = short_re {
548+ let mut locs = short_re. capture_locations ( ) ;
549+ for val in string_view_array. iter ( ) {
550+ if let Some ( val) = val {
551+ if short_re. captures_read ( & mut locs, val) . is_some ( ) {
552+ let match_end = locs. get ( 0 ) . unwrap ( ) . 1 ;
553+ if memchr ( b'\n' , val[ match_end..] . as_bytes ( ) ) . is_none ( ) {
554+ if let Some ( ( start, end) ) = locs. get ( 1 ) {
555+ builder. append_value ( & val[ start..end] ) ;
556+ } else {
557+ builder. append_value ( "" ) ;
558+ }
559+ } else {
560+ // Newline in remainder: .*$ wouldn't match without 's' flag
561+ let result =
562+ re. replacen ( val, limit, replacement. as_str ( ) ) ;
563+ builder. append_value ( result) ;
564+ }
565+ } else {
566+ builder. append_value ( val) ;
567+ }
568+ } else {
569+ builder. append_null ( ) ;
570+ }
571+ }
572+ } else {
573+ for val in string_view_array. iter ( ) {
574+ if let Some ( val) = val {
575+ let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
576+ builder. append_value ( result) ;
577+ } else {
578+ builder. append_null ( ) ;
579+ }
503580 }
504581 }
505582
0 commit comments