From b1b8de5d2e8f34a650f2980c7b2c468b4f7af02c Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Wed, 20 May 2026 08:08:40 +0100 Subject: [PATCH 1/2] Fix RegExp lookbehind direction Implement backward-direction matching for lookbehind subpatterns so captures, backreferences, nested lookaround, and sticky/global matches follow ECMAScript lookbehind semantics. Closes #612 --- source/units/Goccia.RegExp.Compiler.pas | 78 +++++++++++- source/units/Goccia.RegExp.VM.pas | 156 +++++++++++++++++------ tests/built-ins/RegExp/prototype/exec.js | 53 ++++++++ 3 files changed, 242 insertions(+), 45 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index e65d7c35..70c0b340 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -79,6 +79,12 @@ TRegExpStringSequence = record CodePoints: array of Cardinal; end; + TRegExpTermCode = record + Code: array of UInt32; + Length: Integer; + OriginalStart: Integer; + end; + TRegExpClassContents = record Ranges: array of TRegExpCharRange; RangeCount: Integer; @@ -101,6 +107,7 @@ TRegExpCompiler = class FUnicode: Boolean; FUnicodeSets: Boolean; FPendingCodeUnit: Integer; + FBackward: Boolean; function Peek: Char; function PeekAt(AOffset: Integer): Char; function AtEnd: Boolean; @@ -274,6 +281,7 @@ constructor TRegExpCompiler.Create(const APattern, AFlags: string); FUnicodeSets := HasRegExpFlag(AFlags, 'v'); FUnicode := HasRegExpFlag(AFlags, 'u') or FUnicodeSets; FPendingCodeUnit := -1; + FBackward := False; end; function TRegExpCompiler.Peek: Char; @@ -1995,6 +2003,7 @@ procedure TRegExpCompiler.CompileModifierGroup; procedure TRegExpCompiler.CompileGroup; var SaveAltDepth: Integer; + SavedBackward: Boolean; GroupName: string; CaptureIdx, I: Integer; SplitHole, JumpHole: Integer; @@ -2017,8 +2026,11 @@ procedure TRegExpCompiler.CompileGroup; begin SplitHole := EmitHole; FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0); + SavedBackward := 
FBackward; + FBackward := False; LookStart := CurrentPC; CompileDisjunction; + FBackward := SavedBackward; if not Match(')') then raise EConvertError.Create('Unterminated lookahead'); Emit(EncodeOp(RX_MATCH)); @@ -2029,7 +2041,10 @@ procedure TRegExpCompiler.CompileGroup; begin SplitHole := EmitHole; FCode[SplitHole] := EncodeOpBx(RX_LOOKAHEAD, 0); + SavedBackward := FBackward; + FBackward := False; CompileDisjunction; + FBackward := SavedBackward; if not Match(')') then raise EConvertError.Create('Unterminated negative lookahead'); Emit(EncodeOp(RX_MATCH)); @@ -2042,7 +2057,10 @@ procedure TRegExpCompiler.CompileGroup; begin SplitHole := EmitHole; FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0); + SavedBackward := FBackward; + FBackward := True; CompileDisjunction; + FBackward := SavedBackward; if not Match(')') then raise EConvertError.Create('Unterminated lookbehind'); Emit(EncodeOp(RX_MATCH)); @@ -2053,7 +2071,10 @@ procedure TRegExpCompiler.CompileGroup; begin SplitHole := EmitHole; FCode[SplitHole] := EncodeOpBx(RX_LOOKBEHIND, 0); + SavedBackward := FBackward; + FBackward := True; CompileDisjunction; + FBackward := SavedBackward; if not Match(')') then raise EConvertError.Create('Unterminated negative lookbehind'); Emit(EncodeOp(RX_MATCH)); @@ -2065,11 +2086,18 @@ procedure TRegExpCompiler.CompileGroup; GroupName := ParseGroupName; Inc(FCaptureCount); CaptureIdx := FCaptureCount; - Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)); + // Backward capture groups still store source-order [start, end] slots. 
+ if FBackward then + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)) + else + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)); CompileDisjunction; if not Match(')') then raise EConvertError.Create('Unterminated named capture group'); - Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)); + if FBackward then + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)) + else + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)); end; end else if CharInSet(Peek, ['i', 'm', 's', '-']) then @@ -2083,11 +2111,18 @@ procedure TRegExpCompiler.CompileGroup; begin Inc(FCaptureCount); CaptureIdx := FCaptureCount; - Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)); + // Backward capture groups still store source-order [start, end] slots. + if FBackward then + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)) + else + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)); CompileDisjunction; if not Match(')') then raise EConvertError.Create('Unterminated capturing group'); - Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)); + if FBackward then + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2)) + else + Emit(EncodeOpBx(RX_SAVE, CaptureIdx * 2 + 1)); end; if FAltStackDepth > 0 then Dec(FAltStackDepth); @@ -2172,6 +2207,9 @@ procedure TRegExpCompiler.EmitBodyAt(const ABody: array of UInt32; Bx: Integer; NegFlag: Integer; begin + if ALen <= 0 then + Exit; + EnsureCodeCapacity(ALen); DstStart := FCodeLen; Move(ABody[0], FCode[DstStart], ALen * SizeOf(UInt32)); @@ -2314,10 +2352,42 @@ procedure TRegExpCompiler.CompileTerm; CompileQuantifier(AtomStart); end; +// ES2026 §22.2.2.3.4 MatchSequence(m1, m2, direction) procedure TRegExpCompiler.CompileAlternative; +var + TermStart: Integer; + TermCount: Integer; + TermIndex: Integer; + Terms: array of TRegExpTermCode; begin + if not FBackward then + begin + while not AtEnd and (Peek <> '|') and (Peek <> ')') do + CompileTerm; + Exit; + end; + + TermCount := 0; + SetLength(Terms, 8); while not AtEnd and (Peek <> '|') and (Peek <> ')') do + begin + TermStart := CurrentPC; CompileTerm; + if TermCount >= 
Length(Terms) then + SetLength(Terms, TermCount * 2 + 8); + Terms[TermCount].Length := CurrentPC - TermStart; + Terms[TermCount].OriginalStart := TermStart; + SetLength(Terms[TermCount].Code, Terms[TermCount].Length); + if Terms[TermCount].Length > 0 then + Move(FCode[TermStart], Terms[TermCount].Code[0], + Terms[TermCount].Length * SizeOf(UInt32)); + FCodeLen := TermStart; + Inc(TermCount); + end; + + for TermIndex := TermCount - 1 downto 0 do + EmitBodyAt(Terms[TermIndex].Code, Terms[TermIndex].Length, + Terms[TermIndex].OriginalStart); end; procedure TRegExpCompiler.InsertSplitAt(APos: Integer); diff --git a/source/units/Goccia.RegExp.VM.pas b/source/units/Goccia.RegExp.VM.pas index 3e0af2c0..8a3dfb83 100644 --- a/source/units/Goccia.RegExp.VM.pas +++ b/source/units/Goccia.RegExp.VM.pas @@ -32,7 +32,6 @@ implementation MIN_STEP_LIMIT = 10000000; STEPS_PER_INPUT_BYTE = 100; DEFAULT_BACKTRACK_CAP = 10000000; - MAX_LOOKBEHIND_DISTANCE = 256; MEMO_CAPACITY = 65536; MEMO_LOAD_LIMIT = 49152; HIGH_SURROGATE_START = $D800; @@ -265,6 +264,33 @@ function GetCodePointBefore(const AInput: TRegExpInput; APos: Integer; Result := True; end; +function ReadInputCodePointBefore(const AInput: TRegExpInput; APos: Integer; + const AUnicode: Boolean; out ACodePoint: Cardinal; + out AWidth: Integer): Boolean; +var + CodeUnit: Cardinal; +begin + Result := False; + ACodePoint := 0; + AWidth := 0; + if APos <= 0 then + Exit; + + CodeUnit := AInput.Units[APos - 1]; + if AUnicode and IsLowSurrogate(CodeUnit) and (APos >= 2) and + IsHighSurrogate(AInput.Units[APos - 2]) then + begin + ACodePoint := SurrogatePairToCodePoint(AInput.Units[APos - 2], CodeUnit); + AWidth := 2; + end + else + begin + ACodePoint := CodeUnit; + AWidth := 1; + end; + Result := True; +end; + function AdvanceInputIndex(const AInput: TRegExpInput; const AIndex: Integer; const AUnicode: Boolean): Integer; var @@ -294,7 +320,7 @@ function NormalizeInputIndex(const AInput: TRegExpInput; const AIndex: Integer; function 
RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; AStartPos: Integer; var ASlots: array of Integer; ASlotCount: Integer; AStartPC: Integer = 0; - AEndPos: PInteger = nil): Boolean; + AEndPos: PInteger = nil; ABackward: Boolean = False): Boolean; var PC, InputPos: Integer; Instr: UInt32; @@ -308,7 +334,6 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; StepLimit: Integer; Memo: TMemoTable; SlotCount: Integer; - I: Integer; MatchCP: Cardinal; BeforeCP: Cardinal; BeforeIsWord, AfterIsWord: Boolean; @@ -320,6 +345,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; LookSlots: array of Integer; LookMatched: Boolean; RefStart, RefEnd, RefPos: Integer; + ComparePos: Integer; RefCP, InputCP: Cardinal; RefByteLen, InputByteLen: Integer; @@ -379,8 +405,18 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; case Op of RX_CHAR: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FullUnicode, - CodePoint, ByteLen) then + if ABackward then + begin + if not ReadInputCodePointBefore(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else if not ReadInputCodePoint(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; @@ -393,14 +429,27 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; if not PopBacktrack then Exit; Continue; end; - Inc(InputPos, ByteLen); + if ABackward then + Dec(InputPos, ByteLen) + else + Inc(InputPos, ByteLen); Inc(PC); end; RX_CHAR_CLASS: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FullUnicode, - CodePoint, ByteLen) then + if ABackward then + begin + if not ReadInputCodePointBefore(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + 
Continue; + end; + end + else if not ReadInputCodePoint(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; @@ -412,14 +461,27 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; if not PopBacktrack then Exit; Continue; end; - Inc(InputPos, ByteLen); + if ABackward then + Dec(InputPos, ByteLen) + else + Inc(InputPos, ByteLen); Inc(PC); end; RX_CHAR_CLASS_NEG: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FullUnicode, - CodePoint, ByteLen) then + if ABackward then + begin + if not ReadInputCodePointBefore(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else if not ReadInputCodePoint(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; @@ -431,14 +493,27 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; if not PopBacktrack then Exit; Continue; end; - Inc(InputPos, ByteLen); + if ABackward then + Dec(InputPos, ByteLen) + else + Inc(InputPos, ByteLen); Inc(PC); end; RX_ANY: begin - if not ReadInputCodePoint(AInput, InputPos, AProgram.FullUnicode, - CodePoint, ByteLen) then + if ABackward then + begin + if not ReadInputCodePointBefore(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end + else if not ReadInputCodePoint(AInput, InputPos, + AProgram.FullUnicode, CodePoint, ByteLen) then begin MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; @@ -450,7 +525,10 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; if not PopBacktrack then Exit; Continue; end; - Inc(InputPos, ByteLen); + if ABackward then + Dec(InputPos, ByteLen) + else + Inc(InputPos, ByteLen); Inc(PC); end; @@ -513,10 
+591,20 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; Inc(PC); Continue; end; + RefEnd := ASlots[BackrefGroup * 2 + 1]; + ComparePos := InputPos; + if ABackward then + begin + ComparePos := InputPos - (RefEnd - RefStart); + if ComparePos < 0 then + begin + MemoAdd(Memo, PC, InputPos); + if not PopBacktrack then Exit; + Continue; + end; + end; RefPos := RefStart; LookMatched := True; - RefEnd := ASlots[BackrefGroup * 2 + 1]; - I := InputPos; while RefPos < RefEnd do begin if not ReadInputCodePoint(AInput, RefPos, BackrefUnicode, @@ -525,7 +613,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; LookMatched := False; Break; end; - if not ReadInputCodePoint(AInput, InputPos, BackrefUnicode, + if not ReadInputCodePoint(AInput, ComparePos, BackrefUnicode, InputCP, InputByteLen) then begin LookMatched := False; @@ -546,15 +634,18 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; end; end; Inc(RefPos, RefByteLen); - Inc(InputPos, InputByteLen); + Inc(ComparePos, InputByteLen); end; if not LookMatched then begin - InputPos := I; MemoAdd(Memo, PC, InputPos); if not PopBacktrack then Exit; Continue; end; + if ABackward then + InputPos := InputPos - (RefEnd - RefStart) + else + InputPos := ComparePos; Inc(PC); end; @@ -652,7 +743,7 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; SetLength(LookSlots, SlotCount); Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, - SlotCount, PC + 1); + SlotCount, PC + 1, nil, False); if Negated then begin if LookMatched then @@ -679,27 +770,10 @@ function RunVM(const AProgram: TRegExpProgram; const AInput: TRegExpInput; begin Negated := (Bx and LOOK_NEGATED_FLAG) <> 0; LookEnd := Bx and LOOK_TARGET_MASK; - LookMatched := False; SetLength(LookSlots, SlotCount); - I := InputPos - 1; - RefStart := I - MAX_LOOKBEHIND_DISTANCE; - if RefStart < 0 then - RefStart := 
0; - while I >= RefStart do - begin - Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); - RefEnd := 0; - if RunVM(AProgram, AInput, I, LookSlots, SlotCount, PC + 1, - @RefEnd) then - begin - if RefEnd = InputPos then - begin - LookMatched := True; - Break; - end; - end; - Dec(I); - end; + Move(ASlots[0], LookSlots[0], SlotCount * SizeOf(Integer)); + LookMatched := RunVM(AProgram, AInput, InputPos, LookSlots, + SlotCount, PC + 1, nil, True); if Negated then begin if LookMatched then diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 440aa58b..2a349e5f 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -402,3 +402,56 @@ test("lookbehind does not consume input", () => { expect(m[0]).toBe("b"); expect(m.index).toBe(1); }); + +test("lookbehind captures use backward matching order", () => { + const repeated = "abcdef".match(/(?<=(\w){3})def/); + expect(repeated[0]).toBe("def"); + expect(repeated[1]).toBe("a"); + + const greedy = "abbbbbbc".match(/(?<=(b+))c/); + expect(greedy[0]).toBe("c"); + expect(greedy[1]).toBe("bbbbbb"); +}); + +test("lookbehind captures are visible to backreferences inside lookbehind", () => { + const forwardReference = "ababc".match(/(?<=\1(\w+))c/); + expect(forwardReference[0]).toBe("c"); + expect(forwardReference[1]).toBe("ab"); + + const mutualReference = /(?<=a(.\2)b(\1)).{4}/.exec("aabcacbc"); + expect(mutualReference[0]).toBe("cacb"); + expect(mutualReference[1]).toBe("a"); + expect(mutualReference[2]).toBe(""); +}); + +test("negative lookbehind keeps captures unset", () => { + const result = "abcdef".match(/(? { + const nested = "abcdef".match(/(?<=a(?=([^a]{2})d)\w{3})\w\w/); + expect(nested[0]).toBe("ef"); + expect(nested[1]).toBe("bc"); + + expect("abcdef".match(/(?<=a(?=([bc]{2}(? 
{ + expect("ab\ncd\nefg".match(/(?<=^)\w+/gm)).toEqual(["ab", "cd", "efg"]); + expect("abc def".match(/(?<=\b)[d-f]{3}/)[0]).toBe("def"); +}); + +test("lookbehind works with variable-length and sticky matches", () => { + expect("abcdef".match(/(?<=\w*)[^a|b|c]{3}/)[0]).toBe("def"); + + const sticky = /(?<=^(\w+))def/g; + const first = sticky.exec("abcdefdef"); + expect(first[0]).toBe("def"); + expect(first[1]).toBe("abc"); + + const second = sticky.exec("abcdefdef"); + expect(second[0]).toBe("def"); + expect(second[1]).toBe("abcdef"); +}); From 59f5b6d7991f3b3112c4251baaa2a763cdc50f6a Mon Sep 17 00:00:00 2001 From: Johannes Stein Date: Wed, 20 May 2026 10:00:27 +0100 Subject: [PATCH 2/2] Address RegExp lookbehind review comments - Emit unicode set string atoms backward inside lookbehind bodies. - Exercise sticky lookbehind with the sticky flag and explicit lastIndex. --- source/units/Goccia.RegExp.Compiler.pas | 12 ++++++++++-- tests/built-ins/RegExp/prototype/exec.js | 8 +++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/source/units/Goccia.RegExp.Compiler.pas b/source/units/Goccia.RegExp.Compiler.pas index 47897161..c496b923 100644 --- a/source/units/Goccia.RegExp.Compiler.pas +++ b/source/units/Goccia.RegExp.Compiler.pas @@ -1538,8 +1538,16 @@ procedure TRegExpCompiler.EmitClassContents( SplitHoles[SplitCount] := CurrentPC; Inc(SplitCount); Emit(EncodeOpBx(RX_SPLIT, 0)); - for J := 0 to High(AContents.Strings[I].CodePoints) do - EmitCharMatch(AContents.Strings[I].CodePoints[J]); + if FBackward then + begin + for J := High(AContents.Strings[I].CodePoints) downto 0 do + EmitCharMatch(AContents.Strings[I].CodePoints[J]); + end + else + begin + for J := 0 to High(AContents.Strings[I].CodePoints) do + EmitCharMatch(AContents.Strings[I].CodePoints[J]); + end; JumpHoles[JumpCount] := CurrentPC; Inc(JumpCount); Emit(EncodeOpBx(RX_JUMP, 0)); diff --git a/tests/built-ins/RegExp/prototype/exec.js b/tests/built-ins/RegExp/prototype/exec.js index 
2a349e5f..50ed0c75 100644 --- a/tests/built-ins/RegExp/prototype/exec.js +++ b/tests/built-ins/RegExp/prototype/exec.js @@ -446,7 +446,8 @@ test("lookbehind supports multiline anchors and word boundaries", () => { test("lookbehind works with variable-length and sticky matches", () => { expect("abcdef".match(/(?<=\w*)[^a|b|c]{3}/)[0]).toBe("def"); - const sticky = /(?<=^(\w+))def/g; + const sticky = /(?<=^(\w+))def/y; + sticky.lastIndex = 3; const first = sticky.exec("abcdefdef"); expect(first[0]).toBe("def"); expect(first[1]).toBe("abc"); @@ -455,3 +456,8 @@ test("lookbehind works with variable-length and sticky matches", () => { expect(second[0]).toBe("def"); expect(second[1]).toBe("abcdef"); }); + +test("lookbehind unicode set strings match backward", () => { + expect(/(?<=[\q{ab}])c/v.test("abc")).toBe(true); + expect(/(?<=[\q{ab}])c/v.test("ac")).toBe(false); +});