Skip to content

Commit 7a15654

Browse files
committed
refactor(regexps): bind escapeRegExp to native RegExp.escape when available
Match the TC39 RegExp.escape spec (§22.2.5.1, Stage 4, Node 24+ / V8 13.7) so callers can safely concatenate escaped output into any regex-Pattern position, including: - Leading `[0-9A-Za-z]` → `\xHH` (guards against `\0..\9` / `\c` merging in a surrounding pattern). - SyntaxCharacter + `/` → backslash prefix. - ControlEscape (`\t\n\v\f\r`) → literal letter forms. - otherPunctuators (`,-=<>#&!%:;@~'`+backtick+`"`) → `\xHH`. - Whitespace / LineTerminator / lone surrogates → `\xHH` or `\uXXXX`. The previous implementation escaped only the SyntaxCharacter set (plus `-` added in the prior release), leaving output unsafe against leading-identifier merging or splicing into `/.../` literals. The new binding prefers native `RegExp.escape` when the `typeof` check passes, otherwise falls back to a spec-compliant implementation. Diffed fallback output against native across ASCII 0-127 plus selected non-ASCII (NBSP, ZWNBSP, LS, PS, surrogates): zero differences. Tests rewritten to assert spec-shape invariants (leading-letter `\xHH`, `/` escape, ControlEscape, otherPunctuators `\xHH`) plus behavior-level round-trips. In-repo caller `packages/normalize.ts` verified: its `REGISTRY_SCOPE_DELIMITER='__'` / first-char `'_'` produce identical escaped output under both old and new rules. Reference: https://tc39.es/ecma262/#sec-regexp.escape
1 parent 47afc7b commit 7a15654

3 files changed

Lines changed: 232 additions & 101 deletions

File tree

docs/api-index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Each entry links to the source module and shows the first sentence of its `@file
4040
| [`@socketsecurity/lib/process-lock`](../src/process-lock.ts) | Process locking utilities with stale detection and exit cleanup. |
4141
| [`@socketsecurity/lib/promise-queue`](../src/promise-queue.ts) | Bounded concurrency promise queue. |
4242
| [`@socketsecurity/lib/promises`](../src/promises.ts) | Promise utilities including chunked iteration and timers. |
43-
| [`@socketsecurity/lib/regexps`](../src/regexps.ts) | Regular expression utilities including escape-string-regexp implementation. |
43+
| [`@socketsecurity/lib/regexps`](../src/regexps.ts) | Regular expression utilities including a spec-compliant `RegExp.escape` fallback. |
4444
| [`@socketsecurity/lib/sea`](../src/sea.ts) | SEA (Single Executable Application) detection utilities for Socket ecosystem. |
4545
| [`@socketsecurity/lib/shadow`](../src/shadow.ts) | Shadow binary installation utilities for Socket ecosystem. |
4646
| [`@socketsecurity/lib/signal-exit`](../src/signal-exit.ts) | Process signal handling utilities. |

src/regexps.ts

Lines changed: 121 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,131 @@
11
/**
2-
* @fileoverview Regular expression utilities including escape-string-regexp implementation.
3-
* Provides regex escaping and pattern matching helpers.
2+
* @fileoverview Regular expression utilities including a spec-compliant
3+
* `RegExp.escape` fallback. Provides regex escaping and pattern matching
4+
* helpers.
45
*/
56

6-
// Inlined escape-string-regexp:
7-
// https://socket.dev/npm/package/escape-string-regexp/overview/5.0.0
8-
// MIT License
9-
// Copyright (c) Sindre Sorhus <sindresorhus@gmail.com> (https://sindresorhus.com)
7+
// Spec-compliant fallback for TC39 RegExp.escape (Node 24+ ships native):
8+
// https://tc39.es/ecma262/#sec-regexp.escape
9+
// https://tc39.es/ecma262/#sec-encodeforregexpescape
10+
11+
// SyntaxCharacter set plus `/` — these get a plain backslash prefix.
12+
const SYNTAX_CHARACTERS = new Set('^$\\.*+?()[]{}|/')

// ControlEscape mappings: \t \n \v \f \r (spec Table 62).
const CONTROL_ESCAPES = new Map<number, string>([
  [0x09, '\\t'],
  [0x0a, '\\n'],
  [0x0b, '\\v'],
  [0x0c, '\\f'],
  [0x0d, '\\r'],
])

// The spec's `otherPunctuators` string (EncodeForRegExpEscape step 3): ASCII
// punctuators that are hex-escaped instead of backslash-prefixed. Whitespace,
// line terminators and surrogates take the same hex branch but are detected
// by code point in `isSpecHexEscapeCp`, not stored in this set.
const OTHER_PUNCTUATORS = new Set(',-=<>#&!%:;@~\'`"')

/**
 * Report whether a code point must be emitted as `\xHH` / `\uXXXX`
 * (EncodeForRegExpEscape step 5): it is one of `otherPunctuators`, is matched
 * by the WhiteSpace or LineTerminator productions, or is a surrogate.
 *
 * @param cp - Code point (or lone surrogate code unit) to classify.
 * @returns `true` when the caller should hex-escape `cp`.
 */
function isSpecHexEscapeCp(cp: number): boolean {
  if (OTHER_PUNCTUATORS.has(String.fromCodePoint(cp))) {
    return true
  }
  // LineTerminator: LF, CR, LS (U+2028), PS (U+2029).
  if (cp === 0x0a || cp === 0x0d || cp === 0x2028 || cp === 0x2029) {
    return true
  }
  // WhiteSpace: TAB, VT, FF and ZWNBSP (U+FEFF) ...
  if (cp === 0x09 || cp === 0x0b || cp === 0x0c || cp === 0xfeff) {
    return true
  }
  // ... plus every Unicode Space_Separator (Zs) code point: SP, NBSP,
  // U+1680, U+2000..U+200A, U+202F, U+205F, U+3000. All of them must be
  // listed, otherwise the output diverges from native `RegExp.escape`.
  if (
    cp === 0x20 ||
    cp === 0xa0 ||
    cp === 0x1680 ||
    (cp >= 0x2000 && cp <= 0x200a) ||
    cp === 0x202f ||
    cp === 0x205f ||
    cp === 0x3000
  ) {
    return true
  }
  // Leading / trailing surrogate values. These are only seen here when the
  // input contains an unpaired surrogate, because paired ones are delivered
  // to the caller as a single astral code point.
  return cp >= 0xd800 && cp <= 0xdfff
}

/** Lowercase hex of `n`, left-padded with zeros to at least two digits. */
function hex2(n: number): string {
  return n.toString(16).padStart(2, '0')
}

/** Lowercase hex of `n`, left-padded with zeros to at least four digits. */
function hex4(n: number): string {
  return n.toString(16).padStart(4, '0')
}

/**
 * Pure-JS implementation of TC39 `RegExp.escape` for runtimes without the
 * native method.
 *
 * @param str - String to escape.
 * @returns `str` with every code point encoded per the spec algorithm.
 * @throws {TypeError} If `str` is not a string (spec step 1).
 */
function escapeRegExpFallback(str: string): string {
  // Without this guard an array argument would be iterated element by
  // element instead of being rejected.
  if (typeof str !== 'string') {
    throw new TypeError('escapeRegExp: argument must be a string')
  }
  let out = ''
  let isFirst = true
  // The string iterator walks by code point: surrogate pairs arrive as one
  // two-unit string, unpaired surrogates as a one-unit string.
  for (const char of str) {
    const cp = char.codePointAt(0)!
    const isAsciiAlnum =
      (cp >= 0x30 && cp <= 0x39) ||
      (cp >= 0x41 && cp <= 0x5a) ||
      (cp >= 0x61 && cp <= 0x7a)
    if (isFirst && isAsciiAlnum) {
      // A leading [0-9A-Za-z] is always \xHH so it cannot merge with a
      // preceding `\0`..`\9` or `\c` in a larger pattern.
      out += '\\x' + hex2(cp)
    } else if (SYNTAX_CHARACTERS.has(char)) {
      // SyntaxCharacter or `/`: plain backslash prefix.
      out += '\\' + char
    } else {
      const ctrl = CONTROL_ESCAPES.get(cp)
      if (ctrl !== undefined) {
        out += ctrl
      } else if (isSpecHexEscapeCp(cp)) {
        if (cp <= 0xff) {
          out += '\\x' + hex2(cp)
        } else {
          // One \uXXXX per UTF-16 code unit.
          for (let i = 0; i < char.length; i++) {
            out += '\\u' + hex4(char.charCodeAt(i))
          }
        }
      } else {
        // Everything else is copied verbatim.
        out += char
      }
    }
    isFirst = false
  }
  return out
}
10108

11109
/**
12-
* Escape special characters in a string for use in a regular expression.
110+
* Escape special characters in a string so the result can be safely
111+
* concatenated into any regular-expression Pattern position without
112+
* altering the meaning of surrounding syntax.
113+
*
114+
* Bound to native `RegExp.escape` when available (TC39 Stage 4, Node 24+ /
115+
* V8 13.7); otherwise falls back to a spec-compliant implementation. Both
116+
* paths satisfy the spec guarantee: `new RegExp(escapeRegExp(s))` matches
117+
* exactly the literal string `s`.
118+
*
119+
* Reference: https://tc39.es/ecma262/#sec-regexp.escape
13120
*
14121
* @example
15122
* ```typescript
16-
* escapeRegExp('foo.bar') // 'foo\\.bar'
17-
* escapeRegExp('a+b*c?') // 'a\\+b\\*c\\?'
18-
* new RegExp(escapeRegExp('[test]')) // /\[test\]/
123+
* new RegExp(escapeRegExp('[test]')) // matches literal '[test]'
124+
* new RegExp('[' + escapeRegExp('a-z') + ']') // matches 'a', '-', or 'z'
19125
* ```
20126
*/
21-
/*@__NO_SIDE_EFFECTS__*/
22-
export function escapeRegExp(str: string): string {
23-
// Escape characters with special meaning either inside or outside
24-
// character sets. Includes `-` so callers that splice an escaped
25-
// string into a character class — e.g. `new RegExp('[' +
26-
// escapeRegExp(userInput) + ']')` — don't accidentally create a range
27-
// when input contains '-'. Matches the MDN / `escape-string-regexp`
28-
// reference set.
29-
return str.replace(/[\\|{}()[\]^$+*?.-]/g, '\\$&')
30-
}
127+
// Feature-detect native `RegExp.escape` once, at module load. The property is
// read through a structural cast, presumably because the TypeScript lib
// typings in use do not declare it yet (TODO confirm against tsconfig `lib`).
const maybeNativeEscape = (RegExp as unknown as { escape?: unknown }).escape

/**
 * Escape a string so it can be concatenated into a regular-expression
 * pattern and still be matched literally.
 *
 * Resolves to native `RegExp.escape` when the runtime provides it as a
 * function, otherwise to `escapeRegExpFallback`. The choice is made once
 * when this module is evaluated.
 *
 * This doc comment is placed directly on the export so that editor hovers
 * and doc generators attach it to `escapeRegExp` rather than to the private
 * detection constant declared just before it.
 *
 * @param str - String to escape.
 * @returns The escaped string.
 *
 * @example
 * ```typescript
 * new RegExp('^' + escapeRegExp('a.b') + '$').test('a.b') // true
 * new RegExp('^' + escapeRegExp('a.b') + '$').test('axb') // false
 * ```
 */
export const escapeRegExp: (str: string) => string =
  typeof maybeNativeEscape === 'function'
    ? (maybeNativeEscape as (str: string) => string)
    : escapeRegExpFallback

test/unit/regexps.test.mts

Lines changed: 110 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,125 +1,155 @@
11
/**
2-
* @fileoverview Unit tests for regular expression utilities.
2+
* @fileoverview Unit tests for `escapeRegExp`.
33
*
4-
* Tests regex helper functions:
5-
* - escapeRegExp() escapes special characters for safe regex construction
6-
* - Handles all regex metacharacters: \, |, {, }, [, ], (, ), *, +, ?, ., ^, $
7-
* - Prevents regex injection vulnerabilities
8-
* - Used for dynamic pattern building from user input
9-
* Used throughout Socket tools for safe regex pattern construction.
4+
* Tests align with the TC39 RegExp.escape spec:
5+
* https://tc39.es/ecma262/#sec-regexp.escape
6+
*
7+
* Assertions are BEHAVIOR-based (the escaped output produces a regex that
8+
* matches the original input exactly) plus targeted SPEC-SHAPE checks for
9+
* the two invariants that matter for safe concatenation:
10+
* 1. Leading `[0-9A-Za-z]` is encoded as `\xHH` so it can't merge with
11+
* a preceding `\0..\9` / `\c` in a larger pattern.
12+
* 2. `/` is backslash-escaped so the result is safe inside a `/.../`
13+
* literal.
14+
*
15+
* We verify the same guarantees hold whether `escapeRegExp` is bound to
16+
* native `RegExp.escape` (Node 24+) or our hand-rolled fallback.
1017
*/
1118

1219
import { describe, expect, it } from 'vitest'
1320

1421
import { escapeRegExp } from '@socketsecurity/lib/regexps'
1522

23+
/** `new RegExp(escapeRegExp(input))` must match exactly `input`. */
24+
function expectLiteralRoundtrip(input: string): void {
  // Anchor both ends so the whole input has to be consumed by the escaped
  // pattern, not just a substring of it.
  const anchoredSource = '^' + escapeRegExp(input) + '$'
  const matchesItself = new RegExp(anchoredSource).test(input)
  expect(matchesItself).toBe(true)
}
28+
1629
describe('regexps', () => {
1730
describe('escapeRegExp', () => {
18-
it('should escape backslash', () => {
19-
expect(escapeRegExp('\\')).toBe('\\\\')
20-
})
21-
22-
it('should escape pipe', () => {
23-
expect(escapeRegExp('|')).toBe('\\|')
24-
})
25-
26-
it('should escape curly braces', () => {
27-
expect(escapeRegExp('{}')).toBe('\\{\\}')
28-
expect(escapeRegExp('{')).toBe('\\{')
29-
expect(escapeRegExp('}')).toBe('\\}')
30-
})
31-
32-
it('should escape parentheses', () => {
33-
expect(escapeRegExp('()')).toBe('\\(\\)')
34-
expect(escapeRegExp('(')).toBe('\\(')
35-
expect(escapeRegExp(')')).toBe('\\)')
31+
it('is a function (native or fallback)', () => {
32+
expect(typeof escapeRegExp).toBe('function')
3633
})
3734

38-
it('should escape square brackets', () => {
39-
expect(escapeRegExp('[]')).toBe('\\[\\]')
40-
expect(escapeRegExp('[')).toBe('\\[')
41-
expect(escapeRegExp(']')).toBe('\\]')
35+
it('empty string returns empty string', () => {
36+
expect(escapeRegExp('')).toBe('')
4237
})
4338

44-
it('should escape caret', () => {
45-
expect(escapeRegExp('^')).toBe('\\^')
39+
// Spec §22.2.5.1 step 3.a: leading `[0-9A-Za-z]` → `\xHH`.
40+
it('encodes leading ASCII letter/digit as \\xHH', () => {
41+
expect(escapeRegExp('a')).toBe('\\x61')
42+
expect(escapeRegExp('Z')).toBe('\\x5a')
43+
expect(escapeRegExp('0')).toBe('\\x30')
44+
expect(escapeRegExp('9')).toBe('\\x39')
45+
// Trailing letters/digits are NOT hex-escaped.
46+
expect(escapeRegExp('abc').startsWith('\\x61')).toBe(true)
47+
expect(escapeRegExp('abc').endsWith('bc')).toBe(true)
4648
})
4749

48-
it('should escape dollar sign', () => {
49-
expect(escapeRegExp('$')).toBe('\\$')
50+
// Spec §22.2.5.1.1 step 1: SyntaxCharacter + `/` → backslash prefix.
51+
it('backslash-prefixes SyntaxCharacter + /', () => {
52+
for (const ch of '^$\\.*+?()[]{}|/') {
53+
expect(escapeRegExp(ch)).toBe('\\' + ch)
54+
}
5055
})
5156

52-
it('should escape plus', () => {
53-
expect(escapeRegExp('+')).toBe('\\+')
57+
// Spec §22.2.5.1.1 step 2: ControlEscape (Table 62).
58+
it('encodes control-escape characters as their escape forms', () => {
59+
expect(escapeRegExp('\t')).toBe('\\t')
60+
expect(escapeRegExp('\n')).toBe('\\n')
61+
expect(escapeRegExp('\v')).toBe('\\v')
62+
expect(escapeRegExp('\f')).toBe('\\f')
63+
expect(escapeRegExp('\r')).toBe('\\r')
5464
})
5565

56-
it('should escape asterisk', () => {
57-
expect(escapeRegExp('*')).toBe('\\*')
66+
// Spec §22.2.5.1.1 step 4: otherPunctuators → \xHH (cp ≤ 0xFF).
67+
it('hex-escapes the otherPunctuators set', () => {
68+
for (const ch of ',-=<>#&!%:;@~\'`"') {
69+
const cp = ch.codePointAt(0)!
70+
expect(escapeRegExp(ch)).toBe('\\x' + cp.toString(16).padStart(2, '0'))
71+
}
5872
})
5973

60-
it('should escape question mark', () => {
61-
expect(escapeRegExp('?')).toBe('\\?')
74+
// Critical for the character-class splice use case.
75+
it('escaped `-` stays literal inside a character class', () => {
76+
const escaped = escapeRegExp('a-z')
77+
const re = new RegExp(`^[${escaped}]$`)
78+
expect(re.test('a')).toBe(true)
79+
expect(re.test('-')).toBe(true)
80+
expect(re.test('z')).toBe(true)
81+
// Letter between a and z must NOT match if `-` stayed literal.
82+
expect(re.test('m')).toBe(false)
6283
})
6384

64-
it('should escape dot', () => {
65-
expect(escapeRegExp('.')).toBe('\\.')
85+
// Behavior-level roundtrip: any metacharacter-only string must match
86+
// itself literally after escape.
87+
it('every metacharacter round-trips as a literal match', () => {
88+
for (const ch of '\\|{}()[]^$+*?.-/') {
89+
expectLiteralRoundtrip(ch)
90+
}
6691
})
6792

68-
it('should escape multiple special characters', () => {
69-
// biome-ignore lint/suspicious/noTemplateCurlyInString: Testing regex escape for curly braces
70-
expect(escapeRegExp('.*+?^${}()|[]')).toBe(
71-
'\\.\\*\\+\\?\\^\\$\\{\\}\\(\\)\\|\\[\\]',
72-
)
93+
it('paired metacharacters round-trip', () => {
94+
for (const pair of ['{}', '()', '[]', '{{', '}}']) {
95+
expectLiteralRoundtrip(pair)
96+
}
7397
})
7498

75-
it('should not escape regular characters', () => {
76-
expect(escapeRegExp('abc123')).toBe('abc123')
77-
expect(escapeRegExp('hello world')).toBe('hello world')
99+
it('every metacharacter in one string round-trips', () => {
100+
expectLiteralRoundtrip('.*+?^${}()|[]/\\-')
78101
})
79102

80-
it('should handle mixed strings', () => {
81-
expect(escapeRegExp('hello.world')).toBe('hello\\.world')
82-
expect(escapeRegExp('test(123)')).toBe('test\\(123\\)')
83-
expect(escapeRegExp('price: $50+')).toBe('price: \\$50\\+')
103+
it('round-trips mixed plain + metacharacter strings', () => {
104+
for (const s of [
105+
'hello.world',
106+
'test(123)',
107+
'price: $50+',
108+
'*.{js,ts}',
109+
'a{1,3}',
110+
]) {
111+
expectLiteralRoundtrip(s)
112+
}
84113
})
85114

86-
it('should handle empty string', () => {
87-
expect(escapeRegExp('')).toBe('')
115+
it('round-trips plain ASCII strings', () => {
116+
for (const s of ['abc123', 'hello world', 'foo', '123']) {
117+
expectLiteralRoundtrip(s)
118+
}
88119
})
89120

90-
it('should work in actual regex', () => {
91-
const input = 'test.file'
92-
const escaped = escapeRegExp(input)
93-
const regex = new RegExp(escaped)
94-
95-
expect(regex.test('test.file')).toBe(true)
96-
expect(regex.test('testXfile')).toBe(false)
121+
// A sanity check that metacharacter meaning is neutralized, not just
122+
// that the input string matches itself (which a `.*` regex would
123+
// trivially satisfy).
124+
it('escaped `.` does not act as a wildcard', () => {
125+
const re = new RegExp(`^${escapeRegExp('test.file')}$`)
126+
expect(re.test('test.file')).toBe(true)
127+
expect(re.test('testXfile')).toBe(false)
97128
})
98129

99-
it('should escape complex file patterns', () => {
100-
const pattern = '*.{js,ts}'
101-
const escaped = escapeRegExp(pattern)
102-
expect(escaped).toBe('\\*\\.\\{js,ts\\}')
130+
it('escaped quantifier does not quantify', () => {
131+
const re = new RegExp(`^${escapeRegExp('a{1,3}')}$`)
132+
expect(re.test('a{1,3}')).toBe(true)
133+
expect(re.test('aaa')).toBe(false)
103134
})
104135

105-
it('should escape regex quantifiers', () => {
106-
expect(escapeRegExp('a{1,3}')).toBe('a\\{1,3\\}')
107-
expect(escapeRegExp('a*')).toBe('a\\*')
108-
expect(escapeRegExp('a+')).toBe('a\\+')
109-
expect(escapeRegExp('a?')).toBe('a\\?')
136+
it('escaped `*` does not act as a wildcard in a glob-like input', () => {
137+
const re = new RegExp(`^${escapeRegExp('*.{js,ts}')}$`)
138+
expect(re.test('*.{js,ts}')).toBe(true)
139+
expect(re.test('foo.js')).toBe(false)
110140
})
111141

112-
it('should escape character classes (including the range hyphen)', () => {
113-
// `-` is now escaped so splicing the result into a character class
114-
// (e.g. `[${escapeRegExp('a-z')}]`) produces three literal chars
115-
// rather than a range.
116-
expect(escapeRegExp('[a-z]')).toBe('\\[a\\-z\\]')
117-
expect(escapeRegExp('[^0-9]')).toBe('\\[\\^0\\-9\\]')
142+
it('round-trips unicode characters', () => {
143+
expectLiteralRoundtrip('hello世界')
144+
expectLiteralRoundtrip('test.世界')
118145
})
119146

120-
it('should handle unicode characters', () => {
121-
expect(escapeRegExp('hello世界')).toBe('hello世界')
122-
expect(escapeRegExp('test.世界')).toBe('test\\.世界')
147+
// Spec guarantees safe concatenation into any Pattern context.
148+
it('escaped output is safe to splice between arbitrary regex fragments', () => {
149+
const middle = escapeRegExp('1.2.3')
150+
const re = new RegExp(`^v${middle}-release$`)
151+
expect(re.test('v1.2.3-release')).toBe(true)
152+
expect(re.test('vX2X3-release')).toBe(false)
123153
})
124154
})
125155
})

0 commit comments

Comments
 (0)