Skip to content

Commit 899fc68

Browse files
committed
fix(chunkers): address PR review comments
- Fix regex fallback path: use a sliding window for overlap instead of passing chunkOverlap to buildChunks without prepended overlap text
- Fix misleading strategy label: "Text (hierarchical splitting)" → "Text (word boundary splitting)"
1 parent cb814ff commit 899fc68

2 files changed

Lines changed: 5 additions & 3 deletions

File tree

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ interface CreateBaseModalProps {
4040

4141
const STRATEGY_OPTIONS = [
4242
{ value: 'auto', label: 'Auto (detect from content)' },
43-
{ value: 'text', label: 'Text (hierarchical splitting)' },
43+
{ value: 'text', label: 'Text (word boundary splitting)' },
4444
{ value: 'recursive', label: 'Recursive (configurable separators)' },
4545
{ value: 'sentence', label: 'Sentence' },
4646
{ value: 'token', label: 'Token (fixed-size)' },

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,10 @@ export class RegexChunker {
8686
if (segments.length <= 1) {
8787
logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
8888
const chunkSizeChars = tokensToChars(this.chunkSize)
89-
const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
90-
return buildChunks(chunks, this.chunkOverlap)
89+
const overlapChars = tokensToChars(this.chunkOverlap)
90+
const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
91+
const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
92+
return buildChunks(chunks, 0)
9193
}
9294

9395
const merged = this.mergeSegments(segments)

0 commit comments

Comments
 (0)