/**
 * Exponential moving-average filter with optional clamping.
 *
 * Port of the Python `livekit.agents.utils.exp_filter.ExpFilter`. The filter
 * keeps a single running value; every {@link ExpFilter.apply} call blends a new
 * sample into it and then clamps the result into `[minVal, maxVal]` when those
 * bounds are set.
 *
 * @internal
 */
export class ExpFilter {
  private _alpha: number;
  private _filtered: number | undefined;
  private _maxVal: number | undefined;
  private _minVal: number | undefined;

  /**
   * @param alpha - Smoothing base, must be in `(0, 1]`.
   * @param maxVal - Optional upper clamp applied to the filtered value.
   * @param minVal - Optional lower clamp applied to the filtered value.
   * @param initial - Optional starting value of the filter.
   * @throws Error if `alpha` is outside `(0, 1]`.
   */
  constructor(alpha: number, maxVal?: number, minVal?: number, initial?: number) {
    ExpFilter.assertValidAlpha(alpha);

    this._alpha = alpha;
    this._filtered = initial;
    this._maxVal = maxVal;
    this._minVal = minVal;
  }

  /**
   * Single source of truth for the alpha range check, shared by the
   * constructor, `reset`, the `alpha` setter and `updateBase`.
   *
   * Written as a negated conjunction so that `NaN` is rejected as well
   * (every comparison with `NaN` is false).
   */
  private static assertValidAlpha(alpha: number): void {
    if (!(alpha > 0 && alpha <= 1)) {
      throw new Error('alpha must be in (0, 1].');
    }
  }

  /**
   * Selectively overwrite filter parameters. Only the fields that are provided
   * (i.e. not `undefined`) are changed; calling `reset()` with no arguments
   * leaves the filter untouched, including its current value.
   *
   * @throws Error if `alpha` is provided and is outside `(0, 1]`. The check
   * runs before any field is modified.
   */
  reset({
    alpha,
    initial,
    minVal,
    maxVal,
  }: {
    alpha?: number;
    initial?: number;
    minVal?: number;
    maxVal?: number;
  } = {}): void {
    if (alpha !== undefined) {
      ExpFilter.assertValidAlpha(alpha);
      this._alpha = alpha;
    }
    if (initial !== undefined) {
      this._filtered = initial;
    }
    if (minVal !== undefined) {
      this._minVal = minVal;
    }
    if (maxVal !== undefined) {
      this._maxVal = maxVal;
    }
  }

  /**
   * Blend `sample` into the running value and return the clamped result.
   *
   * The blend weight is `alpha ** exp`: the previous value keeps that weight
   * and the sample receives the remainder. When `sample` is omitted the current
   * value is re-used as the sample, so only the clamping step has an effect.
   *
   * @param exp - Exponent applied to `alpha` to obtain the blend weight.
   * @param sample - New observation; defaults to the current filtered value.
   * @returns The updated (and clamped) filtered value.
   * @throws Error if neither a sample nor an initial/previous value exists.
   */
  apply(exp: number, sample?: number): number {
    sample ??= this._filtered;

    if (sample !== undefined) {
      if (this._filtered === undefined) {
        // First observation seeds the filter directly.
        this._filtered = sample;
      } else {
        const a = this._alpha ** exp;
        this._filtered = a * this._filtered + (1 - a) * sample;
      }
    }

    if (this._filtered === undefined) {
      throw new Error('sample or initial value must be given.');
    }

    // Upper bound is applied first, then the lower bound, so the lower bound
    // wins if the two are ever configured inconsistently.
    if (this._maxVal !== undefined && this._filtered > this._maxVal) {
      this._filtered = this._maxVal;
    }

    if (this._minVal !== undefined && this._filtered < this._minVal) {
      this._filtered = this._minVal;
    }

    return this._filtered;
  }

  /** Current filtered value, or `undefined` if nothing has been applied yet. */
  get value(): number | undefined {
    return this._filtered;
  }

  /** Alias of {@link ExpFilter.value}, kept for existing callers. */
  get filtered(): number | undefined {
    return this.value;
  }

  /**
   * Replace the smoothing base.
   *
   * @throws Error if `alpha` is outside `(0, 1]`.
   */
  set alpha(alpha: number) {
    ExpFilter.assertValidAlpha(alpha);
    this._alpha = alpha;
  }

  /**
   * Replace the smoothing base without touching the filtered value or bounds.
   *
   * Validated the same way as every other path that sets alpha, so an
   * out-of-range base can never reach `apply`.
   *
   * @throws Error if `alpha` is outside `(0, 1]`.
   */
  updateBase(alpha: number): void {
    ExpFilter.assertValidAlpha(alpha);
    this._alpha = alpha;
  }
}
import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; +import type { EndpointingOptions } from './turn_config/endpointing.js'; +import { stripUndefined } from './turn_config/utils.js'; import { setParticipantSpanAttributes } from './utils.js'; export const agentActivityStorage = new AsyncLocalStorage(); @@ -195,6 +198,7 @@ export class AgentActivity implements RecognitionHooks { private isInterruptionDetectionEnabled: boolean; private isInterruptionByAudioActivityEnabled: boolean; private isDefaultInterruptionByAudioActivityEnabled: boolean; + private interruptionDetected = false; // for false interruption handling private pausedSpeech?: PausedSpeechInfo; @@ -218,6 +222,8 @@ export class AgentActivity implements RecognitionHooks { this.onError(ev); private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1503-1508 lines + this.interruptionDetected = ev.isInterruption; this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev); }; @@ -489,12 +495,8 @@ export class AgentActivity implements RecognitionHooks { turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, interruptionDetection: this.interruptionDetector, - minEndpointingDelay: - this.agent.turnHandling?.endpointing?.minDelay ?? - this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, - maxEndpointingDelay: - this.agent.turnHandling?.endpointing?.maxDelay ?? 
- this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 779-789 lines + endpointing: createEndpointing(this.endpointingOptions), rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -673,19 +675,13 @@ export class AgentActivity implements RecognitionHooks { return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling; } - // get minEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.minDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay - // ); - // } - - // get maxEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.maxDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay - // ); - // } + private get endpointingOptions(): EndpointingOptions { + const agentEndpointing = this.agent.turnHandling?.endpointing; + return { + ...this.agentSession.sessionOptions.turnHandling.endpointing, + ...(agentEndpointing ? stripUndefined(agentEndpointing) : {}), + }; + } get toolCtx(): ToolContext { return this.agent.toolCtx; @@ -730,9 +726,11 @@ export class AgentActivity implements RecognitionHooks { updateOptions({ toolChoice, turnDetection, + endpointing, }: { toolChoice?: ToolChoice | null; turnDetection?: TurnDetectionMode; + endpointing?: EndpointingOptions; }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; @@ -755,7 +753,11 @@ export class AgentActivity implements RecognitionHooks { } if (this.audioRecognition) { - this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode }); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 487-493 lines + this.audioRecognition.updateOptions({ + endpointing: endpointing !== undefined ? 
createEndpointing(endpointing) : undefined, + turnDetection: this.turnDetectionMode, + }); } } @@ -934,12 +936,9 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1510-1517 lines + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } } @@ -959,8 +958,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1528-1535 lines + this.audioRecognition.onEndOfSpeech(Date.now(), this.agentSession._userSpeakingSpan); } this.agentSession._updateUserState('listening'); } @@ -1042,14 +1042,17 @@ export class AgentActivity implements RecognitionHooks { lastSpeakingTime: speechStartTime, otelContext: otelContext.active(), }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { // Pass speechStartTime as the absolute startedAt timestamp. 
- this.audioRecognition.onStartOfOverlapSpeech( - ev.speechDuration, + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1662-1673 lines + this.audioRecognition.onStartOfSpeech( speechStartTime, + ev.speechDuration, this.agentSession._userSpeakingSpan, ); } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1674-1676 lines + this.interruptionDetected = false; if (this.falseInterruptionTimer) { // cancel the timer when user starts speaking but leave the paused state unchanged @@ -1080,11 +1083,13 @@ export class AgentActivity implements RecognitionHooks { // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency. speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration; } - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { // Pass speechEndTime as the absolute endedAt timestamp. - this.audioRecognition.onEndOfOverlapSpeech( + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1698-1712 lines + this.audioRecognition.onEndOfSpeech( speechEndTime, this.agentSession._userSpeakingSpan, + this.isInterruptionDetectionEnabled ? 
this.interruptionDetected : undefined, ); } this.agentSession._updateUserState('listening', { @@ -1167,15 +1172,12 @@ export class AgentActivity implements RecognitionHooks { const audioOutput = this.agentSession.output.audio; if ( - this.isInterruptionDetectionEnabled && this.audioRecognition && + !this.audioRecognition.endpointingOverlapping && this.agentSession.agentState === 'speaking' ) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1631-1639 lines + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } this.updatePausedSpeech(this._currentSpeech, timeout); @@ -1943,8 +1945,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2223-2231 lines + this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2036,10 +2041,12 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); } - this.restoreInterruptionByAudioActivity(); + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); + } } } @@ -2227,8 +2234,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if 
(this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2579-2588 lines + this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2390,8 +2400,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { this.restoreInterruptionByAudioActivity(); } } @@ -2433,11 +2445,11 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - { - this.audioRecognition.onEndOfAgentSpeech(Date.now()); - this.restoreInterruptionByAudioActivity(); - } + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); } } @@ -3390,7 +3402,8 @@ export class AgentActivity implements RecognitionHooks { otelContext: this.pausedSpeech.handle._agentTurnContext, }); if (this.audioRecognition && this.pausedSpeech.agentState === 'speaking') { - this.audioRecognition.onStartOfAgentSpeech(); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 3479-3486 lines + this.audioRecognition.onStartOfAgentSpeech(Date.now()); } if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; diff --git a/agents/src/voice/audio_recognition.ts 
b/agents/src/voice/audio_recognition.ts index b1e97c3e6..f4e07b787 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -35,6 +35,7 @@ import { traceTypes, tracer } from '../telemetry/index.js'; import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; +import type { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; import { setParticipantSpanAttributes } from './utils.js'; @@ -138,10 +139,8 @@ export interface AudioRecognitionOptions { /** Turn detection mode. */ turnDetectionMode?: TurnDetectionMode; interruptionDetection?: AdaptiveInterruptionDetector; - /** Minimum endpointing delay in milliseconds. */ - minEndpointingDelay: number; - /** Maximum endpointing delay in milliseconds. */ - maxEndpointingDelay: number; + /** Endpointing state used to select the end-of-turn delay. */ + endpointing: BaseEndpointing; /** Root span context for tracing. 
*/ rootSpanContext?: Context; /** STT model name for tracing */ @@ -170,8 +169,7 @@ export class AudioRecognition { private vad?: VAD; private turnDetector?: _TurnDetector; private turnDetectionMode?: TurnDetectionMode; - private minEndpointingDelay: number; - private maxEndpointingDelay: number; + private endpointing: BaseEndpointing; private lastLanguage?: LanguageCode; private rootSpanContext?: Context; private sttModel?: string; @@ -228,8 +226,7 @@ export class AudioRecognition { this.vad = opts.vad; this.turnDetector = opts.turnDetector; this.turnDetectionMode = opts.turnDetectionMode; - this.minEndpointingDelay = opts.minEndpointingDelay; - this.maxEndpointingDelay = opts.maxEndpointingDelay; + this.endpointing = opts.endpointing; this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; this.sttModel = opts.sttModel; @@ -278,9 +275,24 @@ export class AudioRecognition { return this._inputStartedAt; } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1631-1639 lines + get endpointingOverlapping(): boolean { + return this.endpointing.overlapping; + } + /** @internal */ - updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void { - this.turnDetectionMode = options.turnDetection; + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 193-219 lines + updateOptions(options: { + endpointing?: BaseEndpointing; + turnDetection?: TurnDetectionMode | undefined; + }): void { + if (options.endpointing !== undefined) { + this.endpointing = options.endpointing; + } + + if (Object.hasOwn(options, 'turnDetection')) { + this.turnDetectionMode = options.turnDetection; + } } async start(options?: { sttPipeline?: STTPipeline }) { @@ -315,12 +327,19 @@ export class AudioRecognition { this.interruptionStreamChannel = undefined; } - async onStartOfAgentSpeech() { + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 239-244 lines + async onStartOfAgentSpeech(startedAt: number) { 
this.isAgentSpeaking = true; + this.endpointing.onStartOfAgentSpeech(startedAt); return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 246-271 lines async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (this.isAgentSpeaking) { + this.endpointing.onEndOfAgentSpeech(Date.now()); + } + if (!this.isInterruptionEnabled) { this.isAgentSpeaking = false; return; @@ -348,6 +367,27 @@ export class AudioRecognition { this.isAgentSpeaking = false; } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 273-290 lines + async onStartOfSpeech(startedAt: number, speechDuration = 0, userSpeakingSpan?: Span) { + this.endpointing.onStartOfSpeech(startedAt, this.isAgentSpeaking); + if (!this.isInterruptionEnabled || !this.isAgentSpeaking) { + return; + } + return this.onStartOfOverlapSpeech(speechDuration, startedAt, userSpeakingSpan); + } + + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 292-306 lines + async onEndOfSpeech(endedAt: number, userSpeakingSpan?: Span, interruption?: boolean) { + if (this.speaking) { + this.endpointing.onEndOfSpeech( + endedAt, + interruption !== undefined && !interruption && this.isAgentSpeaking, + ); + } + + return this.onEndOfOverlapSpeech(endedAt, userSpeakingSpan); + } + /** Start interruption inference when agent is speaking and overlap speech starts. 
*/ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { if (this.isAgentSpeaking) { @@ -825,7 +865,8 @@ export class AudioRecognition { speechStartTime: number | undefined, ) => async (controller: AbortController) => { - let endpointingDelay = this.minEndpointingDelay; + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 949-973 lines + let endpointingDelay = this.endpointing.minDelay; const userTurnSpan = this.ensureUserTurnSpan(); const userTurnCtx = this.userTurnContext(userTurnSpan); @@ -851,7 +892,7 @@ export class AudioRecognition { ); if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.maxEndpointingDelay; + endpointingDelay = this.endpointing.maxDelay; } } catch (error) { this.logger.error(error, 'Error predicting end of turn'); diff --git a/agents/src/voice/audio_recognition_handoff.test.ts b/agents/src/voice/audio_recognition_handoff.test.ts index 76311ec12..204f50462 100644 --- a/agents/src/voice/audio_recognition_handoff.test.ts +++ b/agents/src/voice/audio_recognition_handoff.test.ts @@ -7,6 +7,7 @@ import { ChatContext } from '../llm/chat_context.js'; import { initializeLogger } from '../log.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { AudioRecognition, type RecognitionHooks, STTPipeline } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function createHooks() { @@ -45,8 +46,7 @@ function createRecognition(sttNode: STTNode, hooks = createHooks()) { recognition: new AudioRecognition({ recognitionHooks: hooks, stt: sttNode, - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), }), }; } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index cfe92a821..56ae77646 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ 
b/agents/src/voice/audio_recognition_span.test.ts @@ -22,6 +22,7 @@ import { type RecognitionHooks, type _TurnDetector, } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function setupInMemoryTracing() { @@ -145,8 +146,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: undefined, turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'stt', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'deepgram-nova2', sttProvider: 'deepgram', getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }), @@ -254,8 +254,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: new FakeVAD(vadEvents), turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'vad', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'stt-model', sttProvider: 'stt-provider', getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }), diff --git a/agents/src/voice/endpointing.test.ts b/agents/src/voice/endpointing.test.ts new file mode 100644 index 000000000..61b0c5fdd --- /dev/null +++ b/agents/src/voice/endpointing.test.ts @@ -0,0 +1,566 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { ExpFilter } from '../utils.js'; +import { AgentActivity } from './agent_activity.js'; +import { AudioRecognition, type RecognitionHooks } from './audio_recognition.js'; +import { DynamicEndpointing, createEndpointing } from './endpointing.js'; + +function privateState(value: object): Record { + return value as Record; +} + +function createHooks(): RecognitionHooks { + return { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onEndOfTurn: vi.fn(async () => true), + onPreemptiveGeneration: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + }; +} + +describe('TestExponentialMovingAverage', () => { + it('test_initialization_with_valid_alpha', () => { + const ema = new ExpFilter(0.5); + expect(ema.value).toBeUndefined(); + + const emaWithInitial = new ExpFilter(0.5, undefined, undefined, 10); + expect(emaWithInitial.value).toBe(10); + + const emaAlphaOne = new ExpFilter(1.0); + expect(emaAlphaOne.value).toBeUndefined(); + }); + + it('test_initialization_with_invalid_alpha', () => { + expect(() => new ExpFilter(0.0)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(-0.5)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(1.5)).toThrow(/alpha must be in/); + }); + + it('test_update_with_no_initial_value', () => { + const ema = new ExpFilter(0.5); + const result = ema.apply(1, 10); + expect(result).toBe(10); + expect(ema.value).toBe(10); + }); + + it('test_update_with_initial_value', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + const result = ema.apply(1, 20); + expect(result).toBe(15); + expect(ema.value).toBe(15); + }); + + it('test_update_multiple_times', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.apply(1, 
20); + ema.apply(1, 20); + expect(ema.value).toBe(17.5); + }); + + it('test_reset', () => { + let ema = new ExpFilter(0.5, undefined, undefined, 10); + expect(ema.value).toBe(10); + ema.reset(); + expect(ema.value).toBe(10); + + ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.reset({ initial: 5 }); + expect(ema.value).toBe(5); + }); +}); + +describe('TestDynamicEndpointing', () => { + it('test_initialization', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_with_custom_alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.2); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_uses_updated_default_alpha', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(privateState(privateState(ep)._utterancePause)._alpha).toBeCloseTo(0.9, 5); + expect(privateState(privateState(ep)._turnPause)._alpha).toBeCloseTo(0.9, 5); + }); + + it('test_empty_delays', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.betweenUtteranceDelay).toBe(0); + expect(ep.betweenTurnDelay).toBe(0); + expect(ep.immediateInterruptionDelay).toEqual([0, 0]); + }); + + it('test_on_utterance_ended', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + expect(privateState(ep)._utteranceEndedAt).toBe(100000); + + ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(99900); + expect(privateState(ep)._utteranceEndedAt).toBe(99900); + }); + + it('test_on_utterance_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfSpeech(100000); + expect(privateState(ep)._utteranceStartedAt).toBe(100000); + }); + + it('test_on_agent_speech_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + expect(privateState(ep)._agentSpeechStartedAt).toBe(100000); + }); + + it('test_between_utterance_delay_calculation', 
() => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100500); + expect(ep.betweenUtteranceDelay).toBeCloseTo(500, 5); + }); + + it('test_between_turn_delay_calculation', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100800); + expect(ep.betweenTurnDelay).toBeCloseTo(800, 5); + }); + + it('test_pause_between_utterances_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400); + ep.onEndOfSpeech(100500, false); + + const expected = 0.5 * 400 + 0.5 * initialMin; + expect(ep.minDelay).toBeCloseTo(expected, 5); + }); + + it('test_new_turn_updates_max_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100600); + ep.onStartOfSpeech(101500); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 600 + 0.5 * 1000, 5); + }); + + it('test_interruption_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + expect(privateState(ep)._agentSpeechStartedAt).not.toBeUndefined(); + ep.onStartOfSpeech(100250, true); + expect(privateState(ep)._overlapping).toBe(true); + + ep.onEndOfSpeech(100500); + + expect(privateState(ep)._overlapping).toBe(false); + expect(privateState(ep)._agentSpeechStartedAt).toBeUndefined(); + expect(ep.minDelay).toBeCloseTo(300, 5); + }); + + it('test_update_options', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ minDelay: 500 }); + expect(ep.minDelay).toBe(500); + expect(privateState(ep)._minDelay).toBe(500); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ maxDelay: 2000 }); + expect(ep.maxDelay).toBe(2000); + expect(privateState(ep)._maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + 
ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(ep.minDelay).toBe(500); + expect(ep.maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions(); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_max_delay_clamped_to_configured_max', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(102000); + ep.onStartOfSpeech(105000); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_max_delay_clamped_to_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100100); + ep.onStartOfSpeech(100500); + expect(ep.maxDelay).toBeGreaterThanOrEqual(privateState(ep)._minDelay); + }); + + it('test_non_interruption_clears_agent_speech', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + expect(privateState(ep)._agentSpeechStartedAt).not.toBeUndefined(); + + ep.onStartOfSpeech(102000); + ep.onEndOfSpeech(103000, false); + expect(privateState(ep)._agentSpeechStartedAt).toBeUndefined(); + }); + + it('test_consecutive_interruptions_only_track_first', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(privateState(ep)._overlapping).toBe(true); + const prevVal = [ep.minDelay, ep.maxDelay]; + + ep.onStartOfSpeech(100350); + + expect(privateState(ep)._overlapping).toBe(true); + expect(prevVal).toEqual([ep.minDelay, ep.maxDelay]); + }); + + it('test_delayed_interruption_updates_max_delay_without_crashing', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + 
it('test_interruption_adjusts_stale_utterance_end_time', () => {
    // The recorded utterance end (99000) is older than the recorded utterance start (100000),
    // so an overlapping start must re-anchor the end just before the agent started speaking.
    const endpointer = new DynamicEndpointing(60, 1000, 1.0);
    endpointer.onEndOfSpeech(99000);
    endpointer.onStartOfSpeech(100000);

    endpointer.onStartOfAgentSpeech(100200);
    endpointer.onStartOfSpeech(100250, true);

    const adjustedEnd = Number(privateState(endpointer)._utteranceEndedAt);
    expect(Math.abs(adjustedEnd - 100200)).toBeLessThanOrEqual(1);
    expect(endpointer.minDelay).toBeCloseTo(60, 5);
    expect(endpointer.maxDelay).toBeCloseTo(1000, 5);
  });

  it('test_update_options_preserves_filter_alpha', () => {
    // Changing only the delay bounds must leave the smoothing coefficient of both filters alone.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.updateOptions({ minDelay: 600, maxDelay: 2000 });

    const utteranceFilter = privateState(privateState(endpointer)._utterancePause);
    const turnFilter = privateState(privateState(endpointer)._turnPause);
    expect(utteranceFilter._alpha).toBeCloseTo(0.5, 5);
    expect(turnFilter._alpha).toBeCloseTo(0.5, 5);
  });

  it('test_update_options_updates_alpha_in_place', () => {
    // Feed one utterance pause so the filter holds a learned value, then change alpha only.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfSpeech(100200);
    endpointer.onEndOfSpeech(101000);
    const minBeforeUpdate = endpointer.minDelay;

    endpointer.updateOptions({ alpha: 0.2 });

    const utteranceFilter = privateState(privateState(endpointer)._utterancePause);
    const turnFilter = privateState(privateState(endpointer)._turnPause);
    expect(utteranceFilter._alpha).toBeCloseTo(0.2, 5);
    expect(turnFilter._alpha).toBeCloseTo(0.2, 5);
    // The learned delay survives the alpha change.
    expect(endpointer.minDelay).toBeCloseTo(minBeforeUpdate, 5);
  });

  it('test_update_options_updates_filter_clamp_bounds', () => {
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.updateOptions({ minDelay: 500, maxDelay: 2000 });
    expect(privateState(privateState(endpointer)._utterancePause)._minVal).toBe(500);
    expect(privateState(privateState(endpointer)._turnPause)._maxVal).toBe(2000);

    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfSpeech(100200);
    expect(endpointer.minDelay).toBeCloseTo(500, 5);

    endpointer.onEndOfSpeech(101000);
    endpointer.onStartOfAgentSpeech(102800);
    endpointer.onStartOfSpeech(103500);
    // The turn delay may now sit above the old ceiling of 1000 but never above the new one.
    expect(endpointer.maxDelay).toBeGreaterThan(1000);
    expect(endpointer.maxDelay).toBeLessThanOrEqual(2000);
  });

  it('test_should_ignore_skips_filter_update', () => {
    // User speech starts 1000 after the agent started, i.e. outside the 250 grace window, so an
    // ignored overlapping utterance is dropped without touching either filter.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfAgentSpeech(100500);
    endpointer.onStartOfSpeech(101500, true);

    const minBefore = endpointer.minDelay;
    const maxBefore = endpointer.maxDelay;

    endpointer.onEndOfSpeech(101800, true);

    expect(endpointer.minDelay).toBe(minBefore);
    expect(endpointer.maxDelay).toBe(maxBefore);
    expect(privateState(endpointer)._utteranceStartedAt).toBeUndefined();
    expect(privateState(endpointer)._utteranceEndedAt).toBeUndefined();
    expect(privateState(endpointer)._overlapping).toBe(false);
    expect(privateState(endpointer)._speaking).toBe(false);
  });

  it('test_should_ignore_without_overlapping_still_updates', () => {
    // `shouldIgnore` only short-circuits overlapping utterances; a plain 400 pause is still learned.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    const startingMin = endpointer.minDelay;

    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfSpeech(100400, false);
    endpointer.onEndOfSpeech(100600, true);

    const smoothed = 0.5 * 400 + 0.5 * startingMin;
    expect(endpointer.minDelay).toBeCloseTo(smoothed, 5);
  });

  it('test_should_ignore_grace_period_overrides', () => {
    // User speech starts 100 after the agent started (inside the 250 grace window), so the
    // utterance is processed even though it is flagged as ignorable.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfAgentSpeech(100500);
    endpointer.onStartOfSpeech(100600, true);

    endpointer.onEndOfSpeech(100800, true);

    expect(privateState(endpointer)._utteranceEndedAt).toBe(100800);
    expect(privateState(endpointer)._speaking).toBe(false);
  });

  it('test_should_ignore_outside_grace_period', () => {
    // User speech starts 500 after the agent started: outside the grace window, so it is dropped.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfAgentSpeech(100500);
    endpointer.onStartOfSpeech(101000, true);

    const minBefore = endpointer.minDelay;
    const maxBefore = endpointer.maxDelay;
    endpointer.onEndOfSpeech(101500, true);

    expect(endpointer.minDelay).toBe(minBefore);
    expect(endpointer.maxDelay).toBe(maxBefore);
    expect(privateState(endpointer)._utteranceStartedAt).toBeUndefined();
    expect(privateState(endpointer)._utteranceEndedAt).toBeUndefined();
  });

  it('test_on_end_of_agent_speech_clears_state', () => {
    const endpointer = new DynamicEndpointing(300, 1000);
    endpointer.onStartOfAgentSpeech(100000);
    endpointer.onStartOfSpeech(100100, true);
    expect(privateState(endpointer)._overlapping).toBe(true);
    expect(privateState(endpointer)._agentSpeechStartedAt).toBe(100000);

    endpointer.onEndOfAgentSpeech(101000);

    // The end is recorded, the start is kept, and the overlap flag is dropped.
    expect(privateState(endpointer)._agentSpeechEndedAt).toBe(101000);
    expect(privateState(endpointer)._agentSpeechStartedAt).toBe(100000);
    expect(privateState(endpointer)._overlapping).toBe(false);
  });

  it('test_overlapping_inferred_from_agent_speech', () => {
    // `overlapping` is passed as false, but the agent is still speaking when the user utterance
    // ends, so the 900 gap between user end and agent start is fed to the turn filter.
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);
    endpointer.onEndOfSpeech(100000);
    endpointer.onStartOfAgentSpeech(100900);
    endpointer.onStartOfSpeech(101800, false);
    endpointer.onEndOfSpeech(102000);
    expect(endpointer.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5);
  });

  it('test_speaking_flag_set_and_cleared', () => {
    const endpointer = new DynamicEndpointing(300, 1000);
    expect(privateState(endpointer)._speaking).toBe(false);
    endpointer.onStartOfSpeech(100000);
    expect(privateState(endpointer)._speaking).toBe(true);
    endpointer.onEndOfSpeech(100500);
    expect(privateState(endpointer)._speaking).toBe(false);
  });

  // Row layout:
  // [label, agentSpeech, overlapping, shouldIgnore, withinGrace, expectMinChange, expectMaxChange]
  it.each([
    ['no_agent/no_overlap/no_ignore', 'none', false, false, false, true, false],
    ['no_agent/no_overlap/ignore', 'none', false, true, false, true, false],
    ['agent_ended/no_overlap/no_ignore', 'ended', false, false, false, false, true],
    ['agent_ended/no_overlap/ignore', 'ended', false, true, false, false, true],
    ['agent_active/no_overlap/no_ignore', 'active', false, false, false, false, true],
    ['agent_active/no_overlap/ignore', 'active', false, true, false, false, true],
    ['agent_active/overlap/no_ignore', 'active', true, false, false, true, false],
    ['agent_active/overlap/ignore/outside_grace', 'active', true, true, false, false, false],
    ['agent_active/overlap/ignore/inside_grace', 'active', true, true, true, true, false],
  ] as const)(
    'test_all_overlapping_and_should_ignore_combos %s',
    (
      label,
      agentSpeech,
      overlapping,
      shouldIgnore,
      withinGrace,
      expectMinChange,
      expectMaxChange,
    ) => {
      const endpointer = new DynamicEndpointing(300, 1000, 0.5);

      // Every scenario starts from one finished user utterance ending at 100000.
      endpointer.onStartOfSpeech(99000);
      endpointer.onEndOfSpeech(100000);

      // Pick the agent timeline and the start of the next user utterance for this scenario.
      let nextUserStart: number;
      if (agentSpeech === 'ended') {
        endpointer.onStartOfAgentSpeech(100500);
        endpointer.onEndOfAgentSpeech(101000);
        nextUserStart = 101500;
      } else if (agentSpeech === 'active') {
        if (withinGrace) {
          endpointer.onStartOfAgentSpeech(100150);
          nextUserStart = 100350;
        } else if (overlapping && shouldIgnore) {
          endpointer.onStartOfAgentSpeech(100200);
          nextUserStart = 101500;
        } else if (overlapping) {
          endpointer.onStartOfAgentSpeech(100150);
          nextUserStart = 100400;
        } else {
          endpointer.onStartOfAgentSpeech(100900);
          nextUserStart = 101800;
        }
      } else {
        nextUserStart = 100400;
      }

      endpointer.onStartOfSpeech(nextUserStart, overlapping);

      const minBefore = endpointer.minDelay;
      const maxBefore = endpointer.maxDelay;

      endpointer.onEndOfSpeech(nextUserStart + 500, shouldIgnore);

      const minChanged = endpointer.minDelay !== minBefore;
      const maxChanged = endpointer.maxDelay !== maxBefore;

      expect(minChanged, `[${label}] min_delay change`).toBe(expectMinChange);
      expect(maxChanged, `[${label}] max_delay change`).toBe(expectMaxChange);
      expect(privateState(endpointer)._speaking, `[${label}] _speaking should be false`).toBe(
        false,
      );
      expect(
        privateState(endpointer)._overlapping,
        `[${label}] _overlapping should be false`,
      ).toBe(false);
    },
  );

  it('test_full_conversation_sequence', () => {
    const endpointer = new DynamicEndpointing(300, 1000, 0.5);

    // First user utterance.
    endpointer.onStartOfSpeech(100000);
    endpointer.onEndOfSpeech(101000);

    // Agent replies.
    endpointer.onStartOfAgentSpeech(101500);

    // Ignored overlapping user speech well after the agent started: must not move the delays.
    endpointer.onStartOfSpeech(102500, true);
    const minBeforeBackchannel = endpointer.minDelay;
    const maxBeforeBackchannel = endpointer.maxDelay;
    endpointer.onEndOfSpeech(102800, true);

    expect(endpointer.minDelay).toBe(minBeforeBackchannel);
    expect(endpointer.maxDelay).toBe(maxBeforeBackchannel);

    endpointer.onEndOfAgentSpeech(103000);

    // Next regular user utterance.
    endpointer.onStartOfSpeech(103500);
    endpointer.onEndOfSpeech(104000);

    expect(privateState(endpointer)._speaking).toBe(false);
    expect(privateState(endpointer)._agentSpeechStartedAt).toBeUndefined();
  });
});

describe('TestCreateEndpointing', () => {
  it('test_dynamic_mode_wires_alpha', () => {
    const created = createEndpointing({
      mode: 'dynamic',
      minDelay: 300,
      maxDelay: 1000,
      alpha: 0.7,
    });

    expect(created).toBeInstanceOf(DynamicEndpointing);
    expect(privateState(privateState(created)._utterancePause)._alpha).toBeCloseTo(0.7, 5);
    expect(privateState(privateState(created)._turnPause)._alpha).toBeCloseTo(0.7, 5);
  });

  it('test_fixed_mode_returns_base_endpointing', () => {
    const created = createEndpointing({
      mode: 'fixed',
      minDelay: 500,
      maxDelay: 3000,
      alpha: 0.9,
    });

    expect(created).not.toBeInstanceOf(DynamicEndpointing);
    expect(created.minDelay).toBe(500);
    expect(created.maxDelay).toBe(3000);
  });
});

describe('AudioRecognition dynamic endpointing integration', () => {
  it('forwards speech lifecycle to endpointing with explicit timestamps', async () => {
    const endpointing = new DynamicEndpointing(300, 1000, 0.5);
    const recognition = new AudioRecognition({ recognitionHooks: createHooks(), endpointing });

    await recognition.onStartOfSpeech(99000);
    privateState(recognition).speaking = true;
    await recognition.onEndOfSpeech(100000);

    await recognition.onStartOfAgentSpeech(100150);
    await recognition.onStartOfSpeech(100350, 0);
    privateState(recognition).speaking = true;
    await recognition.onEndOfSpeech(100800, undefined, true);

    // One 350 utterance pause smoothed into the initial 300 with alpha 0.5.
    expect(endpointing.minDelay).toBeCloseTo(0.5 * 350 + 0.5 * 300, 5);
  });

  it('updateOptions replaces endpointing state instead of mutating learned history', () => {
    const original = new DynamicEndpointing(300, 1000, 0.5);
    const recognition = new AudioRecognition({
      recognitionHooks: createHooks(),
      endpointing: original,
    });

    // Let the original instance learn a 400 pause so it drifts away from its initial value.
    original.onEndOfSpeech(100000);
    original.onStartOfSpeech(100400);
    original.onEndOfSpeech(100900);
    expect(original.minDelay).toBeGreaterThan(300);

    const replacement = createEndpointing({
      mode: 'dynamic',
      minDelay: 500,
      maxDelay: 2000,
      alpha: 0.2,
    });
    recognition.updateOptions({ endpointing: replacement });

    const installed = privateState(recognition).endpointing as DynamicEndpointing;
    expect(installed).toBe(replacement);
    expect(installed.minDelay).toBe(500);
    expect(installed.maxDelay).toBe(2000);
  });

  it('agent activity updateOptions recreates endpointing state for active recognition', () => {
    const updateSpy = vi.fn();
    // Bare instance: only the members read by `updateOptions` in this scenario are populated.
    const activity = Object.create(AgentActivity.prototype) as {
      updateOptions: AgentActivity['updateOptions'];
      audioRecognition?: { updateOptions: typeof updateSpy };
      turnDetectionMode?: 'vad';
    };
    activity.audioRecognition = { updateOptions: updateSpy };
    activity.turnDetectionMode = 'vad';

    activity.updateOptions({
      endpointing: { mode: 'dynamic', minDelay: 500, maxDelay: 2000, alpha: 0.2 },
    });

    const forwarded = updateSpy.mock.calls[0][0].endpointing as DynamicEndpointing;
    expect(forwarded).toBeInstanceOf(DynamicEndpointing);
    expect(forwarded.minDelay).toBe(500);
    expect(forwarded.maxDelay).toBe(2000);
  });
});

// Patch header of the next file in this diff, kept verbatim:
// diff --git a/agents/src/voice/endpointing.ts b/agents/src/voice/endpointing.ts new file mode 100644 index 000000000..73ac819e4 --- /dev/null +++ b/agents/src/voice/endpointing.ts @@ -0,0 +1,269 @@
// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { ExpFilter } from '../utils.js';
import type { EndpointingOptions } from './turn_config/endpointing.js';

export type { EndpointingOptions } from './turn_config/endpointing.js';

/**
 * Largest distance between the start of agent speech and the start of a user utterance for
 * which an ignorable, overlapping utterance is still processed by
 * {@link DynamicEndpointing.onEndOfSpeech}. Expressed in the same unit as the timestamps handed
 * to the speech callbacks.
 */
const AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD = 250;

// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 10-47 lines
/**
 * Endpointing with fixed delays.
 *
 * Holds a minimum and a maximum delay and remembers whether the current user utterance was
 * reported as overlapping. The speech callbacks only maintain that flag; the agent-speech
 * callbacks do nothing here and exist so subclasses can override them.
 */
export class BaseEndpointing {
  protected _minDelay: number;
  protected _maxDelay: number;
  protected _overlapping = false;

  /**
   * @param minDelay - Initial value returned by {@link BaseEndpointing.minDelay}.
   * @param maxDelay - Initial value returned by {@link BaseEndpointing.maxDelay}.
   */
  constructor(minDelay: number, maxDelay: number) {
    this._minDelay = minDelay;
    this._maxDelay = maxDelay;
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 16-22 lines
  /**
   * Replaces the configured delays. A field that is omitted (or `undefined`) is left unchanged.
   */
  updateOptions(options: { minDelay?: number; maxDelay?: number } = {}): void {
    const { minDelay, maxDelay } = options;
    if (minDelay !== undefined) {
      this._minDelay = minDelay;
    }
    if (maxDelay !== undefined) {
      this._maxDelay = maxDelay;
    }
  }

  /** Current minimum delay. */
  get minDelay(): number {
    return this._minDelay;
  }

  /** Current maximum delay. */
  get maxDelay(): number {
    return this._maxDelay;
  }

  /** Whether the utterance in progress was reported as overlapping. */
  get overlapping(): boolean {
    return this._overlapping;
  }

  /**
   * Records the `overlapping` flag for the utterance that just started.
   *
   * @param startedAt - Start timestamp; unused by the base class.
   * @param overlapping - Overlap flag to store.
   */
  onStartOfSpeech(startedAt: number, overlapping = false): void {
    void startedAt;
    this._overlapping = overlapping;
  }

  /**
   * Clears the overlap flag.
   *
   * @param endedAt - End timestamp; unused by the base class.
   * @param shouldIgnore - Unused by the base class.
   */
  onEndOfSpeech(endedAt: number, shouldIgnore = false): void {
    void endedAt;
    void shouldIgnore;
    this._overlapping = false;
  }

  /** No-op hook invoked when the agent starts speaking. */
  onStartOfAgentSpeech(startedAt: number): void {
    void startedAt;
  }

  /** No-op hook invoked when the agent stops speaking. */
  onEndOfAgentSpeech(endedAt: number): void {
    void endedAt;
  }
}

// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 49-89 lines
/**
 * Endpointing whose delays follow the pauses observed through the speech callbacks.
 *
 * Two exponential filters are kept, both clamped to `[minDelay, maxDelay]`:
 * - `_utterancePause`, seeded with `minDelay`, backs the {@link DynamicEndpointing.minDelay} getter;
 * - `_turnPause`, seeded with `maxDelay`, backs the {@link DynamicEndpointing.maxDelay} getter.
 *
 * {@link DynamicEndpointing.onEndOfSpeech} decides which filter receives a new sample.
 */
export class DynamicEndpointing extends BaseEndpointing {
  private _utterancePause: ExpFilter;
  private _turnPause: ExpFilter;
  private _utteranceStartedAt: number | undefined;
  private _utteranceEndedAt: number | undefined;
  private _agentSpeechStartedAt: number | undefined;
  private _agentSpeechEndedAt: number | undefined;
  private _speaking = false;

  /**
   * @param minDelay - Lower clamp of both filters and seed of the utterance-pause filter.
   * @param maxDelay - Upper clamp of both filters and seed of the turn-pause filter.
   * @param alpha - Smoothing coefficient handed to both filters.
   */
  constructor(minDelay: number, maxDelay: number, alpha = 0.9) {
    super(minDelay, maxDelay);

    // ExpFilter arguments are (alpha, maxVal, minVal, initial).
    this._utterancePause = new ExpFilter(alpha, maxDelay, minDelay, minDelay);
    this._turnPause = new ExpFilter(alpha, maxDelay, minDelay, maxDelay);
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 91-102 lines
  /** Filtered utterance pause, or the configured minimum while the filter has no value. */
  override get minDelay(): number {
    const learned = this._utterancePause.value;
    return learned ?? this._minDelay;
  }

  /**
   * Filtered turn pause (configured maximum while the filter has no value), never reported
   * below the current {@link DynamicEndpointing.minDelay}.
   */
  override get maxDelay(): number {
    const learned = this._turnPause.value;
    return Math.max(learned ?? this._maxDelay, this.minDelay);
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 104-120 lines
  /**
   * Gap from the recorded utterance end to the recorded utterance start, floored at 0.
   * Returns 0 while either timestamp is missing.
   */
  get betweenUtteranceDelay(): number {
    const endedAt = this._utteranceEndedAt;
    const startedAt = this._utteranceStartedAt;
    if (endedAt === undefined || startedAt === undefined) {
      return 0;
    }
    return Math.max(0, startedAt - endedAt);
  }

  /**
   * Gap from the recorded utterance end to the recorded start of agent speech, floored at 0.
   * Returns 0 while either timestamp is missing.
   */
  get betweenTurnDelay(): number {
    const agentStartedAt = this._agentSpeechStartedAt;
    const endedAt = this._utteranceEndedAt;
    if (agentStartedAt === undefined || endedAt === undefined) {
      return 0;
    }
    return Math.max(0, agentStartedAt - endedAt);
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 122-137 lines
  /**
   * Pair `[turnDelay, interruptionDelay]`, where `turnDelay` is
   * {@link DynamicEndpointing.betweenTurnDelay} and `interruptionDelay` is the absolute
   * difference between {@link DynamicEndpointing.betweenUtteranceDelay} and `turnDelay`.
   * Returns `[0, 0]` while the utterance start or the agent speech start is unknown.
   */
  get immediateInterruptionDelay(): [number, number] {
    if (this._utteranceStartedAt === undefined || this._agentSpeechStartedAt === undefined) {
      return [0, 0];
    }
    const turnDelay = this.betweenTurnDelay;
    return [turnDelay, Math.abs(this.betweenUtteranceDelay - turnDelay)];
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 139-153 lines
  /** Records a new agent speech start, forgets any previous end, and clears the overlap flag. */
  override onStartOfAgentSpeech(startedAt: number): void {
    this._overlapping = false;
    this._agentSpeechEndedAt = undefined;
    this._agentSpeechStartedAt = startedAt;
  }

  /**
   * Records the agent speech end when a start is known and no end at or after that start has
   * been stored yet; always clears the overlap flag.
   */
  override onEndOfAgentSpeech(endedAt: number): void {
    const startedAt = this._agentSpeechStartedAt;
    const previousEnd = this._agentSpeechEndedAt;
    const endIsMissingOrStale =
      startedAt !== undefined && (previousEnd === undefined || previousEnd < startedAt);
    if (endIsMissingOrStale) {
      this._agentSpeechEndedAt = endedAt;
    }
    this._overlapping = false;
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 155-178 lines
  /**
   * Marks the start of a user utterance.
   *
   * Ignored entirely while a previous overlapping utterance is still being tracked, so only
   * the first start of a run of overlapping starts is recorded.
   *
   * @param startedAt - Timestamp of the utterance start.
   * @param overlapping - Whether the utterance overlaps agent speech.
   */
  override onStartOfSpeech(startedAt: number, overlapping = false): void {
    if (this._overlapping) {
      return;
    }

    const previousStart = this._utteranceStartedAt;
    const previousEnd = this._utteranceEndedAt;
    const agentStartedAt = this._agentSpeechStartedAt;
    if (
      overlapping &&
      previousStart !== undefined &&
      previousEnd !== undefined &&
      agentStartedAt !== undefined &&
      previousEnd < previousStart
    ) {
      // The stored end predates the stored start, i.e. it is stale: re-anchor it just before
      // the agent started speaking.
      this._utteranceEndedAt = agentStartedAt - 1;
    }

    this._utteranceStartedAt = startedAt;
    this._overlapping = overlapping;
    this._speaking = true;
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 179-286 lines
  /**
   * Marks the end of a user utterance and feeds at most one sample into one of the filters.
   *
   * - An ignorable (`shouldIgnore`) overlapping utterance that did not start within
   *   `AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD` of the agent speech start is discarded: the
   *   utterance timestamps and flags are cleared and no filter is touched.
   * - While overlapping, or while agent speech has started and not ended: the utterance pause is
   *   learned when both values of {@link DynamicEndpointing.immediateInterruptionDelay} are
   *   positive and within the current min/max delays; otherwise a positive turn pause is learned.
   * - Otherwise a positive turn pause is learned; failing that, a positive utterance pause is
   *   learned provided no agent speech is recorded.
   *
   * Afterwards the utterance end is stored and the agent-speech timestamps and flags are reset.
   *
   * @param endedAt - Timestamp of the utterance end.
   * @param shouldIgnore - Whether the utterance may be disregarded (see above).
   */
  override onEndOfSpeech(endedAt: number, shouldIgnore = false): void {
    const userStartedAt = this._utteranceStartedAt;
    const agentStartedAt = this._agentSpeechStartedAt;

    if (shouldIgnore && this._overlapping) {
      const withinGracePeriod =
        userStartedAt !== undefined &&
        agentStartedAt !== undefined &&
        Math.abs(userStartedAt - agentStartedAt) < AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD;
      if (!withinGracePeriod) {
        this._utteranceStartedAt = undefined;
        this._utteranceEndedAt = undefined;
        this._speaking = false;
        this._overlapping = false;
        return;
      }
    }

    const agentStillSpeaking =
      agentStartedAt !== undefined && this._agentSpeechEndedAt === undefined;
    // Both gaps are pure reads of the timestamps, so they can be computed once up front.
    const turnGap = this.betweenTurnDelay;
    const utteranceGap = this.betweenUtteranceDelay;

    if (this._overlapping || agentStillSpeaking) {
      const [turnDelay, interruptionDelay] = this.immediateInterruptionDelay;
      const isImmediateInterruption =
        interruptionDelay > 0 &&
        interruptionDelay <= this.minDelay &&
        turnDelay > 0 &&
        turnDelay <= this.maxDelay &&
        utteranceGap > 0;
      if (isImmediateInterruption) {
        this._utterancePause.apply(1, utteranceGap);
      } else if (turnGap > 0) {
        this._turnPause.apply(1, turnGap);
      }
    } else if (turnGap > 0) {
      this._turnPause.apply(1, turnGap);
    } else if (
      utteranceGap > 0 &&
      this._agentSpeechEndedAt === undefined &&
      agentStartedAt === undefined
    ) {
      this._utterancePause.apply(1, utteranceGap);
    }

    this._utteranceEndedAt = endedAt;
    this._agentSpeechStartedAt = undefined;
    this._agentSpeechEndedAt = undefined;
    this._speaking = false;
    this._overlapping = false;
  }

  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 288-307 lines
  /**
   * Updates the delay bounds and/or the smoothing coefficient. Omitted fields are left alone.
   *
   * - `minDelay` re-seeds the utterance-pause filter to the new minimum and becomes the lower
   *   clamp of both filters.
   * - `maxDelay` re-seeds the turn-pause filter to the new maximum and becomes the upper clamp
   *   of both filters.
   * - `alpha` is applied to both filters without re-seeding them.
   */
  override updateOptions(
    options: { minDelay?: number; maxDelay?: number; alpha?: number } = {},
  ): void {
    const { minDelay, maxDelay, alpha } = options;

    if (minDelay !== undefined) {
      this._minDelay = minDelay;
      this._utterancePause.reset({ initial: minDelay, minVal: minDelay });
      this._turnPause.reset({ minVal: minDelay });
    }

    if (maxDelay !== undefined) {
      this._maxDelay = maxDelay;
      this._turnPause.reset({ initial: maxDelay, maxVal: maxDelay });
      this._utterancePause.reset({ maxVal: maxDelay });
    }

    if (alpha !== undefined) {
      this._utterancePause.reset({ alpha });
      this._turnPause.reset({ alpha });
    }
  }
}

// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 310-322 lines
/**
 * Builds the endpointing strategy selected by `options.mode`.
 *
 * @param options - Endpointing configuration.
 * @returns A {@link DynamicEndpointing} for `'dynamic'`; a {@link BaseEndpointing} for `'fixed'`
 * and for any other value.
 */
export function createEndpointing(options: EndpointingOptions): BaseEndpointing {
  if (options.mode === 'dynamic') {
    return new DynamicEndpointing(options.minDelay, options.maxDelay, options.alpha);
  }
  return new BaseEndpointing(options.minDelay, options.maxDelay);
}

// Remaining patch text carried on this source line (next file of the diff), kept verbatim:
// diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index 808ac88c1..c3791e80a 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -11,6 +11,7 @@ export { } from './agent_session.js'; export * from './avatar/index.js'; export * from './background_audio.js'; +export * from './endpointing.js'; export { type TextInputCallback, type TextInputEvent, diff --git
a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index f2603e00f..a4924647d 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -4,10 +4,11 @@ /** * Configuration for endpointing, which determines when the user's turn is complete. */ +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 47-66 lines export interface EndpointingOptions { /** * Endpointing mode. `"fixed"` uses a fixed delay, `"dynamic"` adjusts delay based on - * end-of-utterance prediction. + * observed speech and turn pauses. * @defaultValue "fixed" */ mode: 'fixed' | 'dynamic'; @@ -24,10 +25,18 @@ export interface EndpointingOptions { * @defaultValue 3000 */ maxDelay: number; + /** + * Exponential moving average coefficient for dynamic endpointing. Higher values give more + * weight to previous pause history. + * @defaultValue 0.9 + */ + alpha: number; } +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 69-74 lines export const defaultEndpointingOptions = { mode: 'fixed', minDelay: 500, maxDelay: 3000, + alpha: 0.9, } as const satisfies EndpointingOptions; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 90010c2c6..f15bd3143 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -46,6 +46,20 @@ describe('migrateLegacyOptions', () => { expect(result.turnHandling.endpointing!.maxDelay).toBe(5000); }); + it('should preserve dynamic endpointing alpha from turnHandling config', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + turnHandling: { + endpointing: { + mode: 'dynamic', + alpha: 0.7, + }, + }, + }); + + expect(result.turnHandling.endpointing!.mode).toBe('dynamic'); + expect(result.turnHandling.endpointing!.alpha).toBe(0.7); + }); + it('should set interruption.enabled to false when allowInterruptions is false', () => { const { 
agentSessionOptions: result } = migrateLegacyOptions({ voiceOptions: { allowInterruptions: false },