From fb6bfe4ebf58a4d2401a791b2197778fb2690de8 Mon Sep 17 00:00:00 2001 From: Rosetta Bot Date: Tue, 28 Apr 2026 15:50:25 +0000 Subject: [PATCH 1/3] feat(voice): add dynamic endpointing --- .changeset/dynamic-endpointing.md | 5 + agents/src/utils.ts | 79 ++- agents/src/voice/agent_activity.ts | 144 ++-- agents/src/voice/audio_recognition.ts | 75 ++- .../voice/audio_recognition_handoff.test.ts | 4 +- .../src/voice/audio_recognition_span.test.ts | 7 +- agents/src/voice/endpointing.test.ts | 618 ++++++++++++++++++ agents/src/voice/endpointing.ts | 317 +++++++++ agents/src/voice/index.ts | 1 + agents/src/voice/turn_config/endpointing.ts | 10 +- agents/src/voice/turn_config/utils.test.ts | 14 + 11 files changed, 1178 insertions(+), 96 deletions(-) create mode 100644 .changeset/dynamic-endpointing.md create mode 100644 agents/src/voice/endpointing.test.ts create mode 100644 agents/src/voice/endpointing.ts diff --git a/.changeset/dynamic-endpointing.md b/.changeset/dynamic-endpointing.md new file mode 100644 index 000000000..186d8f86a --- /dev/null +++ b/.changeset/dynamic-endpointing.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': patch +--- + +feat(voice): add dynamic endpointing for adaptive turn delays diff --git a/agents/src/utils.ts b/agents/src/utils.ts index 82c623a6c..06d46f095 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -352,43 +352,80 @@ export class AsyncIterableQueue implements AsyncIterableIterator { /** @internal */ export class ExpFilter { - #alpha: number; - #max?: number; - #filtered?: number = undefined; + private _alpha: number; + private _filtered?: number = undefined; + private _maxVal?: number = undefined; + private _minVal?: number = undefined; + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 6-20 lines + constructor(alpha: number, maxVal?: number, minVal?: number, initial?: number) { + if (!(0 < alpha && alpha <= 1)) { + throw new Error('alpha must be in (0, 1].'); + } - constructor(alpha: number, max?: number) { 
- this.#alpha = alpha; - this.#max = max; + this._alpha = alpha; + this._filtered = initial; + this._maxVal = maxVal; + this._minVal = minVal; } - reset(alpha?: number) { - if (alpha) { - this.#alpha = alpha; + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 21-37 lines + reset(alpha?: number, initial?: number, minVal?: number, maxVal?: number): void { + if (alpha !== undefined) { + if (!(0 < alpha && alpha <= 1)) { + throw new Error('alpha must be in (0, 1].'); + } + this._alpha = alpha; + } + if (initial !== undefined) { + this._filtered = initial; + } + if (minVal !== undefined) { + this._minVal = minVal; + } + if (maxVal !== undefined) { + this._maxVal = maxVal; } - this.#filtered = undefined; } - apply(exp: number, sample: number): number { - if (this.#filtered) { - const a = this.#alpha ** exp; - this.#filtered = a * this.#filtered + (1 - a) * sample; - } else { - this.#filtered = sample; + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 38-57 lines + apply(exp: number, sample?: number): number { + const sampleValue = sample === undefined ? 
this._filtered : sample; + + if (sampleValue !== undefined && this._filtered === undefined) { + this._filtered = sampleValue; + } else if (sampleValue !== undefined && this._filtered !== undefined) { + const a = this._alpha ** exp; + this._filtered = a * this._filtered + (1 - a) * sampleValue; } - if (this.#max && this.#filtered > this.#max) { - this.#filtered = this.#max; + if (this._filtered === undefined) { + throw new Error('sample or initial value must be given.'); } - return this.#filtered; + if (this._maxVal !== undefined && this._filtered > this._maxVal) { + this._filtered = this._maxVal; + } + + if (this._minVal !== undefined && this._filtered < this._minVal) { + this._filtered = this._minVal; + } + + return this._filtered; + } + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 59-61 lines + get value(): number | undefined { + return this._filtered; } get filtered(): number | undefined { - return this.#filtered; + return this.value; } + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 63-64 lines set alpha(alpha: number) { - this.#alpha = alpha; + this._alpha = alpha; } } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 9b7482373..27e07efc4 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -65,6 +65,7 @@ import { type RecognitionHooks, type STTPipeline, } from './audio_recognition.js'; +import { createEndpointing } from './endpointing.js'; import { AgentSessionEventTypes, createErrorEvent, @@ -88,6 +89,7 @@ import { } from './generation.js'; import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; +import type { EndpointingOptions } from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; export const agentActivityStorage = new AsyncLocalStorage(); @@ -186,6 +188,8 @@ export class AgentActivity implements RecognitionHooks { private _preemptiveGenerationCount = 
0; private interruptionDetector?: AdaptiveInterruptionDetector; private isInterruptionDetectionEnabled: boolean; + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 203-207 lines + private interruptionDetected = false; private isInterruptionByAudioActivityEnabled: boolean; private isDefaultInterruptionByAudioActivityEnabled: boolean; @@ -206,6 +210,8 @@ export class AgentActivity implements RecognitionHooks { this.onError(ev); private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1494-1499 lines + this.interruptionDetected = ev.isInterruption; this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev); }; @@ -477,12 +483,7 @@ export class AgentActivity implements RecognitionHooks { turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, interruptionDetection: this.interruptionDetector, - minEndpointingDelay: - this.agent.turnHandling?.endpointing?.minDelay ?? - this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, - maxEndpointingDelay: - this.agent.turnHandling?.endpointing?.maxDelay ?? - this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + endpointing: createEndpointing(this.endpointingOptions), rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -661,19 +662,17 @@ export class AgentActivity implements RecognitionHooks { return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling; } - // get minEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.minDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay - // ); - // } - - // get maxEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.maxDelay ?? 
- // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay - // ); - // } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 330-339 lines + get endpointingOptions(): EndpointingOptions { + const agentEndpointing = this.agent.turnHandling?.endpointing; + const sessionEndpointing = this.agentSession.sessionOptions.turnHandling.endpointing; + return { + mode: agentEndpointing?.mode ?? sessionEndpointing.mode, + minDelay: agentEndpointing?.minDelay ?? sessionEndpointing.minDelay, + maxDelay: agentEndpointing?.maxDelay ?? sessionEndpointing.maxDelay, + alpha: agentEndpointing?.alpha ?? sessionEndpointing.alpha, + }; + } get toolCtx(): ToolContext { return this.agent.toolCtx; @@ -717,9 +716,11 @@ export class AgentActivity implements RecognitionHooks { updateOptions({ toolChoice, + endpointingOptions, turnDetection, }: { toolChoice?: ToolChoice | null; + endpointingOptions?: EndpointingOptions; turnDetection?: TurnDetectionMode; }): void { if (toolChoice !== undefined) { @@ -743,7 +744,10 @@ export class AgentActivity implements RecognitionHooks { } if (this.audioRecognition) { - this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode }); + this.audioRecognition.updateOptions({ + endpointing: endpointingOptions ? 
createEndpointing(endpointingOptions) : undefined, + turnDetection: this.turnDetectionMode, + }); } } @@ -922,12 +926,9 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1501-1508 lines + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } } @@ -947,8 +948,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1519-1525 lines + this.audioRecognition.onEndOfSpeech(Date.now(), this.agentSession._userSpeakingSpan); } this.agentSession._updateUserState('listening'); } @@ -1030,14 +1032,15 @@ export class AgentActivity implements RecognitionHooks { lastSpeakingTime: speechStartTime, otelContext: otelContext.active(), }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - // Pass speechStartTime as the absolute startedAt timestamp. 
- this.audioRecognition.onStartOfOverlapSpeech( - ev.speechDuration, - speechStartTime, - this.agentSession._userSpeakingSpan, - ); - } + if (this.isInterruptionDetectionEnabled) { + this.interruptionDetected = false; + } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1653-1664 lines + this.audioRecognition?.onStartOfSpeech( + speechStartTime, + ev.speechDuration, + this.agentSession._userSpeakingSpan, + ); } onEndOfSpeech(ev: VADEvent): void { @@ -1046,13 +1049,12 @@ export class AgentActivity implements RecognitionHooks { // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency. speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration; } - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - // Pass speechEndTime as the absolute endedAt timestamp. - this.audioRecognition.onEndOfOverlapSpeech( - speechEndTime, - this.agentSession._userSpeakingSpan, - ); - } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1689-1703 lines + this.audioRecognition?.onEndOfSpeech( + speechEndTime, + this.agentSession._userSpeakingSpan, + this.isInterruptionDetectionEnabled ? 
this.interruptionDetected : undefined, + ); this.agentSession._updateUserState('listening', { lastSpeakingTime: speechEndTime, otelContext: otelContext.active(), @@ -1121,6 +1123,14 @@ export class AgentActivity implements RecognitionHooks { { 'speech id': this._currentSpeech.id }, 'speech interrupted by audio activity', ); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1622-1630 lines + if ( + this.audioRecognition && + !this.audioRecognition.endpointing.overlapping && + this.agentSession.agentState === 'speaking' + ) { + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); + } this.realtimeSession?.interrupt(); this._currentSpeech.interrupt(); } @@ -1839,8 +1849,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2214-2222 lines + this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -1932,7 +1945,8 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2109-2113 lines this.audioRecognition.onEndOfAgentSpeech(Date.now()); } this.restoreInterruptionByAudioActivity(); @@ -2123,8 +2137,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - 
this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2570-2579 lines + this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2286,8 +2303,11 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2338-2342 lines this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { this.restoreInterruptionByAudioActivity(); } } @@ -2329,11 +2349,12 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - { - this.audioRecognition.onEndOfAgentSpeech(Date.now()); - this.restoreInterruptionByAudioActivity(); - } + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2709-2714 lines + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); } } @@ -2539,10 +2560,18 @@ export class AgentActivity implements RecognitionHooks { } const onFirstFrame = (startedSpeakingAt?: number) => { + const agentStartedSpeakingAt = startedSpeakingAt ?? 
Date.now(); this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 3032-3039 lines + this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { + this.isInterruptionByAudioActivityEnabled = false; + } }; const readMessages = async ( @@ -2820,6 +2849,13 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 3213-3219 lines + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); + } } if (toolOutput.output.length === 0) { diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index e77532a5b..c62b346cf 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -35,6 +35,7 @@ import { traceTypes, tracer } from '../telemetry/index.js'; import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; +import type { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; import { setParticipantSpanAttributes } from './utils.js'; @@ -138,10 +139,8 @@ export interface AudioRecognitionOptions { /** Turn detection mode. */ turnDetectionMode?: TurnDetectionMode; interruptionDetection?: AdaptiveInterruptionDetector; - /** Minimum endpointing delay in milliseconds. */ - minEndpointingDelay: number; - /** Maximum endpointing delay in milliseconds. 
*/ - maxEndpointingDelay: number; + /** Endpointing delay state. */ + endpointing: BaseEndpointing; /** Root span context for tracing. */ rootSpanContext?: Context; /** STT model name for tracing */ @@ -170,8 +169,7 @@ export class AudioRecognition { private vad?: VAD; private turnDetector?: _TurnDetector; private turnDetectionMode?: TurnDetectionMode; - private minEndpointingDelay: number; - private maxEndpointingDelay: number; + private _endpointing: BaseEndpointing; private lastLanguage?: LanguageCode; private rootSpanContext?: Context; private sttModel?: string; @@ -224,8 +222,7 @@ export class AudioRecognition { this.vad = opts.vad; this.turnDetector = opts.turnDetector; this.turnDetectionMode = opts.turnDetectionMode; - this.minEndpointingDelay = opts.minEndpointingDelay; - this.maxEndpointingDelay = opts.maxEndpointingDelay; + this._endpointing = opts.endpointing; this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; this.sttModel = opts.sttModel; @@ -275,8 +272,25 @@ export class AudioRecognition { } /** @internal */ - updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void { - this.turnDetectionMode = options.turnDetection; + get endpointing(): BaseEndpointing { + return this._endpointing; + } + + /** @internal */ + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 193-219 lines + updateOptions(options: { + endpointing?: BaseEndpointing; + turnDetection?: TurnDetectionMode | undefined; + minEndpointingDelay?: number; + maxEndpointingDelay?: number; + }): void { + if (options.endpointing !== undefined) { + this._endpointing = options.endpointing; + } + + if (Object.hasOwn(options, 'turnDetection')) { + this.turnDetectionMode = options.turnDetection; + } } async start(options?: { sttPipeline?: STTPipeline }) { @@ -311,12 +325,19 @@ export class AudioRecognition { this.interruptionStreamChannel = undefined; } - async onStartOfAgentSpeech() { + // Ref: python 
livekit-agents/livekit/agents/voice/audio_recognition.py - 239-244 lines + onStartOfAgentSpeech(startedAt: number): void { this.isAgentSpeaking = true; - return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); + this._endpointing.onStartOfAgentSpeech(startedAt); + void this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 246-271 lines async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (this.isAgentSpeaking) { + this._endpointing.onEndOfAgentSpeech(Date.now()); + } + if (!this.isInterruptionEnabled) { this.isAgentSpeaking = false; return; @@ -344,6 +365,31 @@ export class AudioRecognition { this.isAgentSpeaking = false; } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 273-290 lines + onStartOfSpeech(startedAt: number, speechDuration: number = 0, userSpeakingSpan?: Span): void { + this._endpointing.onStartOfSpeech(startedAt, this.isAgentSpeaking); + this.speaking = true; + + if (!this.isInterruptionEnabled || !this.isAgentSpeaking) { + return; + } + + void this.onStartOfOverlapSpeech(speechDuration, startedAt, userSpeakingSpan); + } + + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 292-306 lines + onEndOfSpeech(endedAt: number, userSpeakingSpan?: Span, interruption?: boolean): void { + if (this.speaking) { + this._endpointing.onEndOfSpeech( + endedAt, + interruption !== undefined && !interruption && this.isAgentSpeaking, + ); + } + + this.speaking = false; + void this.onEndOfOverlapSpeech(endedAt, userSpeakingSpan); + } + /** Start interruption inference when agent is speaking and overlap speech starts. 
*/ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { if (this.isAgentSpeaking) { @@ -805,7 +851,8 @@ export class AudioRecognition { speechStartTime: number | undefined, ) => async (controller: AbortController) => { - let endpointingDelay = this.minEndpointingDelay; + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 949-973 lines + let endpointingDelay = this._endpointing.minDelay; const userTurnSpan = this.ensureUserTurnSpan(); const userTurnCtx = this.userTurnContext(userTurnSpan); @@ -831,7 +878,7 @@ export class AudioRecognition { ); if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.maxEndpointingDelay; + endpointingDelay = this._endpointing.maxDelay; } } catch (error) { this.logger.error(error, 'Error predicting end of turn'); diff --git a/agents/src/voice/audio_recognition_handoff.test.ts b/agents/src/voice/audio_recognition_handoff.test.ts index 76311ec12..204f50462 100644 --- a/agents/src/voice/audio_recognition_handoff.test.ts +++ b/agents/src/voice/audio_recognition_handoff.test.ts @@ -7,6 +7,7 @@ import { ChatContext } from '../llm/chat_context.js'; import { initializeLogger } from '../log.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { AudioRecognition, type RecognitionHooks, STTPipeline } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function createHooks() { @@ -45,8 +46,7 @@ function createRecognition(sttNode: STTNode, hooks = createHooks()) { recognition: new AudioRecognition({ recognitionHooks: hooks, stt: sttNode, - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), }), }; } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index cfe92a821..56ae77646 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ 
b/agents/src/voice/audio_recognition_span.test.ts @@ -22,6 +22,7 @@ import { type RecognitionHooks, type _TurnDetector, } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function setupInMemoryTracing() { @@ -145,8 +146,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: undefined, turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'stt', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'deepgram-nova2', sttProvider: 'deepgram', getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }), @@ -254,8 +254,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: new FakeVAD(vadEvents), turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'vad', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'stt-model', sttProvider: 'stt-provider', getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }), diff --git a/agents/src/voice/endpointing.test.ts b/agents/src/voice/endpointing.test.ts new file mode 100644 index 000000000..ff946dbd5 --- /dev/null +++ b/agents/src/voice/endpointing.test.ts @@ -0,0 +1,618 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import { ExpFilter } from '../utils.js'; +import { Agent } from './agent.js'; +import { AgentActivity } from './agent_activity.js'; +import { AudioRecognition, type RecognitionHooks } from './audio_recognition.js'; +import { BaseEndpointing, DynamicEndpointing, createEndpointing } from './endpointing.js'; +import { defaultEndpointingOptions } from './turn_config/endpointing.js'; +import { defaultInterruptionOptions } from './turn_config/interruption.js'; +import { defaultPreemptiveGenerationOptions } from './turn_config/preemptive_generation.js'; + +initializeLogger({ pretty: false, level: 'silent' }); + +type ExpFilterState = { + _alpha: number; + _maxVal?: number; + _minVal?: number; +}; + +type EndpointingState = { + _minDelay: number; + _maxDelay: number; + _overlapping: boolean; + _utterancePause: ExpFilterState; + _turnPause: ExpFilterState; + _utteranceStartedAt?: number; + _utteranceEndedAt?: number; + _agentSpeechStartedAt?: number; + _agentSpeechEndedAt?: number; + _speaking: boolean; +}; + +function state(ep: BaseEndpointing): EndpointingState { + return ep as unknown as EndpointingState; +} + +function createHooks(): RecognitionHooks { + return { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onEndOfTurn: vi.fn(async () => true), + onPreemptiveGeneration: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + }; +} + +function createActivityWithRecognition(endpointing: BaseEndpointing) { + const agent = new Agent({ instructions: 'test' }); + const session = { + sessionOptions: { + turnHandling: { + endpointing: defaultEndpointingOptions, + interruption: defaultInterruptionOptions, + preemptiveGeneration: 
defaultPreemptiveGenerationOptions, + }, + }, + turnDetection: 'vad', + useTtsAlignedTranscript: true, + vad: undefined, + stt: undefined, + llm: undefined, + tts: undefined, + interruptionDetection: undefined, + agentState: 'listening', + _aecWarmupRemaining: 0, + _userSpeakingSpan: undefined, + _updateUserState: vi.fn(), + _updateAgentState: vi.fn(), + emit: vi.fn(), + } as unknown as ConstructorParameters[1]; + const activity = new AgentActivity(agent, session); + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + endpointing, + }); + (activity as unknown as { audioRecognition: AudioRecognition }).audioRecognition = recognition; + return { activity, recognition }; +} + +describe('TestExponentialMovingAverage', () => { + it('test_initialization_with_valid_alpha', () => { + const ema = new ExpFilter(0.5); + expect(ema.value).toBeUndefined(); + + const emaWithInitial = new ExpFilter(0.5, undefined, undefined, 10); + expect(emaWithInitial.value).toBe(10); + + const emaOne = new ExpFilter(1.0); + expect(emaOne.value).toBeUndefined(); + }); + + it('test_initialization_with_invalid_alpha', () => { + expect(() => new ExpFilter(0.0)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(-0.5)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(1.5)).toThrow(/alpha must be in/); + }); + + it('test_update_with_no_initial_value', () => { + const ema = new ExpFilter(0.5); + const result = ema.apply(1, 10); + expect(result).toBe(10); + expect(ema.value).toBe(10); + }); + + it('test_update_with_initial_value', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + const result = ema.apply(1, 20); + expect(result).toBe(15); + expect(ema.value).toBe(15); + }); + + it('test_update_multiple_times', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.apply(1, 20); + ema.apply(1, 20); + expect(ema.value).toBe(17.5); + }); + + it('test_reset', () => { + let ema = new ExpFilter(0.5, undefined, undefined, 10); 
+ expect(ema.value).toBe(10); + ema.reset(); + expect(ema.value).toBe(10); + + ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.reset(undefined, 5); + expect(ema.value).toBe(5); + }); +}); + +describe('TestDynamicEndpointing', () => { + it('test_initialization', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_with_custom_alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.2); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_uses_updated_default_alpha', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(state(ep)._utterancePause._alpha).toBeCloseTo(0.9, 5); + expect(state(ep)._turnPause._alpha).toBeCloseTo(0.9, 5); + }); + + it('test_empty_delays', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.betweenUtteranceDelay).toBe(0); + expect(ep.betweenTurnDelay).toBe(0); + expect(ep.immediateInterruptionDelay).toEqual([0, 0]); + }); + + it('test_on_utterance_ended', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + expect(state(ep)._utteranceEndedAt).toBe(100000); + + ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(99900); + expect(state(ep)._utteranceEndedAt).toBe(99900); + }); + + it('test_on_utterance_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfSpeech(100000); + expect(state(ep)._utteranceStartedAt).toBe(100000); + }); + + it('test_on_agent_speech_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + expect(state(ep)._agentSpeechStartedAt).toBe(100000); + }); + + it('test_between_utterance_delay_calculation', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100500); + expect(ep.betweenUtteranceDelay).toBeCloseTo(500, 5); + }); + + it('test_between_turn_delay_calculation', 
() => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100800); + expect(ep.betweenTurnDelay).toBeCloseTo(800, 5); + }); + + it('test_pause_between_utterances_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400); + ep.onEndOfSpeech(100500, false); + expect(ep.minDelay).toBeCloseTo(0.5 * 400 + 0.5 * initialMin, 5); + }); + + it('test_new_turn_updates_max_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100600); + ep.onStartOfSpeech(101500); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 600 + 0.5 * 1000, 5); + }); + + it('test_interruption_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + expect(state(ep)._agentSpeechStartedAt).not.toBeUndefined(); + ep.onStartOfSpeech(100250, true); + expect(state(ep)._overlapping).toBe(true); + + ep.onEndOfSpeech(100500); + + expect(state(ep)._overlapping).toBe(false); + expect(state(ep)._agentSpeechStartedAt).toBeUndefined(); + expect(ep.minDelay).toBeCloseTo(300, 5); + }); + + it('test_update_options', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ minDelay: 500 }); + expect(ep.minDelay).toBe(500); + expect(state(ep)._minDelay).toBe(500); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ maxDelay: 2000 }); + expect(ep.maxDelay).toBe(2000); + expect(state(ep)._maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(ep.minDelay).toBe(500); + expect(ep.maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions(); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + 
it('test_max_delay_clamped_to_configured_max', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(102000); + ep.onStartOfSpeech(105000); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_max_delay_clamped_to_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100100); + ep.onStartOfSpeech(100500); + expect(ep.maxDelay).toBeGreaterThanOrEqual(state(ep)._minDelay); + }); + + it('test_non_interruption_clears_agent_speech', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + expect(state(ep)._agentSpeechStartedAt).not.toBeUndefined(); + + ep.onStartOfSpeech(102000); + ep.onEndOfSpeech(103000, false); + expect(state(ep)._agentSpeechStartedAt).toBeUndefined(); + }); + + it('test_consecutive_interruptions_only_track_first', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(state(ep)._overlapping).toBe(true); + const prevVal = [ep.minDelay, ep.maxDelay]; + + ep.onStartOfSpeech(100350); + + expect(state(ep)._overlapping).toBe(true); + expect(prevVal).toEqual([ep.minDelay, ep.maxDelay]); + }); + + it('test_delayed_interruption_updates_max_delay_without_crashing', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + it('test_interruption_adjusts_stale_utterance_end_time', () => { + const ep = new DynamicEndpointing(60, 1000, 1.0); + ep.onEndOfSpeech(99000); + ep.onStartOfSpeech(100000); + + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(state(ep)._utteranceEndedAt).toBeCloseTo(100199, 3); + 
expect(ep.minDelay).toBeCloseTo(60, 5); + expect(ep.maxDelay).toBeCloseTo(1000, 5); + }); + + it('test_update_options_preserves_filter_alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 600, maxDelay: 2000 }); + expect(state(ep)._utterancePause._alpha).toBeCloseTo(0.5, 5); + expect(state(ep)._turnPause._alpha).toBeCloseTo(0.5, 5); + }); + + it('test_update_options_updates_alpha_in_place', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100200); + ep.onEndOfSpeech(101000); + const learnedMin = ep.minDelay; + + ep.updateOptions({ alpha: 0.2 }); + + expect(state(ep)._utterancePause._alpha).toBeCloseTo(0.2, 5); + expect(state(ep)._turnPause._alpha).toBeCloseTo(0.2, 5); + expect(ep.minDelay).toBeCloseTo(learnedMin, 5); + }); + + it('test_update_options_updates_filter_clamp_bounds', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(state(ep)._utterancePause._minVal).toBe(500); + expect(state(ep)._turnPause._maxVal).toBe(2000); + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100200); + expect(ep.minDelay).toBeCloseTo(500, 5); + + ep.onEndOfSpeech(101000); + ep.onStartOfAgentSpeech(102800); + ep.onStartOfSpeech(103500); + expect(ep.maxDelay).toBeGreaterThan(1000); + expect(ep.maxDelay).toBeLessThanOrEqual(2000); + }); + + it('test_should_ignore_skips_filter_update', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101500, true); + + const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + + ep.onEndOfSpeech(101800, true); + + expect(ep.minDelay).toBe(prevMin); + expect(ep.maxDelay).toBe(prevMax); + expect(state(ep)._utteranceStartedAt).toBeUndefined(); + expect(state(ep)._utteranceEndedAt).toBeUndefined(); + expect(state(ep)._overlapping).toBe(false); + 
expect(state(ep)._speaking).toBe(false); + }); + + it('test_should_ignore_without_overlapping_still_updates', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400, false); + ep.onEndOfSpeech(100600, true); + + expect(ep.minDelay).toBeCloseTo(0.5 * 400 + 0.5 * initialMin, 5); + }); + + it('test_should_ignore_grace_period_overrides', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(100600, true); + + ep.onEndOfSpeech(100800, true); + + expect(state(ep)._utteranceEndedAt).toBe(100800); + expect(state(ep)._speaking).toBe(false); + }); + + it('test_should_ignore_outside_grace_period', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101000, true); + + const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + ep.onEndOfSpeech(101500, true); + + expect(ep.minDelay).toBe(prevMin); + expect(ep.maxDelay).toBe(prevMax); + expect(state(ep)._utteranceStartedAt).toBeUndefined(); + expect(state(ep)._utteranceEndedAt).toBeUndefined(); + }); + + it('test_on_end_of_agent_speech_clears_state', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + ep.onStartOfSpeech(100100, true); + expect(state(ep)._overlapping).toBe(true); + expect(state(ep)._agentSpeechStartedAt).toBe(100000); + + ep.onEndOfAgentSpeech(101000); + + expect(state(ep)._agentSpeechEndedAt).toBe(101000); + expect(state(ep)._agentSpeechStartedAt).toBe(100000); + expect(state(ep)._overlapping).toBe(false); + }); + + it('test_overlapping_inferred_from_agent_speech', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800, false); + ep.onEndOfSpeech(102000); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 
+ 0.5 * 1000, 5); + }); + + it('test_speaking_flag_set_and_cleared', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(state(ep)._speaking).toBe(false); + ep.onStartOfSpeech(100000); + expect(state(ep)._speaking).toBe(true); + ep.onEndOfSpeech(100500); + expect(state(ep)._speaking).toBe(false); + }); + + it.each([ + ['no_agent/no_overlap/no_ignore', 'none', false, false, false, true, false], + ['no_agent/no_overlap/ignore', 'none', false, true, false, true, false], + ['agent_ended/no_overlap/no_ignore', 'ended', false, false, false, false, true], + ['agent_ended/no_overlap/ignore', 'ended', false, true, false, false, true], + ['agent_active/no_overlap/no_ignore', 'active', false, false, false, false, true], + ['agent_active/no_overlap/ignore', 'active', false, true, false, false, true], + ['agent_active/overlap/no_ignore', 'active', true, false, false, true, false], + ['agent_active/overlap/ignore/outside_grace', 'active', true, true, false, false, false], + ['agent_active/overlap/ignore/inside_grace', 'active', true, true, true, true, false], + ] as const)( + 'test_all_overlapping_and_should_ignore_combos (%s)', + ( + label, + agentSpeech, + overlapping, + shouldIgnore, + withinGrace, + expectMinChange, + expectMaxChange, + ) => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(99000); + ep.onEndOfSpeech(100000); + + let userStart: number; + if (agentSpeech === 'ended') { + ep.onStartOfAgentSpeech(100500); + ep.onEndOfAgentSpeech(101000); + userStart = 101500; + } else if (agentSpeech === 'active') { + if (withinGrace) { + ep.onStartOfAgentSpeech(100150); + userStart = 100350; + } else if (overlapping && shouldIgnore) { + ep.onStartOfAgentSpeech(100200); + userStart = 101500; + } else if (overlapping) { + ep.onStartOfAgentSpeech(100150); + userStart = 100400; + } else { + ep.onStartOfAgentSpeech(100900); + userStart = 101800; + } + } else { + userStart = 100400; + } + + ep.onStartOfSpeech(userStart, overlapping); + + 
const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + + ep.onEndOfSpeech(userStart + 500, shouldIgnore); + + expect(ep.minDelay !== prevMin, label).toBe(expectMinChange); + expect(ep.maxDelay !== prevMax, label).toBe(expectMaxChange); + expect(state(ep)._speaking, label).toBe(false); + expect(state(ep)._overlapping, label).toBe(false); + }, + ); + + it('test_full_conversation_sequence', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(100000); + ep.onEndOfSpeech(101000); + + ep.onStartOfAgentSpeech(101500); + + ep.onStartOfSpeech(102500, true); + const minBeforeBackchannel = ep.minDelay; + const maxBeforeBackchannel = ep.maxDelay; + ep.onEndOfSpeech(102800, true); + + expect(ep.minDelay).toBe(minBeforeBackchannel); + expect(ep.maxDelay).toBe(maxBeforeBackchannel); + + ep.onEndOfAgentSpeech(103000); + + ep.onStartOfSpeech(103500); + ep.onEndOfSpeech(104000); + + expect(state(ep)._speaking).toBe(false); + expect(state(ep)._agentSpeechStartedAt).toBeUndefined(); + }); +}); + +describe('TestCreateEndpointing', () => { + it('test_dynamic_mode_wires_alpha', () => { + const ep = createEndpointing({ mode: 'dynamic', minDelay: 300, maxDelay: 1000, alpha: 0.7 }); + + expect(ep).toBeInstanceOf(DynamicEndpointing); + expect(state(ep)._utterancePause._alpha).toBeCloseTo(0.7, 5); + expect(state(ep)._turnPause._alpha).toBeCloseTo(0.7, 5); + }); + + it('test_fixed_mode_returns_base_endpointing', () => { + const ep = createEndpointing({ mode: 'fixed', minDelay: 500, maxDelay: 3000, alpha: 0.9 }); + + expect(ep).not.toBeInstanceOf(DynamicEndpointing); + expect(ep.minDelay).toBe(500); + expect(ep.maxDelay).toBe(3000); + }); +}); + +describe('target runtime endpointing integration', () => { + it('AudioRecognition.updateOptions replaces endpointing without mutating old learned state', async () => { + const first = new DynamicEndpointing(300, 1000, 0.5); + first.onEndOfSpeech(100000); + first.onStartOfSpeech(100600); + 
first.onEndOfSpeech(101000); + const learnedMin = first.minDelay; + + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + endpointing: first, + }); + const replacement = createEndpointing({ + mode: 'dynamic', + minDelay: 500, + maxDelay: 2000, + alpha: 0.2, + }); + + try { + recognition.updateOptions({ endpointing: replacement, turnDetection: undefined }); + expect(recognition.endpointing).toBe(replacement); + expect(first.minDelay).toBeCloseTo(learnedMin, 5); + } finally { + await recognition.close(); + } + }); + + it('AgentActivity.updateOptions replaces live AudioRecognition endpointing', async () => { + const { activity, recognition } = createActivityWithRecognition(new BaseEndpointing(300, 1000)); + + try { + activity.updateOptions({ + endpointingOptions: { mode: 'dynamic', minDelay: 400, maxDelay: 1500, alpha: 0.6 }, + turnDetection: 'vad', + }); + + expect(recognition.endpointing).toBeInstanceOf(DynamicEndpointing); + expect(recognition.endpointing.minDelay).toBe(400); + expect(state(recognition.endpointing)._utterancePause._alpha).toBeCloseTo(0.6, 5); + } finally { + await recognition.close(); + } + }); + + it('AgentActivity realtime input speech path updates endpointing without VAD', async () => { + const endpointing = new DynamicEndpointing(300, 1000, 0.5); + const { activity, recognition } = createActivityWithRecognition(endpointing); + + try { + activity.onInputSpeechStarted({} as Parameters[0]); + expect(state(recognition.endpointing)._speaking).toBe(true); + + activity.onInputSpeechStopped({ + userTranscriptionEnabled: false, + } as Parameters[0]); + + expect(state(recognition.endpointing)._speaking).toBe(false); + expect(state(recognition.endpointing)._utteranceEndedAt).not.toBeUndefined(); + } finally { + await recognition.close(); + } + }); +}); diff --git a/agents/src/voice/endpointing.ts b/agents/src/voice/endpointing.ts new file mode 100644 index 000000000..1a70bcb49 --- /dev/null +++ b/agents/src/voice/endpointing.ts 
@@ -0,0 +1,317 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { log } from '../log.js'; +import { ExpFilter } from '../utils.js'; +import type { EndpointingOptions } from './turn_config/endpointing.js'; + +const logger = log(); + +// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 7-7 lines +const AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD = 250; + +// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 10-47 lines +export class BaseEndpointing { + protected _minDelay: number; + protected _maxDelay: number; + protected _overlapping: boolean; + + constructor(minDelay: number, maxDelay: number) { + this._minDelay = minDelay; + this._maxDelay = maxDelay; + this._overlapping = false; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 16-22 lines + updateOptions({ minDelay, maxDelay }: { minDelay?: number; maxDelay?: number } = {}): void { + if (minDelay !== undefined) { + this._minDelay = minDelay; + } + if (maxDelay !== undefined) { + this._maxDelay = maxDelay; + } + } + + get minDelay(): number { + return this._minDelay; + } + + get maxDelay(): number { + return this._maxDelay; + } + + get overlapping(): boolean { + return this._overlapping; + } + + onStartOfSpeech(startedAt: number, overlapping: boolean = false): void { + void startedAt; + this._overlapping = overlapping; + } + + onEndOfSpeech(endedAt: number, shouldIgnore: boolean = false): void { + void endedAt; + void shouldIgnore; + this._overlapping = false; + } + + onStartOfAgentSpeech(startedAt: number): void { + void startedAt; + } + + onEndOfAgentSpeech(endedAt: number): void { + void endedAt; + } +} + +// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 49-89 lines +export class DynamicEndpointing extends BaseEndpointing { + private _utterancePause: ExpFilter; + private _turnPause: ExpFilter; + private _utteranceStartedAt?: number; + private _utteranceEndedAt?: number; + private 
_agentSpeechStartedAt?: number; + private _agentSpeechEndedAt?: number; + private _speaking: boolean; + + constructor(minDelay: number, maxDelay: number, alpha: number = 0.9) { + super(minDelay, maxDelay); + + this._utterancePause = new ExpFilter(alpha, maxDelay, minDelay, minDelay); + this._turnPause = new ExpFilter(alpha, maxDelay, minDelay, maxDelay); + + this._utteranceStartedAt = undefined; + this._utteranceEndedAt = undefined; + this._agentSpeechStartedAt = undefined; + this._agentSpeechEndedAt = undefined; + this._speaking = false; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 91-102 lines + override get minDelay(): number { + return this._utterancePause.value ?? this._minDelay; + } + + override get maxDelay(): number { + const turnVal = this._turnPause.value ?? this._maxDelay; + return Math.max(turnVal, this.minDelay); + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 104-120 lines + get betweenUtteranceDelay(): number { + if (this._utteranceEndedAt === undefined) { + return 0; + } + if (this._utteranceStartedAt === undefined) { + return 0; + } + + return Math.max(0, this._utteranceStartedAt - this._utteranceEndedAt); + } + + get betweenTurnDelay(): number { + if (this._agentSpeechStartedAt === undefined) { + return 0; + } + if (this._utteranceEndedAt === undefined) { + return 0; + } + + return Math.max(0, this._agentSpeechStartedAt - this._utteranceEndedAt); + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 122-137 lines + get immediateInterruptionDelay(): [number, number] { + if (this._utteranceStartedAt === undefined) { + return [0, 0]; + } + if (this._agentSpeechStartedAt === undefined) { + return [0, 0]; + } + + return [this.betweenTurnDelay, Math.abs(this.betweenUtteranceDelay - this.betweenTurnDelay)]; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 139-153 lines + override onStartOfAgentSpeech(startedAt: number): void { + 
this._agentSpeechStartedAt = startedAt; + this._agentSpeechEndedAt = undefined; + this._overlapping = false; + } + + override onEndOfAgentSpeech(endedAt: number): void { + if ( + this._agentSpeechStartedAt !== undefined && + (this._agentSpeechEndedAt === undefined || + this._agentSpeechEndedAt < this._agentSpeechStartedAt) + ) { + this._agentSpeechEndedAt = endedAt; + } + this._overlapping = false; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 155-178 lines + override onStartOfSpeech(startedAt: number, overlapping: boolean = false): void { + if (this._overlapping) { + return; + } + + if ( + this._utteranceStartedAt !== undefined && + this._utteranceEndedAt !== undefined && + this._agentSpeechStartedAt !== undefined && + this._utteranceEndedAt < this._utteranceStartedAt && + overlapping + ) { + this._utteranceEndedAt = this._agentSpeechStartedAt - 1; + logger.trace({ utteranceEndedAt: this._utteranceEndedAt }, 'utterance ended at adjusted'); + } + + this._utteranceStartedAt = startedAt; + this._overlapping = overlapping; + this._speaking = true; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 179-287 lines + override onEndOfSpeech(endedAt: number, shouldIgnore: boolean = false): void { + if (shouldIgnore && this._overlapping) { + if ( + this._utteranceStartedAt !== undefined && + this._agentSpeechStartedAt !== undefined && + Math.abs(this._utteranceStartedAt - this._agentSpeechStartedAt) < + AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD + ) { + logger.trace( + { + delay: Math.abs(this._utteranceStartedAt - this._agentSpeechStartedAt), + gracePeriod: AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD, + }, + 'ignoring shouldIgnore=true within agent speech leading silence grace period', + ); + } else { + this._overlapping = false; + this._speaking = false; + this._utteranceStartedAt = undefined; + this._utteranceEndedAt = undefined; + return; + } + } + + if ( + this._overlapping || + (this._agentSpeechStartedAt !== 
undefined && this._agentSpeechEndedAt === undefined) + ) { + const [turnDelay, interruptionDelay] = this.immediateInterruptionDelay; + let pause = this.betweenUtteranceDelay; + if ( + 0 < interruptionDelay && + interruptionDelay <= this.minDelay && + 0 < turnDelay && + turnDelay <= this.maxDelay && + pause > 0 + ) { + const prevVal = this.minDelay; + this._utterancePause.apply(1, pause); + logger.debug( + { + previous: prevVal, + minDelay: this.minDelay, + maxDelay: this.maxDelay, + pause, + interruptionDelay, + turnDelay, + reason: 'immediate interruption', + }, + 'min endpointing delay updated', + ); + } else { + pause = this.betweenTurnDelay; + if (pause > 0) { + const prevVal = this.maxDelay; + this._turnPause.apply(1, pause); + logger.debug( + { + previous: prevVal, + minDelay: this.minDelay, + maxDelay: this.maxDelay, + pause, + betweenUtteranceDelay: this.betweenUtteranceDelay, + betweenTurnDelay: this.betweenTurnDelay, + reason: 'new turn (interruption)', + }, + 'max endpointing delay updated', + ); + } + } + } else { + let pause = this.betweenTurnDelay; + if (pause > 0) { + const prevVal = this.maxDelay; + this._turnPause.apply(1, pause); + logger.debug( + { previous: prevVal, minDelay: this.minDelay, maxDelay: this.maxDelay, pause }, + 'max endpointing delay updated due to pause', + ); + } else { + pause = this.betweenUtteranceDelay; + if ( + pause > 0 && + this._agentSpeechEndedAt === undefined && + this._agentSpeechStartedAt === undefined + ) { + const prevVal = this.minDelay; + this._utterancePause.apply(1, pause); + logger.debug( + { previous: prevVal, minDelay: this.minDelay, maxDelay: this.maxDelay, pause }, + 'min endpointing delay updated', + ); + } + } + } + + this._utteranceEndedAt = endedAt; + this._agentSpeechStartedAt = undefined; + this._agentSpeechEndedAt = undefined; + this._speaking = false; + this._overlapping = false; + } + + // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 288-307 lines + override updateOptions({ + 
minDelay, + maxDelay, + alpha, + }: { + minDelay?: number; + maxDelay?: number; + alpha?: number; + } = {}): void { + if (minDelay !== undefined) { + this._minDelay = minDelay; + this._utterancePause.reset(undefined, this._minDelay, this._minDelay); + this._turnPause.reset(undefined, undefined, this._minDelay); + } + + if (maxDelay !== undefined) { + this._maxDelay = maxDelay; + this._turnPause.reset(undefined, this._maxDelay, undefined, this._maxDelay); + this._utterancePause.reset(undefined, undefined, undefined, this._maxDelay); + } + + if (alpha !== undefined) { + this._utterancePause.reset(alpha); + this._turnPause.reset(alpha); + } + } +} + +// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 310-322 lines +export function createEndpointing(options: EndpointingOptions): BaseEndpointing { + switch (options.mode) { + case 'dynamic': + return new DynamicEndpointing(options.minDelay, options.maxDelay, options.alpha); + case 'fixed': + default: + return new BaseEndpointing(options.minDelay, options.maxDelay); + } +} diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index b9b3a62e7..03f4d0863 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -22,6 +22,7 @@ export { RoomSessionTransport, } from './remote_session.js'; export * from './events.js'; +export * from './endpointing.js'; export { type TimedString } from './io.js'; export * from './report.js'; export * from './room_io/index.js'; diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index f2603e00f..455fbd906 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -4,10 +4,11 @@ /** * Configuration for endpointing, which determines when the user's turn is complete. */ +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 47-66 lines export interface EndpointingOptions { /** * Endpointing mode. 
`"fixed"` uses a fixed delay, `"dynamic"` adjusts delay based on - * end-of-utterance prediction. + * speech activity. * @defaultValue "fixed" */ mode: 'fixed' | 'dynamic'; @@ -24,10 +25,17 @@ export interface EndpointingOptions { * @defaultValue 3000 */ maxDelay: number; + /** + * Exponential moving average coefficient for dynamic endpointing. + * @defaultValue 0.9 + */ + alpha: number; } +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 69-74 lines export const defaultEndpointingOptions = { mode: 'fixed', minDelay: 500, maxDelay: 3000, + alpha: 0.9, } as const satisfies EndpointingOptions; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 90010c2c6..f15bd3143 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -46,6 +46,20 @@ describe('migrateLegacyOptions', () => { expect(result.turnHandling.endpointing!.maxDelay).toBe(5000); }); + it('should preserve dynamic endpointing alpha from turnHandling config', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + turnHandling: { + endpointing: { + mode: 'dynamic', + alpha: 0.7, + }, + }, + }); + + expect(result.turnHandling.endpointing!.mode).toBe('dynamic'); + expect(result.turnHandling.endpointing!.alpha).toBe(0.7); + }); + it('should set interruption.enabled to false when allowInterruptions is false', () => { const { agentSessionOptions: result } = migrateLegacyOptions({ voiceOptions: { allowInterruptions: false }, From 58915aa40e4d5247866d3d9ba2da40f3cfba2da6 Mon Sep 17 00:00:00 2001 From: Rosetta Bot Date: Thu, 30 Apr 2026 09:23:31 +0000 Subject: [PATCH 2/3] feat(voice): add dynamic endpointing --- .changeset/dynamic-endpointing-node-port.md | 5 + agents/src/utils.ts | 98 ++- agents/src/voice/agent_activity.ts | 117 ++-- agents/src/voice/audio_recognition.ts | 67 ++- .../voice/audio_recognition_handoff.test.ts | 4 +- .../src/voice/audio_recognition_span.test.ts | 7 +- 
agents/src/voice/endpointing.test.ts | 566 ++++++++++++++++++ agents/src/voice/endpointing.ts | 269 +++++++++ agents/src/voice/index.ts | 1 + agents/src/voice/turn_config/endpointing.ts | 11 +- 10 files changed, 1051 insertions(+), 94 deletions(-) create mode 100644 .changeset/dynamic-endpointing-node-port.md create mode 100644 agents/src/voice/endpointing.test.ts create mode 100644 agents/src/voice/endpointing.ts diff --git a/.changeset/dynamic-endpointing-node-port.md b/.changeset/dynamic-endpointing-node-port.md new file mode 100644 index 000000000..58278fb9e --- /dev/null +++ b/.changeset/dynamic-endpointing-node-port.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": patch +--- + +feat: Add dynamic endpointing for voice turn handling diff --git a/agents/src/utils.ts b/agents/src/utils.ts index 82c623a6c..3bbd6d223 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -352,43 +352,97 @@ export class AsyncIterableQueue implements AsyncIterableIterator { /** @internal */ export class ExpFilter { - #alpha: number; - #max?: number; - #filtered?: number = undefined; + private _alpha: number; + private _filtered: number | undefined; + private _maxVal: number | undefined; + private _minVal: number | undefined; + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 5-20 lines + constructor(alpha: number, maxVal?: number, minVal?: number, initial?: number) { + if (!(alpha > 0 && alpha <= 1)) { + throw new Error('alpha must be in (0, 1].'); + } - constructor(alpha: number, max?: number) { - this.#alpha = alpha; - this.#max = max; + this._alpha = alpha; + this._filtered = initial; + this._maxVal = maxVal; + this._minVal = minVal; + } + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 21-37 lines + reset({ + alpha, + initial, + minVal, + maxVal, + }: { + alpha?: number; + initial?: number; + minVal?: number; + maxVal?: number; + } = {}): void { + if (alpha !== undefined) { + if (!(alpha > 0 && alpha <= 1)) { + throw new Error('alpha 
must be in (0, 1].'); + } + this._alpha = alpha; + } + if (initial !== undefined) { + this._filtered = initial; + } + if (minVal !== undefined) { + this._minVal = minVal; + } + if (maxVal !== undefined) { + this._maxVal = maxVal; + } } - reset(alpha?: number) { - if (alpha) { - this.#alpha = alpha; + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 38-57 lines + apply(exp: number, sample?: number): number { + sample ??= this._filtered; + + if (sample !== undefined && this._filtered === undefined) { + this._filtered = sample; + } else if (sample !== undefined && this._filtered !== undefined) { + const a = this._alpha ** exp; + this._filtered = a * this._filtered + (1 - a) * sample; } - this.#filtered = undefined; - } - apply(exp: number, sample: number): number { - if (this.#filtered) { - const a = this.#alpha ** exp; - this.#filtered = a * this.#filtered + (1 - a) * sample; - } else { - this.#filtered = sample; + if (this._filtered === undefined) { + throw new Error('sample or initial value must be given.'); + } + + if (this._maxVal !== undefined && this._filtered > this._maxVal) { + this._filtered = this._maxVal; } - if (this.#max && this.#filtered > this.#max) { - this.#filtered = this.#max; + if (this._minVal !== undefined && this._filtered < this._minVal) { + this._filtered = this._minVal; } - return this.#filtered; + return this._filtered; + } + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 59-61 lines + get value(): number | undefined { + return this._filtered; } get filtered(): number | undefined { - return this.#filtered; + return this.value; } set alpha(alpha: number) { - this.#alpha = alpha; + if (!(alpha > 0 && alpha <= 1)) { + throw new Error('alpha must be in (0, 1].'); + } + this._alpha = alpha; + } + + // Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 63-64 lines + updateBase(alpha: number): void { + this._alpha = alpha; } } diff --git a/agents/src/voice/agent_activity.ts 
b/agents/src/voice/agent_activity.ts index ab329d7f0..c59a08983 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -65,6 +65,7 @@ import { type RecognitionHooks, type STTPipeline, } from './audio_recognition.js'; +import { createEndpointing } from './endpointing.js'; import type { AgentState } from './events.js'; import { AgentSessionEventTypes, @@ -90,6 +91,8 @@ import { } from './generation.js'; import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; +import type { EndpointingOptions } from './turn_config/endpointing.js'; +import { stripUndefined } from './turn_config/utils.js'; import { setParticipantSpanAttributes } from './utils.js'; export const agentActivityStorage = new AsyncLocalStorage(); @@ -195,6 +198,7 @@ export class AgentActivity implements RecognitionHooks { private isInterruptionDetectionEnabled: boolean; private isInterruptionByAudioActivityEnabled: boolean; private isDefaultInterruptionByAudioActivityEnabled: boolean; + private interruptionDetected = false; // for false interruption handling private pausedSpeech?: PausedSpeechInfo; @@ -218,6 +222,8 @@ export class AgentActivity implements RecognitionHooks { this.onError(ev); private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1503-1508 lines + this.interruptionDetected = ev.isInterruption; this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev); }; @@ -489,12 +495,8 @@ export class AgentActivity implements RecognitionHooks { turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, interruptionDetection: this.interruptionDetector, - minEndpointingDelay: - this.agent.turnHandling?.endpointing?.minDelay ?? 
- this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, - maxEndpointingDelay: - this.agent.turnHandling?.endpointing?.maxDelay ?? - this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 779-789 lines + endpointing: createEndpointing(this.endpointingOptions), rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -673,19 +675,13 @@ export class AgentActivity implements RecognitionHooks { return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling; } - // get minEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.minDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay - // ); - // } - - // get maxEndpointingDelay(): number { - // return ( - // this.agent.turnHandling?.endpointing?.maxDelay ?? - // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay - // ); - // } + private get endpointingOptions(): EndpointingOptions { + const agentEndpointing = this.agent.turnHandling?.endpointing; + return { + ...this.agentSession.sessionOptions.turnHandling.endpointing, + ...(agentEndpointing ? 
stripUndefined(agentEndpointing) : {}), + }; + } get toolCtx(): ToolContext { return this.agent.toolCtx; @@ -730,9 +726,11 @@ export class AgentActivity implements RecognitionHooks { updateOptions({ toolChoice, turnDetection, + endpointing, }: { toolChoice?: ToolChoice | null; turnDetection?: TurnDetectionMode; + endpointing?: EndpointingOptions; }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; @@ -755,7 +753,11 @@ export class AgentActivity implements RecognitionHooks { } if (this.audioRecognition) { - this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode }); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 487-493 lines + this.audioRecognition.updateOptions({ + endpointing: endpointing !== undefined ? createEndpointing(endpointing) : undefined, + turnDetection: this.turnDetectionMode, + }); } } @@ -934,12 +936,9 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1510-1517 lines + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } } @@ -959,8 +958,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1528-1535 lines + this.audioRecognition.onEndOfSpeech(Date.now(), this.agentSession._userSpeakingSpan); } this.agentSession._updateUserState('listening'); } @@ -1042,14 
+1042,17 @@ export class AgentActivity implements RecognitionHooks { lastSpeakingTime: speechStartTime, otelContext: otelContext.active(), }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { // Pass speechStartTime as the absolute startedAt timestamp. - this.audioRecognition.onStartOfOverlapSpeech( - ev.speechDuration, + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1662-1673 lines + this.audioRecognition.onStartOfSpeech( speechStartTime, + ev.speechDuration, this.agentSession._userSpeakingSpan, ); } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1674-1676 lines + this.interruptionDetected = false; if (this.falseInterruptionTimer) { // cancel the timer when user starts speaking but leave the paused state unchanged @@ -1080,11 +1083,13 @@ export class AgentActivity implements RecognitionHooks { // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency. speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration; } - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { // Pass speechEndTime as the absolute endedAt timestamp. - this.audioRecognition.onEndOfOverlapSpeech( + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1698-1712 lines + this.audioRecognition.onEndOfSpeech( speechEndTime, this.agentSession._userSpeakingSpan, + this.isInterruptionDetectionEnabled ? 
this.interruptionDetected : undefined, ); } this.agentSession._updateUserState('listening', { @@ -1167,15 +1172,12 @@ export class AgentActivity implements RecognitionHooks { const audioOutput = this.agentSession.output.audio; if ( - this.isInterruptionDetectionEnabled && this.audioRecognition && + !this.audioRecognition.endpointingOverlapping && this.agentSession.agentState === 'speaking' ) { - this.audioRecognition.onStartOfOverlapSpeech( - 0, - Date.now(), - this.agentSession._userSpeakingSpan, - ); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1631-1639 lines + this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan); } this.updatePausedSpeech(this._currentSpeech, timeout); @@ -1943,8 +1945,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2223-2231 lines + this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2036,10 +2041,12 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); } - this.restoreInterruptionByAudioActivity(); + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); + } } } @@ -2227,8 +2234,11 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if 
(this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2579-2588 lines + this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; } }; @@ -2390,8 +2400,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { this.restoreInterruptionByAudioActivity(); } } @@ -2433,11 +2445,11 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - { - this.audioRecognition.onEndOfAgentSpeech(Date.now()); - this.restoreInterruptionByAudioActivity(); - } + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); } } @@ -3390,7 +3402,8 @@ export class AgentActivity implements RecognitionHooks { otelContext: this.pausedSpeech.handle._agentTurnContext, }); if (this.audioRecognition && this.pausedSpeech.agentState === 'speaking') { - this.audioRecognition.onStartOfAgentSpeech(); + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 3479-3486 lines + this.audioRecognition.onStartOfAgentSpeech(Date.now()); } if (this.isInterruptionDetectionEnabled) { this.isInterruptionByAudioActivityEnabled = false; diff --git a/agents/src/voice/audio_recognition.ts 
b/agents/src/voice/audio_recognition.ts index b1e97c3e6..f4e07b787 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -35,6 +35,7 @@ import { traceTypes, tracer } from '../telemetry/index.js'; import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; +import type { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; import { setParticipantSpanAttributes } from './utils.js'; @@ -138,10 +139,8 @@ export interface AudioRecognitionOptions { /** Turn detection mode. */ turnDetectionMode?: TurnDetectionMode; interruptionDetection?: AdaptiveInterruptionDetector; - /** Minimum endpointing delay in milliseconds. */ - minEndpointingDelay: number; - /** Maximum endpointing delay in milliseconds. */ - maxEndpointingDelay: number; + /** Endpointing state used to select the end-of-turn delay. */ + endpointing: BaseEndpointing; /** Root span context for tracing. 
*/ rootSpanContext?: Context; /** STT model name for tracing */ @@ -170,8 +169,7 @@ export class AudioRecognition { private vad?: VAD; private turnDetector?: _TurnDetector; private turnDetectionMode?: TurnDetectionMode; - private minEndpointingDelay: number; - private maxEndpointingDelay: number; + private endpointing: BaseEndpointing; private lastLanguage?: LanguageCode; private rootSpanContext?: Context; private sttModel?: string; @@ -228,8 +226,7 @@ export class AudioRecognition { this.vad = opts.vad; this.turnDetector = opts.turnDetector; this.turnDetectionMode = opts.turnDetectionMode; - this.minEndpointingDelay = opts.minEndpointingDelay; - this.maxEndpointingDelay = opts.maxEndpointingDelay; + this.endpointing = opts.endpointing; this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; this.sttModel = opts.sttModel; @@ -278,9 +275,24 @@ export class AudioRecognition { return this._inputStartedAt; } + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1631-1639 lines + get endpointingOverlapping(): boolean { + return this.endpointing.overlapping; + } + /** @internal */ - updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void { - this.turnDetectionMode = options.turnDetection; + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 193-219 lines + updateOptions(options: { + endpointing?: BaseEndpointing; + turnDetection?: TurnDetectionMode | undefined; + }): void { + if (options.endpointing !== undefined) { + this.endpointing = options.endpointing; + } + + if (Object.hasOwn(options, 'turnDetection')) { + this.turnDetectionMode = options.turnDetection; + } } async start(options?: { sttPipeline?: STTPipeline }) { @@ -315,12 +327,19 @@ export class AudioRecognition { this.interruptionStreamChannel = undefined; } - async onStartOfAgentSpeech() { + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 239-244 lines + async onStartOfAgentSpeech(startedAt: number) { 
this.isAgentSpeaking = true; + this.endpointing.onStartOfAgentSpeech(startedAt); return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 246-271 lines async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (this.isAgentSpeaking) { + this.endpointing.onEndOfAgentSpeech(Date.now()); + } + if (!this.isInterruptionEnabled) { this.isAgentSpeaking = false; return; @@ -348,6 +367,27 @@ export class AudioRecognition { this.isAgentSpeaking = false; } + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 273-290 lines + async onStartOfSpeech(startedAt: number, speechDuration = 0, userSpeakingSpan?: Span) { + this.endpointing.onStartOfSpeech(startedAt, this.isAgentSpeaking); + if (!this.isInterruptionEnabled || !this.isAgentSpeaking) { + return; + } + return this.onStartOfOverlapSpeech(speechDuration, startedAt, userSpeakingSpan); + } + + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 292-306 lines + async onEndOfSpeech(endedAt: number, userSpeakingSpan?: Span, interruption?: boolean) { + if (this.speaking) { + this.endpointing.onEndOfSpeech( + endedAt, + interruption !== undefined && !interruption && this.isAgentSpeaking, + ); + } + + return this.onEndOfOverlapSpeech(endedAt, userSpeakingSpan); + } + /** Start interruption inference when agent is speaking and overlap speech starts. 
*/ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { if (this.isAgentSpeaking) { @@ -825,7 +865,8 @@ export class AudioRecognition { speechStartTime: number | undefined, ) => async (controller: AbortController) => { - let endpointingDelay = this.minEndpointingDelay; + // Ref: python livekit-agents/livekit/agents/voice/audio_recognition.py - 949-973 lines + let endpointingDelay = this.endpointing.minDelay; const userTurnSpan = this.ensureUserTurnSpan(); const userTurnCtx = this.userTurnContext(userTurnSpan); @@ -851,7 +892,7 @@ export class AudioRecognition { ); if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.maxEndpointingDelay; + endpointingDelay = this.endpointing.maxDelay; } } catch (error) { this.logger.error(error, 'Error predicting end of turn'); diff --git a/agents/src/voice/audio_recognition_handoff.test.ts b/agents/src/voice/audio_recognition_handoff.test.ts index 76311ec12..204f50462 100644 --- a/agents/src/voice/audio_recognition_handoff.test.ts +++ b/agents/src/voice/audio_recognition_handoff.test.ts @@ -7,6 +7,7 @@ import { ChatContext } from '../llm/chat_context.js'; import { initializeLogger } from '../log.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { AudioRecognition, type RecognitionHooks, STTPipeline } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function createHooks() { @@ -45,8 +46,7 @@ function createRecognition(sttNode: STTNode, hooks = createHooks()) { recognition: new AudioRecognition({ recognitionHooks: hooks, stt: sttNode, - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), }), }; } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index cfe92a821..56ae77646 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ 
b/agents/src/voice/audio_recognition_span.test.ts @@ -22,6 +22,7 @@ import { type RecognitionHooks, type _TurnDetector, } from './audio_recognition.js'; +import { BaseEndpointing } from './endpointing.js'; import type { STTNode } from './io.js'; function setupInMemoryTracing() { @@ -145,8 +146,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: undefined, turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'stt', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'deepgram-nova2', sttProvider: 'deepgram', getLinkedParticipant: () => ({ sid: 'p1', identity: 'bob', kind: ParticipantKind.AGENT }), @@ -254,8 +254,7 @@ describe('AudioRecognition user_turn span parity', () => { vad: new FakeVAD(vadEvents), turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'vad', - minEndpointingDelay: 0, - maxEndpointingDelay: 0, + endpointing: new BaseEndpointing(0, 0), sttModel: 'stt-model', sttProvider: 'stt-provider', getLinkedParticipant: () => ({ sid: 'p2', identity: 'alice', kind: ParticipantKind.AGENT }), diff --git a/agents/src/voice/endpointing.test.ts b/agents/src/voice/endpointing.test.ts new file mode 100644 index 000000000..61b0c5fdd --- /dev/null +++ b/agents/src/voice/endpointing.test.ts @@ -0,0 +1,566 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { ExpFilter } from '../utils.js'; +import { AgentActivity } from './agent_activity.js'; +import { AudioRecognition, type RecognitionHooks } from './audio_recognition.js'; +import { DynamicEndpointing, createEndpointing } from './endpointing.js'; + +function privateState(value: object): Record { + return value as Record; +} + +function createHooks(): RecognitionHooks { + return { + onInterruption: vi.fn(), + onStartOfSpeech: vi.fn(), + onVADInferenceDone: vi.fn(), + onEndOfSpeech: vi.fn(), + onInterimTranscript: vi.fn(), + onFinalTranscript: vi.fn(), + onEndOfTurn: vi.fn(async () => true), + onPreemptiveGeneration: vi.fn(), + retrieveChatCtx: () => ChatContext.empty(), + }; +} + +describe('TestExponentialMovingAverage', () => { + it('test_initialization_with_valid_alpha', () => { + const ema = new ExpFilter(0.5); + expect(ema.value).toBeUndefined(); + + const emaWithInitial = new ExpFilter(0.5, undefined, undefined, 10); + expect(emaWithInitial.value).toBe(10); + + const emaAlphaOne = new ExpFilter(1.0); + expect(emaAlphaOne.value).toBeUndefined(); + }); + + it('test_initialization_with_invalid_alpha', () => { + expect(() => new ExpFilter(0.0)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(-0.5)).toThrow(/alpha must be in/); + expect(() => new ExpFilter(1.5)).toThrow(/alpha must be in/); + }); + + it('test_update_with_no_initial_value', () => { + const ema = new ExpFilter(0.5); + const result = ema.apply(1, 10); + expect(result).toBe(10); + expect(ema.value).toBe(10); + }); + + it('test_update_with_initial_value', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + const result = ema.apply(1, 20); + expect(result).toBe(15); + expect(ema.value).toBe(15); + }); + + it('test_update_multiple_times', () => { + const ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.apply(1, 
20); + ema.apply(1, 20); + expect(ema.value).toBe(17.5); + }); + + it('test_reset', () => { + let ema = new ExpFilter(0.5, undefined, undefined, 10); + expect(ema.value).toBe(10); + ema.reset(); + expect(ema.value).toBe(10); + + ema = new ExpFilter(0.5, undefined, undefined, 10); + ema.reset({ initial: 5 }); + expect(ema.value).toBe(5); + }); +}); + +describe('TestDynamicEndpointing', () => { + it('test_initialization', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_with_custom_alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.2); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_initialization_uses_updated_default_alpha', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(privateState(privateState(ep)._utterancePause)._alpha).toBeCloseTo(0.9, 5); + expect(privateState(privateState(ep)._turnPause)._alpha).toBeCloseTo(0.9, 5); + }); + + it('test_empty_delays', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(ep.betweenUtteranceDelay).toBe(0); + expect(ep.betweenTurnDelay).toBe(0); + expect(ep.immediateInterruptionDelay).toEqual([0, 0]); + }); + + it('test_on_utterance_ended', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + expect(privateState(ep)._utteranceEndedAt).toBe(100000); + + ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(99900); + expect(privateState(ep)._utteranceEndedAt).toBe(99900); + }); + + it('test_on_utterance_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfSpeech(100000); + expect(privateState(ep)._utteranceStartedAt).toBe(100000); + }); + + it('test_on_agent_speech_started', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + expect(privateState(ep)._agentSpeechStartedAt).toBe(100000); + }); + + it('test_between_utterance_delay_calculation', 
() => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100500); + expect(ep.betweenUtteranceDelay).toBeCloseTo(500, 5); + }); + + it('test_between_turn_delay_calculation', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100800); + expect(ep.betweenTurnDelay).toBeCloseTo(800, 5); + }); + + it('test_pause_between_utterances_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400); + ep.onEndOfSpeech(100500, false); + + const expected = 0.5 * 400 + 0.5 * initialMin; + expect(ep.minDelay).toBeCloseTo(expected, 5); + }); + + it('test_new_turn_updates_max_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100600); + ep.onStartOfSpeech(101500); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 600 + 0.5 * 1000, 5); + }); + + it('test_interruption_updates_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + expect(privateState(ep)._agentSpeechStartedAt).not.toBeUndefined(); + ep.onStartOfSpeech(100250, true); + expect(privateState(ep)._overlapping).toBe(true); + + ep.onEndOfSpeech(100500); + + expect(privateState(ep)._overlapping).toBe(false); + expect(privateState(ep)._agentSpeechStartedAt).toBeUndefined(); + expect(ep.minDelay).toBeCloseTo(300, 5); + }); + + it('test_update_options', () => { + let ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ minDelay: 500 }); + expect(ep.minDelay).toBe(500); + expect(privateState(ep)._minDelay).toBe(500); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions({ maxDelay: 2000 }); + expect(ep.maxDelay).toBe(2000); + expect(privateState(ep)._maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + 
ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(ep.minDelay).toBe(500); + expect(ep.maxDelay).toBe(2000); + + ep = new DynamicEndpointing(300, 1000); + ep.updateOptions(); + expect(ep.minDelay).toBe(300); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_max_delay_clamped_to_configured_max', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(102000); + ep.onStartOfSpeech(105000); + expect(ep.maxDelay).toBe(1000); + }); + + it('test_max_delay_clamped_to_min_delay', () => { + const ep = new DynamicEndpointing(300, 1000, 1.0); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100100); + ep.onStartOfSpeech(100500); + expect(ep.maxDelay).toBeGreaterThanOrEqual(privateState(ep)._minDelay); + }); + + it('test_non_interruption_clears_agent_speech', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + expect(privateState(ep)._agentSpeechStartedAt).not.toBeUndefined(); + + ep.onStartOfSpeech(102000); + ep.onEndOfSpeech(103000, false); + expect(privateState(ep)._agentSpeechStartedAt).toBeUndefined(); + }); + + it('test_consecutive_interruptions_only_track_first', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(privateState(ep)._overlapping).toBe(true); + const prevVal = [ep.minDelay, ep.maxDelay]; + + ep.onStartOfSpeech(100350); + + expect(privateState(ep)._overlapping).toBe(true); + expect(prevVal).toEqual([ep.minDelay, ep.maxDelay]); + }); + + it('test_delayed_interruption_updates_max_delay_without_crashing', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800); + ep.onEndOfSpeech(102000, false); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + 
it('test_interruption_adjusts_stale_utterance_end_time', () => { + const ep = new DynamicEndpointing(60, 1000, 1.0); + ep.onEndOfSpeech(99000); + ep.onStartOfSpeech(100000); + + ep.onStartOfAgentSpeech(100200); + ep.onStartOfSpeech(100250, true); + + expect(Math.abs(Number(privateState(ep)._utteranceEndedAt) - 100200)).toBeLessThanOrEqual(1); + expect(ep.minDelay).toBeCloseTo(60, 5); + expect(ep.maxDelay).toBeCloseTo(1000, 5); + }); + + it('test_update_options_preserves_filter_alpha', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 600, maxDelay: 2000 }); + + expect(privateState(privateState(ep)._utterancePause)._alpha).toBeCloseTo(0.5, 5); + expect(privateState(privateState(ep)._turnPause)._alpha).toBeCloseTo(0.5, 5); + }); + + it('test_update_options_updates_alpha_in_place', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100200); + ep.onEndOfSpeech(101000); + const learnedMin = ep.minDelay; + + ep.updateOptions({ alpha: 0.2 }); + + expect(privateState(privateState(ep)._utterancePause)._alpha).toBeCloseTo(0.2, 5); + expect(privateState(privateState(ep)._turnPause)._alpha).toBeCloseTo(0.2, 5); + expect(ep.minDelay).toBeCloseTo(learnedMin, 5); + }); + + it('test_update_options_updates_filter_clamp_bounds', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.updateOptions({ minDelay: 500, maxDelay: 2000 }); + expect(privateState(privateState(ep)._utterancePause)._minVal).toBe(500); + expect(privateState(privateState(ep)._turnPause)._maxVal).toBe(2000); + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100200); + expect(ep.minDelay).toBeCloseTo(500, 5); + + ep.onEndOfSpeech(101000); + ep.onStartOfAgentSpeech(102800); + ep.onStartOfSpeech(103500); + expect(ep.maxDelay).toBeGreaterThan(1000); + expect(ep.maxDelay).toBeLessThanOrEqual(2000); + }); + + it('test_should_ignore_skips_filter_update', () => { + const ep = new DynamicEndpointing(300, 1000, 
0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101500, true); + + const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + + ep.onEndOfSpeech(101800, true); + + expect(ep.minDelay).toBe(prevMin); + expect(ep.maxDelay).toBe(prevMax); + expect(privateState(ep)._utteranceStartedAt).toBeUndefined(); + expect(privateState(ep)._utteranceEndedAt).toBeUndefined(); + expect(privateState(ep)._overlapping).toBe(false); + expect(privateState(ep)._speaking).toBe(false); + }); + + it('test_should_ignore_without_overlapping_still_updates', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + const initialMin = ep.minDelay; + + ep.onEndOfSpeech(100000); + ep.onStartOfSpeech(100400, false); + ep.onEndOfSpeech(100600, true); + + const expected = 0.5 * 400 + 0.5 * initialMin; + expect(ep.minDelay).toBeCloseTo(expected, 5); + }); + + it('test_should_ignore_grace_period_overrides', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(100600, true); + + ep.onEndOfSpeech(100800, true); + + expect(privateState(ep)._utteranceEndedAt).toBe(100800); + expect(privateState(ep)._speaking).toBe(false); + }); + + it('test_should_ignore_outside_grace_period', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100500); + ep.onStartOfSpeech(101000, true); + + const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + ep.onEndOfSpeech(101500, true); + + expect(ep.minDelay).toBe(prevMin); + expect(ep.maxDelay).toBe(prevMax); + expect(privateState(ep)._utteranceStartedAt).toBeUndefined(); + expect(privateState(ep)._utteranceEndedAt).toBeUndefined(); + }); + + it('test_on_end_of_agent_speech_clears_state', () => { + const ep = new DynamicEndpointing(300, 1000); + ep.onStartOfAgentSpeech(100000); + ep.onStartOfSpeech(100100, true); + expect(privateState(ep)._overlapping).toBe(true); + 
expect(privateState(ep)._agentSpeechStartedAt).toBe(100000); + + ep.onEndOfAgentSpeech(101000); + + expect(privateState(ep)._agentSpeechEndedAt).toBe(101000); + expect(privateState(ep)._agentSpeechStartedAt).toBe(100000); + expect(privateState(ep)._overlapping).toBe(false); + }); + + it('test_overlapping_inferred_from_agent_speech', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + ep.onEndOfSpeech(100000); + ep.onStartOfAgentSpeech(100900); + ep.onStartOfSpeech(101800, false); + ep.onEndOfSpeech(102000); + expect(ep.maxDelay).toBeCloseTo(0.5 * 900 + 0.5 * 1000, 5); + }); + + it('test_speaking_flag_set_and_cleared', () => { + const ep = new DynamicEndpointing(300, 1000); + expect(privateState(ep)._speaking).toBe(false); + ep.onStartOfSpeech(100000); + expect(privateState(ep)._speaking).toBe(true); + ep.onEndOfSpeech(100500); + expect(privateState(ep)._speaking).toBe(false); + }); + + it.each([ + ['no_agent/no_overlap/no_ignore', 'none', false, false, false, true, false], + ['no_agent/no_overlap/ignore', 'none', false, true, false, true, false], + ['agent_ended/no_overlap/no_ignore', 'ended', false, false, false, false, true], + ['agent_ended/no_overlap/ignore', 'ended', false, true, false, false, true], + ['agent_active/no_overlap/no_ignore', 'active', false, false, false, false, true], + ['agent_active/no_overlap/ignore', 'active', false, true, false, false, true], + ['agent_active/overlap/no_ignore', 'active', true, false, false, true, false], + ['agent_active/overlap/ignore/outside_grace', 'active', true, true, false, false, false], + ['agent_active/overlap/ignore/inside_grace', 'active', true, true, true, true, false], + ] as const)( + 'test_all_overlapping_and_should_ignore_combos %s', + ( + label, + agentSpeech, + overlapping, + shouldIgnore, + withinGrace, + expectMinChange, + expectMaxChange, + ) => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(99000); + ep.onEndOfSpeech(100000); + + let userStart: number; + if 
(agentSpeech === 'ended') { + ep.onStartOfAgentSpeech(100500); + ep.onEndOfAgentSpeech(101000); + userStart = 101500; + } else if (agentSpeech === 'active') { + if (withinGrace) { + ep.onStartOfAgentSpeech(100150); + userStart = 100350; + } else if (overlapping && shouldIgnore) { + ep.onStartOfAgentSpeech(100200); + userStart = 101500; + } else if (overlapping) { + ep.onStartOfAgentSpeech(100150); + userStart = 100400; + } else { + ep.onStartOfAgentSpeech(100900); + userStart = 101800; + } + } else { + userStart = 100400; + } + + ep.onStartOfSpeech(userStart, overlapping); + + const prevMin = ep.minDelay; + const prevMax = ep.maxDelay; + + ep.onEndOfSpeech(userStart + 500, shouldIgnore); + + const minChanged = ep.minDelay !== prevMin; + const maxChanged = ep.maxDelay !== prevMax; + + expect(minChanged, `[${label}] min_delay change`).toBe(expectMinChange); + expect(maxChanged, `[${label}] max_delay change`).toBe(expectMaxChange); + expect(privateState(ep)._speaking, `[${label}] _speaking should be false`).toBe(false); + expect(privateState(ep)._overlapping, `[${label}] _overlapping should be false`).toBe(false); + }, + ); + + it('test_full_conversation_sequence', () => { + const ep = new DynamicEndpointing(300, 1000, 0.5); + + ep.onStartOfSpeech(100000); + ep.onEndOfSpeech(101000); + + ep.onStartOfAgentSpeech(101500); + + ep.onStartOfSpeech(102500, true); + const minBeforeBackchannel = ep.minDelay; + const maxBeforeBackchannel = ep.maxDelay; + ep.onEndOfSpeech(102800, true); + + expect(ep.minDelay).toBe(minBeforeBackchannel); + expect(ep.maxDelay).toBe(maxBeforeBackchannel); + + ep.onEndOfAgentSpeech(103000); + + ep.onStartOfSpeech(103500); + ep.onEndOfSpeech(104000); + + expect(privateState(ep)._speaking).toBe(false); + expect(privateState(ep)._agentSpeechStartedAt).toBeUndefined(); + }); +}); + +describe('TestCreateEndpointing', () => { + it('test_dynamic_mode_wires_alpha', () => { + const ep = createEndpointing({ mode: 'dynamic', minDelay: 300, maxDelay: 1000, 
alpha: 0.7 }); + + expect(ep).toBeInstanceOf(DynamicEndpointing); + expect(privateState(privateState(ep)._utterancePause)._alpha).toBeCloseTo(0.7, 5); + expect(privateState(privateState(ep)._turnPause)._alpha).toBeCloseTo(0.7, 5); + }); + + it('test_fixed_mode_returns_base_endpointing', () => { + const ep = createEndpointing({ mode: 'fixed', minDelay: 500, maxDelay: 3000, alpha: 0.9 }); + + expect(ep).not.toBeInstanceOf(DynamicEndpointing); + expect(ep.minDelay).toBe(500); + expect(ep.maxDelay).toBe(3000); + }); +}); + +describe('AudioRecognition dynamic endpointing integration', () => { + it('forwards speech lifecycle to endpointing with explicit timestamps', async () => { + const endpointing = new DynamicEndpointing(300, 1000, 0.5); + const recognition = new AudioRecognition({ recognitionHooks: createHooks(), endpointing }); + + await recognition.onStartOfSpeech(99000); + privateState(recognition).speaking = true; + await recognition.onEndOfSpeech(100000); + + await recognition.onStartOfAgentSpeech(100150); + await recognition.onStartOfSpeech(100350, 0); + privateState(recognition).speaking = true; + await recognition.onEndOfSpeech(100800, undefined, true); + + expect(endpointing.minDelay).toBeCloseTo(0.5 * 350 + 0.5 * 300, 5); + }); + + it('updateOptions replaces endpointing state instead of mutating learned history', () => { + const first = new DynamicEndpointing(300, 1000, 0.5); + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + endpointing: first, + }); + + first.onEndOfSpeech(100000); + first.onStartOfSpeech(100400); + first.onEndOfSpeech(100900); + expect(first.minDelay).toBeGreaterThan(300); + + const replacement = createEndpointing({ + mode: 'dynamic', + minDelay: 500, + maxDelay: 2000, + alpha: 0.2, + }); + recognition.updateOptions({ endpointing: replacement }); + + const endpointingState = privateState(recognition).endpointing as DynamicEndpointing; + expect(endpointingState).toBe(replacement); + 
expect(endpointingState.minDelay).toBe(500); + expect(endpointingState.maxDelay).toBe(2000); + }); + + it('agent activity updateOptions recreates endpointing state for active recognition', () => { + const updateOptions = vi.fn(); + const activity = Object.create(AgentActivity.prototype) as { + updateOptions: AgentActivity['updateOptions']; + audioRecognition?: { updateOptions: typeof updateOptions }; + turnDetectionMode?: 'vad'; + }; + activity.audioRecognition = { updateOptions }; + activity.turnDetectionMode = 'vad'; + + activity.updateOptions({ + endpointing: { mode: 'dynamic', minDelay: 500, maxDelay: 2000, alpha: 0.2 }, + }); + + const endpointingState = updateOptions.mock.calls[0][0].endpointing as DynamicEndpointing; + expect(endpointingState).toBeInstanceOf(DynamicEndpointing); + expect(endpointingState.minDelay).toBe(500); + expect(endpointingState.maxDelay).toBe(2000); + }); +}); diff --git a/agents/src/voice/endpointing.ts b/agents/src/voice/endpointing.ts new file mode 100644 index 000000000..73ac819e4 --- /dev/null +++ b/agents/src/voice/endpointing.ts @@ -0,0 +1,269 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+import { ExpFilter } from '../utils.js';
+import type { EndpointingOptions } from './turn_config/endpointing.js';
+
+export type { EndpointingOptions } from './turn_config/endpointing.js';
+
+const AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD = 250; // compared with |utteranceStartedAt - agentSpeechStartedAt| in onEndOfSpeech; presumably ms — confirm timestamp units with callers
+
+// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 10-47 lines
+export class BaseEndpointing { // fixed delays: stores minDelay/maxDelay as given and only tracks the overlap flag
+  protected _minDelay: number;
+  protected _maxDelay: number;
+  protected _overlapping = false; // set by onStartOfSpeech, cleared by onEndOfSpeech
+
+  constructor(minDelay: number, maxDelay: number) {
+    this._minDelay = minDelay;
+    this._maxDelay = maxDelay;
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 16-22 lines
+  updateOptions({ // only the fields that are provided get overwritten
+    minDelay,
+    maxDelay,
+  }: {
+    minDelay?: number;
+    maxDelay?: number;
+  } = {}): void {
+    if (minDelay !== undefined) {
+      this._minDelay = minDelay;
+    }
+    if (maxDelay !== undefined) {
+      this._maxDelay = maxDelay;
+    }
+  }
+
+  get minDelay(): number {
+    return this._minDelay;
+  }
+
+  get maxDelay(): number {
+    return this._maxDelay;
+  }
+
+  get overlapping(): boolean {
+    return this._overlapping;
+  }
+
+  onStartOfSpeech(startedAt: number, overlapping = false): void {
+    void startedAt; // timestamp is intentionally unused in the base class
+    this._overlapping = overlapping;
+  }
+
+  onEndOfSpeech(endedAt: number, shouldIgnore = false): void {
+    void endedAt; // both parameters are intentionally unused in the base class
+    void shouldIgnore;
+    this._overlapping = false;
+  }
+
+  onStartOfAgentSpeech(startedAt: number): void { // no-op hook; DynamicEndpointing overrides it
+    void startedAt;
+  }
+
+  onEndOfAgentSpeech(endedAt: number): void { // no-op hook; DynamicEndpointing overrides it
+    void endedAt;
+  }
+}
+
+// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 49-89 lines
+export class DynamicEndpointing extends BaseEndpointing { // derives minDelay/maxDelay from pauses fed into two ExpFilter instances
+  private _utterancePause: ExpFilter; // filtered gap between consecutive user utterances; backs the minDelay getter
+  private _turnPause: ExpFilter; // filtered gap between end of user speech and start of agent speech; backs the maxDelay getter
+  private _utteranceStartedAt: number | undefined; // timestamps below are whatever the on*Speech callbacks were given
+  private _utteranceEndedAt: number | undefined;
+  private _agentSpeechStartedAt: number | undefined;
+  private _agentSpeechEndedAt: number | undefined;
+  private _speaking = false; // NOTE(review): assigned in onStartOfSpeech/onEndOfSpeech but never read in this class
+
+  constructor(minDelay: number, maxDelay: number, alpha = 0.9) {
+    super(minDelay, maxDelay);
+
+    this._utterancePause = new ExpFilter(alpha, maxDelay, minDelay, minDelay); // args are (alpha, maxVal, minVal, initial): starts at minDelay
+    this._turnPause = new ExpFilter(alpha, maxDelay, minDelay, maxDelay); // same bounds, but starts at maxDelay
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 91-102 lines
+  override get minDelay(): number {
+    return this._utterancePause.value ?? this._minDelay; // configured value is the fallback when the filter has no value
+  }
+
+  override get maxDelay(): number {
+    const turnVal = this._turnPause.value ?? this._maxDelay;
+    return Math.max(turnVal, this.minDelay); // never report a max below the current min
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 104-120 lines
+  get betweenUtteranceDelay(): number { // utteranceStartedAt - utteranceEndedAt; 0 when either is missing
+    if (this._utteranceEndedAt === undefined) {
+      return 0;
+    }
+    if (this._utteranceStartedAt === undefined) {
+      return 0;
+    }
+
+    return Math.max(0, this._utteranceStartedAt - this._utteranceEndedAt); // negative gaps clamp to 0
+  }
+
+  get betweenTurnDelay(): number { // agentSpeechStartedAt - utteranceEndedAt; 0 when either is missing
+    if (this._agentSpeechStartedAt === undefined) {
+      return 0;
+    }
+    if (this._utteranceEndedAt === undefined) {
+      return 0;
+    }
+
+    return Math.max(0, this._agentSpeechStartedAt - this._utteranceEndedAt); // negative gaps clamp to 0
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 122-137 lines
+  get immediateInterruptionDelay(): [number, number] { // [turn gap, |utterance gap - turn gap|]; [0, 0] when either start timestamp is missing
+    if (this._utteranceStartedAt === undefined) {
+      return [0, 0];
+    }
+    if (this._agentSpeechStartedAt === undefined) {
+      return [0, 0];
+    }
+
+    return [this.betweenTurnDelay, Math.abs(this.betweenUtteranceDelay - this.betweenTurnDelay)];
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 139-153 lines
+  override onStartOfAgentSpeech(startedAt: number): void {
+    this._agentSpeechStartedAt = startedAt;
+    this._agentSpeechEndedAt = undefined; // a new agent utterance discards the previously recorded end
+    this._overlapping = false;
+  }
+
+  override onEndOfAgentSpeech(endedAt: number): void {
+    if ( // record the end only if a start exists and no end is stored yet (or the stored end predates the start)
+      this._agentSpeechStartedAt !== undefined &&
+      (this._agentSpeechEndedAt === undefined ||
+        this._agentSpeechEndedAt < this._agentSpeechStartedAt)
+    ) {
+      this._agentSpeechEndedAt = endedAt;
+    }
+    this._overlapping = false;
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 155-178 lines
+  override onStartOfSpeech(startedAt: number, overlapping = false): void {
+    if (this._overlapping) { // already inside an overlapping utterance: keep its recorded start
+      return;
+    }
+
+    if ( // overlapping start while the stored utterance end is older than the stored utterance start
+      this._utteranceStartedAt !== undefined &&
+      this._utteranceEndedAt !== undefined &&
+      this._agentSpeechStartedAt !== undefined &&
+      this._utteranceEndedAt < this._utteranceStartedAt &&
+      overlapping
+    ) {
+      this._utteranceEndedAt = this._agentSpeechStartedAt - 1; // pin the end one unit before agent speech began
+    }
+
+    this._utteranceStartedAt = startedAt;
+    this._overlapping = overlapping;
+    this._speaking = true;
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 179-286 lines
+  override onEndOfSpeech(endedAt: number, shouldIgnore = false): void {
+    if (shouldIgnore && this._overlapping) { // ignorable overlap: discard it unless it began within the grace period of agent speech start
+      const withinGracePeriod =
+        this._utteranceStartedAt !== undefined &&
+        this._agentSpeechStartedAt !== undefined &&
+        Math.abs(this._utteranceStartedAt - this._agentSpeechStartedAt) <
+          AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD;
+      if (!withinGracePeriod) {
+        this._overlapping = false;
+        this._speaking = false;
+        this._utteranceStartedAt = undefined;
+        this._utteranceEndedAt = undefined;
+        return; // filters and agent-speech timestamps are left untouched
+      }
+    }
+
+    if ( // user overlapped the agent, or agent speech has started and no end has been recorded
+      this._overlapping ||
+      (this._agentSpeechStartedAt !== undefined && this._agentSpeechEndedAt === undefined)
+    ) {
+      const [turnDelay, interruptionDelay] = this.immediateInterruptionDelay;
+      const utterancePause = this.betweenUtteranceDelay;
+      if ( // every bound must hold to update the utterance filter; otherwise fall back to the turn gap
+        0 < interruptionDelay &&
+        interruptionDelay <= this.minDelay &&
+        0 < turnDelay &&
+        turnDelay <= this.maxDelay &&
+        utterancePause > 0
+      ) {
+        this._utterancePause.apply(1, utterancePause);
+      } else {
+        const turnPause = this.betweenTurnDelay;
+        if (turnPause > 0) {
+          this._turnPause.apply(1, turnPause);
+        }
+      }
+    } else { // no overlap and no in-progress agent speech
+      const turnPause = this.betweenTurnDelay;
+      if (turnPause > 0) {
+        this._turnPause.apply(1, turnPause);
+      } else {
+        const utterancePause = this.betweenUtteranceDelay;
+        if ( // the utterance gap only counts when no agent speech was recorded at all
+          utterancePause > 0 &&
+          this._agentSpeechEndedAt === undefined &&
+          this._agentSpeechStartedAt === undefined
+        ) {
+          this._utterancePause.apply(1, utterancePause);
+        }
+      }
+    }
+
+    this._utteranceEndedAt = endedAt; // record the end and clear agent/overlap state for the next utterance
+    this._agentSpeechStartedAt = undefined;
+    this._agentSpeechEndedAt = undefined;
+    this._speaking = false;
+    this._overlapping = false;
+  }
+
+  // Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 288-307 lines
+  override updateOptions({ // each provided field updates the stored value and resets both filters; ExpFilter.reset semantics are not visible here — confirm
+    minDelay,
+    maxDelay,
+    alpha,
+  }: {
+    minDelay?: number;
+    maxDelay?: number;
+    alpha?: number;
+  } = {}): void {
+    if (minDelay !== undefined) {
+      this._minDelay = minDelay;
+      this._utterancePause.reset({ initial: this._minDelay, minVal: this._minDelay }); // presumably re-seeds the utterance filter at the new minimum
+      this._turnPause.reset({ minVal: this._minDelay });
+    }
+
+    if (maxDelay !== undefined) {
+      this._maxDelay = maxDelay;
+      this._turnPause.reset({ initial: this._maxDelay, maxVal: this._maxDelay }); // presumably re-seeds the turn filter at the new maximum
+      this._utterancePause.reset({ maxVal: this._maxDelay });
+    }
+
+    if (alpha !== undefined) {
+      this._utterancePause.reset({ alpha });
+      this._turnPause.reset({ alpha });
+    }
+  }
+}
+
+// Ref: python livekit-agents/livekit/agents/voice/endpointing.py - 310-322 lines
+export function createEndpointing(options: EndpointingOptions): BaseEndpointing { // picks the implementation from options.mode
+  switch (options.mode) {
+    case 'dynamic':
+      return new DynamicEndpointing(options.minDelay, options.maxDelay, options.alpha);
+    case 'fixed':
+    default: // any mode other than 'dynamic' gets fixed delays
+      return new BaseEndpointing(options.minDelay, options.maxDelay);
+  }
+}
diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts
index 808ac88c1..c3791e80a 100644
--- a/agents/src/voice/index.ts
+++ b/agents/src/voice/index.ts
@@ -11,6 +11,7 @@ export {
 } from './agent_session.js';
 export * from './avatar/index.js';
 export * from './background_audio.js';
+export * from './endpointing.js';
 export {
   type TextInputCallback,
   type TextInputEvent,
diff --git
a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index f2603e00f..a4924647d 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -4,10 +4,11 @@ /** * Configuration for endpointing, which determines when the user's turn is complete. */ +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 47-66 lines export interface EndpointingOptions { /** * Endpointing mode. `"fixed"` uses a fixed delay, `"dynamic"` adjusts delay based on - * end-of-utterance prediction. + * observed speech and turn pauses. * @defaultValue "fixed" */ mode: 'fixed' | 'dynamic'; @@ -24,10 +25,18 @@ export interface EndpointingOptions { * @defaultValue 3000 */ maxDelay: number; + /** + * Exponential moving average coefficient for dynamic endpointing. Higher values give more + * weight to previous pause history. + * @defaultValue 0.9 + */ + alpha: number; } +// Ref: python livekit-agents/livekit/agents/voice/turn.py - 69-74 lines export const defaultEndpointingOptions = { mode: 'fixed', minDelay: 500, maxDelay: 3000, + alpha: 0.9, } as const satisfies EndpointingOptions; From 28f4964df97e044813af035cd397510eb1dae56d Mon Sep 17 00:00:00 2001 From: Rosetta Bot Date: Thu, 30 Apr 2026 09:26:23 +0000 Subject: [PATCH 3/3] test(voice): cover dynamic endpointing config merge --- agents/src/voice/turn_config/utils.test.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 90010c2c6..f15bd3143 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -46,6 +46,20 @@ describe('migrateLegacyOptions', () => { expect(result.turnHandling.endpointing!.maxDelay).toBe(5000); }); + it('should preserve dynamic endpointing alpha from turnHandling config', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + turnHandling: { + endpointing: 
{ + mode: 'dynamic', + alpha: 0.7, + }, + }, + }); + + expect(result.turnHandling.endpointing!.mode).toBe('dynamic'); + expect(result.turnHandling.endpointing!.alpha).toBe(0.7); + }); + it('should set interruption.enabled to false when allowInterruptions is false', () => { const { agentSessionOptions: result } = migrateLegacyOptions({ voiceOptions: { allowInterruptions: false },