Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/dynamic-endpointing-node-port.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": patch
---

feat: Add dynamic endpointing for voice turn handling
98 changes: 76 additions & 22 deletions agents/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -352,43 +352,97 @@ export class AsyncIterableQueue<T> implements AsyncIterableIterator<T> {

/** @internal */
export class ExpFilter {
#alpha: number;
#max?: number;
#filtered?: number = undefined;
private _alpha: number;
private _filtered: number | undefined;
private _maxVal: number | undefined;
private _minVal: number | undefined;

// Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 5-20 lines
constructor(alpha: number, maxVal?: number, minVal?: number, initial?: number) {
if (!(alpha > 0 && alpha <= 1)) {
throw new Error('alpha must be in (0, 1].');
}

constructor(alpha: number, max?: number) {
this.#alpha = alpha;
this.#max = max;
this._alpha = alpha;
this._filtered = initial;
this._maxVal = maxVal;
this._minVal = minVal;
}

// Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 21-37 lines
reset({
alpha,
initial,
minVal,
maxVal,
}: {
alpha?: number;
initial?: number;
minVal?: number;
maxVal?: number;
} = {}): void {
if (alpha !== undefined) {
if (!(alpha > 0 && alpha <= 1)) {
throw new Error('alpha must be in (0, 1].');
}
this._alpha = alpha;
}
if (initial !== undefined) {
this._filtered = initial;
}
if (minVal !== undefined) {
this._minVal = minVal;
}
if (maxVal !== undefined) {
this._maxVal = maxVal;
}
}

reset(alpha?: number) {
if (alpha) {
this.#alpha = alpha;
// Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 38-57 lines
apply(exp: number, sample?: number): number {
sample ??= this._filtered;

if (sample !== undefined && this._filtered === undefined) {
this._filtered = sample;
} else if (sample !== undefined && this._filtered !== undefined) {
const a = this._alpha ** exp;
this._filtered = a * this._filtered + (1 - a) * sample;
}
this.#filtered = undefined;
}

apply(exp: number, sample: number): number {
if (this.#filtered) {
const a = this.#alpha ** exp;
this.#filtered = a * this.#filtered + (1 - a) * sample;
} else {
this.#filtered = sample;
if (this._filtered === undefined) {
throw new Error('sample or initial value must be given.');
}

if (this._maxVal !== undefined && this._filtered > this._maxVal) {
this._filtered = this._maxVal;
}

if (this.#max && this.#filtered > this.#max) {
this.#filtered = this.#max;
if (this._minVal !== undefined && this._filtered < this._minVal) {
this._filtered = this._minVal;
}

return this.#filtered;
return this._filtered;
}

// Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 59-61 lines
get value(): number | undefined {
return this._filtered;
}

get filtered(): number | undefined {
return this.#filtered;
return this.value;
}

set alpha(alpha: number) {
this.#alpha = alpha;
if (!(alpha > 0 && alpha <= 1)) {
throw new Error('alpha must be in (0, 1].');
}
this._alpha = alpha;
}

// Ref: python livekit-agents/livekit/agents/utils/exp_filter.py - 63-64 lines
updateBase(alpha: number): void {
this._alpha = alpha;
}
}

Expand Down
117 changes: 65 additions & 52 deletions agents/src/voice/agent_activity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ import {
type RecognitionHooks,
type STTPipeline,
} from './audio_recognition.js';
import { createEndpointing } from './endpointing.js';
import type { AgentState } from './events.js';
import {
AgentSessionEventTypes,
Expand All @@ -90,6 +91,8 @@ import {
} from './generation.js';
import type { TimedString } from './io.js';
import { SpeechHandle } from './speech_handle.js';
import type { EndpointingOptions } from './turn_config/endpointing.js';
import { stripUndefined } from './turn_config/utils.js';
import { setParticipantSpanAttributes } from './utils.js';

export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
Expand Down Expand Up @@ -195,6 +198,7 @@ export class AgentActivity implements RecognitionHooks {
private isInterruptionDetectionEnabled: boolean;
private isInterruptionByAudioActivityEnabled: boolean;
private isDefaultInterruptionByAudioActivityEnabled: boolean;
private interruptionDetected = false;

// for false interruption handling
private pausedSpeech?: PausedSpeechInfo;
Expand All @@ -218,6 +222,8 @@ export class AgentActivity implements RecognitionHooks {
this.onError(ev);

private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => {
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1503-1508 lines
this.interruptionDetected = ev.isInterruption;
this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev);
};

Expand Down Expand Up @@ -489,12 +495,8 @@ export class AgentActivity implements RecognitionHooks {
turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
turnDetectionMode: this.turnDetectionMode,
interruptionDetection: this.interruptionDetector,
minEndpointingDelay:
this.agent.turnHandling?.endpointing?.minDelay ??
this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
maxEndpointingDelay:
this.agent.turnHandling?.endpointing?.maxDelay ??
this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 779-789 lines
endpointing: createEndpointing(this.endpointingOptions),
rootSpanContext: this.agentSession.rootSpanContext,
sttModel: this.stt?.label,
sttProvider: this.getSttProvider(),
Expand Down Expand Up @@ -673,19 +675,13 @@ export class AgentActivity implements RecognitionHooks {
return this.agent.turnHandling ?? this.agentSession.sessionOptions.turnHandling;
}

// get minEndpointingDelay(): number {
// return (
// this.agent.turnHandling?.endpointing?.minDelay ??
// this.agentSession.sessionOptions.turnHandling.endpointing.minDelay
// );
// }

// get maxEndpointingDelay(): number {
// return (
// this.agent.turnHandling?.endpointing?.maxDelay ??
// this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay
// );
// }
private get endpointingOptions(): EndpointingOptions {
const agentEndpointing = this.agent.turnHandling?.endpointing;
return {
...this.agentSession.sessionOptions.turnHandling.endpointing,
...(agentEndpointing ? stripUndefined(agentEndpointing) : {}),
};
}

get toolCtx(): ToolContext {
return this.agent.toolCtx;
Expand Down Expand Up @@ -730,9 +726,11 @@ export class AgentActivity implements RecognitionHooks {
updateOptions({
toolChoice,
turnDetection,
endpointing,
}: {
toolChoice?: ToolChoice | null;
turnDetection?: TurnDetectionMode;
endpointing?: EndpointingOptions;
}): void {
if (toolChoice !== undefined) {
this.toolChoice = toolChoice;
Expand All @@ -755,7 +753,11 @@ export class AgentActivity implements RecognitionHooks {
}

if (this.audioRecognition) {
this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode });
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 487-493 lines
this.audioRecognition.updateOptions({
endpointing: endpointing !== undefined ? createEndpointing(endpointing) : undefined,
turnDetection: this.turnDetectionMode,
});
}
}

Expand Down Expand Up @@ -934,12 +936,9 @@ export class AgentActivity implements RecognitionHooks {

if (!this.vad) {
this.agentSession._updateUserState('speaking');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onStartOfOverlapSpeech(
0,
Date.now(),
this.agentSession._userSpeakingSpan,
);
if (this.audioRecognition) {
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1510-1517 lines
this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan);
}
}

Expand All @@ -959,8 +958,9 @@ export class AgentActivity implements RecognitionHooks {
this.logger.info(ev, 'onInputSpeechStopped');

if (!this.vad) {
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan);
if (this.audioRecognition) {
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1528-1535 lines
this.audioRecognition.onEndOfSpeech(Date.now(), this.agentSession._userSpeakingSpan);
}
this.agentSession._updateUserState('listening');
}
Expand Down Expand Up @@ -1042,14 +1042,17 @@ export class AgentActivity implements RecognitionHooks {
lastSpeakingTime: speechStartTime,
otelContext: otelContext.active(),
});
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
// Pass speechStartTime as the absolute startedAt timestamp.
this.audioRecognition.onStartOfOverlapSpeech(
ev.speechDuration,
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1662-1673 lines
this.audioRecognition.onStartOfSpeech(
speechStartTime,
ev.speechDuration,
this.agentSession._userSpeakingSpan,
);
}
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1674-1676 lines
this.interruptionDetected = false;

if (this.falseInterruptionTimer) {
// cancel the timer when user starts speaking but leave the paused state unchanged
Expand Down Expand Up @@ -1080,11 +1083,13 @@ export class AgentActivity implements RecognitionHooks {
// Subtract both silenceDuration and inferenceDuration to correct for VAD model latency.
speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration;
}
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
// Pass speechEndTime as the absolute endedAt timestamp.
this.audioRecognition.onEndOfOverlapSpeech(
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1698-1712 lines
this.audioRecognition.onEndOfSpeech(
speechEndTime,
this.agentSession._userSpeakingSpan,
this.isInterruptionDetectionEnabled ? this.interruptionDetected : undefined,
);
}
this.agentSession._updateUserState('listening', {
Expand Down Expand Up @@ -1167,15 +1172,12 @@ export class AgentActivity implements RecognitionHooks {
const audioOutput = this.agentSession.output.audio;

if (
this.isInterruptionDetectionEnabled &&
this.audioRecognition &&
!this.audioRecognition.endpointingOverlapping &&
this.agentSession.agentState === 'speaking'
) {
this.audioRecognition.onStartOfOverlapSpeech(
0,
Date.now(),
this.agentSession._userSpeakingSpan,
);
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 1631-1639 lines
this.audioRecognition.onStartOfSpeech(Date.now(), 0, this.agentSession._userSpeakingSpan);
}

this.updatePausedSpeech(this._currentSpeech, timeout);
Expand Down Expand Up @@ -1943,8 +1945,11 @@ export class AgentActivity implements RecognitionHooks {
startTime: startedSpeakingAt,
otelContext: speechHandle._agentTurnContext,
});
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech();
if (this.audioRecognition) {
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2223-2231 lines
this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt);
}
if (this.isInterruptionDetectionEnabled) {
this.isInterruptionByAudioActivityEnabled = false;
}
};
Expand Down Expand Up @@ -2036,10 +2041,12 @@ export class AgentActivity implements RecognitionHooks {

if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
this.restoreInterruptionByAudioActivity();
if (this.isInterruptionDetectionEnabled) {
this.restoreInterruptionByAudioActivity();
}
}
}

Expand Down Expand Up @@ -2227,8 +2234,11 @@ export class AgentActivity implements RecognitionHooks {
startTime: startedSpeakingAt,
otelContext: speechHandle._agentTurnContext,
});
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech();
if (this.audioRecognition) {
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2579-2588 lines
this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt);
}
if (this.isInterruptionDetectionEnabled) {
this.isInterruptionByAudioActivityEnabled = false;
}
};
Expand Down Expand Up @@ -2390,8 +2400,10 @@ export class AgentActivity implements RecognitionHooks {

if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.restoreInterruptionByAudioActivity();
}
}
Expand Down Expand Up @@ -2433,11 +2445,11 @@ export class AgentActivity implements RecognitionHooks {
this.agentSession._updateAgentState('thinking');
} else if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
{
this.audioRecognition.onEndOfAgentSpeech(Date.now());
this.restoreInterruptionByAudioActivity();
}
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.restoreInterruptionByAudioActivity();
}
}

Expand Down Expand Up @@ -3390,7 +3402,8 @@ export class AgentActivity implements RecognitionHooks {
otelContext: this.pausedSpeech.handle._agentTurnContext,
});
if (this.audioRecognition && this.pausedSpeech.agentState === 'speaking') {
this.audioRecognition.onStartOfAgentSpeech();
// Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 3479-3486 lines
this.audioRecognition.onStartOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.isInterruptionByAudioActivityEnabled = false;
Expand Down
Loading
Loading