From 4e31b276db41198ce560b5b33399498aa5373dd3 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 16 Jan 2026 15:33:20 +0100 Subject: [PATCH 01/26] Add turn config interfaces and defaults (#975) --- agents/src/voice/agent_session.ts | 1 - agents/src/voice/turn_config/endpointing.ts | 23 +++ agents/src/voice/turn_config/interruption.ts | 46 +++++ agents/src/voice/turn_config/turnHandling.ts | 54 ++++++ agents/src/voice/turn_config/utils.test.ts | 166 +++++++++++++++++++ agents/src/voice/turn_config/utils.ts | 43 +++++ 6 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 agents/src/voice/turn_config/endpointing.ts create mode 100644 agents/src/voice/turn_config/interruption.ts create mode 100644 agents/src/voice/turn_config/turnHandling.ts create mode 100644 agents/src/voice/turn_config/utils.test.ts create mode 100644 agents/src/voice/turn_config/utils.ts diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index ad349a122..97e3e7e42 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -180,7 +180,6 @@ export class AgentSession< voiceOptions = defaultVoiceOptions, connOptions, } = opts; - // Merge user-provided connOptions with defaults this._connOptions = { sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions }, diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts new file mode 100644 index 000000000..28873acd3 --- /dev/null +++ b/agents/src/voice/turn_config/endpointing.ts @@ -0,0 +1,23 @@ +/** + * Configuration for endpointing, which determines when the user's turn is complete. + */ +export interface EndpointingConfig { + /** + * Minimum time in seconds since the last detected speech before the agent declares the user's + * turn complete. In VAD mode this effectively behaves like `max(VAD silence, minDelay)`; + * in STT mode it is applied after the STT end-of-speech signal, so it can be additive with + * the STT provider's endpointing delay. + * @defaultValue 0.5 + */ + minDelay?: number; + /** + * Maximum time in seconds the agent will wait before terminating the turn. + * @defaultValue 3.0 + */ + maxDelay?: number; +} + +export const defaultEndpointingConfig = { + minDelay: 0.5, + maxDelay: 3.0, +} as const satisfies EndpointingConfig; diff --git a/agents/src/voice/turn_config/interruption.ts b/agents/src/voice/turn_config/interruption.ts new file mode 100644 index 000000000..813fd191a --- /dev/null +++ b/agents/src/voice/turn_config/interruption.ts @@ -0,0 +1,46 @@ +/** + * Configuration for interruption handling. + */ +export interface InterruptionConfig { + /** + * Interruption handling strategy. + * @defaultValue undefined + */ + mode?: 'adaptive' | 'vad' | false; + /** + * When `true`, buffered audio is dropped while the agent is speaking and cannot be interrupted. + * @defaultValue true + */ + discardAudioIfUninterruptible?: boolean; + /** + * Minimum speech length in seconds to register as an interruption. + * @defaultValue 0.5 + */ + minDuration?: number; + /** + * Minimum number of words to consider an interruption, only used if STT is enabled. + * @defaultValue 0 + */ + minWords?: number; + /** + * If set, emit an `agentFalseInterruption` event after this amount of time if the user is + * silent and no user transcript is detected after the interruption. Set to `undefined` to + * disable. 
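+   * (A "false interruption" here means an interruption that is never followed by an
+   * actual user transcript.)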
+ * @defaultValue 2.0 + */ + falseInterruptionTimeout?: number; + /** + * Whether to resume the false interruption after the `falseInterruptionTimeout`. + * @defaultValue true + */ + resumeFalseInterruption?: boolean; +} + +export const defaultInterruptionConfig = { + mode: undefined, + discardAudioIfUninterruptible: true, + minDuration: 0.5, + minWords: 0, + falseInterruptionTimeout: 2, + resumeFalseInterruption: true, +} as const satisfies InterruptionConfig; diff --git a/agents/src/voice/turn_config/turnHandling.ts b/agents/src/voice/turn_config/turnHandling.ts new file mode 100644 index 000000000..6baa05444 --- /dev/null +++ b/agents/src/voice/turn_config/turnHandling.ts @@ -0,0 +1,54 @@ +import type { TurnDetectionMode } from '../agent_session.js'; +import { type EndpointingConfig, defaultEndpointingConfig } from './endpointing.js'; +import { type InterruptionConfig, defaultInterruptionConfig } from './interruption.js'; + +/** + * Configuration for the turn handling system. Used to configure the turn taking behavior of the + * session. + */ +export interface TurnHandlingConfig { + /** + * Strategy for deciding when the user has finished speaking. + * + * - `"stt"` – rely on speech-to-text end-of-utterance cues + * - `"vad"` – rely on Voice Activity Detection start/stop cues + * - `"realtime_llm"` – use server-side detection from a realtime LLM + * - `"manual"` – caller controls turn boundaries explicitly + * + * If not set, the session chooses the best available mode in priority order + * `realtime_llm → vad → stt → manual`; it automatically falls back if the necessary model + * is missing. + */ + turnDetection: TurnDetectionMode | undefined; + /** + * Configuration for endpointing. + */ + endpointing: EndpointingConfig; + /** + * Configuration for interruption handling. + */ + interruption: InterruptionConfig; + /** + * If set, set the user state as "away" after this amount of time after user and agent are + * silent. Set to `undefined` to disable. + * @defaultValue 15.0 + */ + userAwayTimeout: number; + /** + * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected. + * When `true`, the agent sends inference calls as soon as a user transcript is received rather + * than waiting for a definitive turn boundary. This can reduce response latency by overlapping + * model inference with user audio, but may incur extra compute if the user interrupts or + * revises mid-utterance. + * @defaultValue false + */ + preemptiveGeneration: boolean; +} + +export const defaultTurnHandlingConfig: TurnHandlingConfig = { + turnDetection: undefined, + interruption: defaultInterruptionConfig, + endpointing: defaultEndpointingConfig, + userAwayTimeout: 15, + preemptiveGeneration: false, +}; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts new file mode 100644 index 000000000..20b9f9087 --- /dev/null +++ b/agents/src/voice/turn_config/utils.test.ts @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import type { AgentSessionOptions } from '../agent_session.js'; +import { + defaultEndpointingConfig, + defaultInterruptionConfig, + defaultTurnHandlingConfig, +} from './index.js'; +import { migrateLegacyOptions } from './utils.js'; + +describe('migrateLegacyOptions', () => { + it('should return default turn handling config when no legacy options provided', () => { + const input: AgentSessionOptions = {}; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling).toBeDefined(); + expect(result.turnHandling!.turnDetection).toBe(defaultTurnHandlingConfig.turnDetection); + expect(result.turnHandling!.userAwayTimeout).toBe(defaultTurnHandlingConfig.userAwayTimeout); + expect(result.turnHandling!.preemptiveGeneration).toBe( + defaultTurnHandlingConfig.preemptiveGeneration, + ); + expect(result.turnHandling!.interruption).toMatchObject(defaultInterruptionConfig); + expect(result.turnHandling!.endpointing).toMatchObject(defaultEndpointingConfig); + }); + + it('should migrate legacy turnDetection to turnHandling.turnDetection', () => { + const input: AgentSessionOptions = { + turnDetection: 'vad', + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.turnDetection).toBe('vad'); + expect('turnDetection' in result).toBe(false); + }); + + it('should set interruption.mode to false when allowInterruptions is false', () => { + const input: AgentSessionOptions = { + voiceOptions: { + allowInterruptions: false, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.interruption.mode).toBe(false); + expect('voiceOptions' in result).toBe(false); + }); + + it('should not set interruption.mode when allowInterruptions is true', () => { + const input: AgentSessionOptions = { + voiceOptions: { + allowInterruptions: true, + }, + }; + const result = migrateLegacyOptions(input); + + // mode should remain undefined (the default) when allowInterruptions is true + expect(result.turnHandling!.interruption.mode).toBe(defaultInterruptionConfig.mode); + }); + + it('should migrate voiceOptions interruption settings', () => { + const input: AgentSessionOptions = { + voiceOptions: { + minInterruptionDuration: 0.8, + minInterruptionWords: 3, + discardAudioIfUninterruptible: false, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.interruption.minDuration).toBe(0.8); + expect(result.turnHandling!.interruption.minWords).toBe(3); + expect(result.turnHandling!.interruption.discardAudioIfUninterruptible).toBe(false); + }); + + it('should migrate voiceOptions endpointing settings', () => { + const input: AgentSessionOptions = { + voiceOptions: { + minEndpointingDelay: 1.0, + maxEndpointingDelay: 5.0, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.endpointing.minDelay).toBe(1.0); + expect(result.turnHandling!.endpointing.maxDelay).toBe(5.0); + }); + + it('should migrate voiceOptions.preemptiveGeneration', () => { + const input: AgentSessionOptions = { + voiceOptions: { + preemptiveGeneration: true, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.preemptiveGeneration).toBe(true); + }); + + it('should migrate voiceOptions.userAwayTimeout', () => { + const input: AgentSessionOptions = { + voiceOptions: { + userAwayTimeout: 30.0, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.userAwayTimeout).toBe(30.0); + 
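+    // As in the other migration cases, the legacy voiceOptions key should be stripped
+    expect('voiceOptions' in result).toBe(false);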
}); + + it('should migrate all legacy options together', () => { + const input: AgentSessionOptions = { + turnDetection: 'stt', + voiceOptions: { + allowInterruptions: false, + discardAudioIfUninterruptible: false, + minInterruptionDuration: 1.0, + minInterruptionWords: 2, + minEndpointingDelay: 0.8, + maxEndpointingDelay: 4.0, + preemptiveGeneration: true, + userAwayTimeout: 20.0, + }, + }; + const result = migrateLegacyOptions(input); + + expect(result.turnHandling!.turnDetection).toBe('stt'); + expect(result.turnHandling!.interruption.mode).toBe(false); + expect(result.turnHandling!.interruption.discardAudioIfUninterruptible).toBe(false); + expect(result.turnHandling!.interruption.minDuration).toBe(1.0); + expect(result.turnHandling!.interruption.minWords).toBe(2); + expect(result.turnHandling!.endpointing.minDelay).toBe(0.8); + expect(result.turnHandling!.endpointing.maxDelay).toBe(4.0); + expect(result.turnHandling!.preemptiveGeneration).toBe(true); + expect(result.turnHandling!.userAwayTimeout).toBe(20.0); + + // Legacy options should be stripped + expect('turnDetection' in result).toBe(false); + expect('voiceOptions' in result).toBe(false); + }); + + it('should preserve non-legacy options in the result', () => { + const input: AgentSessionOptions = { + turnDetection: 'vad', + voiceOptions: { + minEndpointingDelay: 1.0, + }, + maxToolSteps: 5, + connOptions: { + maxUnrecoverableErrors: 10, + }, + }; + const result = migrateLegacyOptions(input); + + // Non-legacy options should be preserved + expect(result.maxToolSteps).toBe(5); + expect(result.connOptions).toEqual({ maxUnrecoverableErrors: 10 }); + + // Legacy options should be stripped and migrated + expect('turnDetection' in result).toBe(false); + expect('voiceOptions' in result).toBe(false); + expect(result.turnHandling!.turnDetection).toBe('vad'); + expect(result.turnHandling!.endpointing.minDelay).toBe(1.0); + }); +}); diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts new file mode 100644 index 000000000..a4773df1a --- /dev/null +++ b/agents/src/voice/turn_config/utils.ts @@ -0,0 +1,43 @@ +import type { AgentSessionOptions } from '../agent_session.js'; +import { + type TurnHandlingConfig, + defaultEndpointingConfig, + defaultInterruptionConfig, + defaultTurnHandlingConfig, +} from './index.js'; + +export function migrateLegacyOptions( + legacyOptions: AgentSessionOptions, +): Omit { + const { voiceOptions, turnDetection, ...rest } = legacyOptions; + const newAgentSessionOptions = rest; + const turnHandling: TurnHandlingConfig = { + turnDetection: turnDetection ?? defaultTurnHandlingConfig.turnDetection, + interruption: { + ...defaultInterruptionConfig, + discardAudioIfUninterruptible: + voiceOptions?.discardAudioIfUninterruptible ?? + defaultInterruptionConfig.discardAudioIfUninterruptible, + minDuration: voiceOptions?.minInterruptionDuration ?? defaultInterruptionConfig.minDuration, + minWords: voiceOptions?.minInterruptionWords ?? defaultInterruptionConfig.minWords, + }, + endpointing: { + ...defaultEndpointingConfig, + minDelay: voiceOptions?.minEndpointingDelay ?? defaultEndpointingConfig.minDelay, + maxDelay: voiceOptions?.maxEndpointingDelay ?? defaultEndpointingConfig.maxDelay, + }, + userAwayTimeout: voiceOptions?.userAwayTimeout ?? defaultTurnHandlingConfig.userAwayTimeout, + preemptiveGeneration: + voiceOptions?.preemptiveGeneration ?? 
defaultTurnHandlingConfig.preemptiveGeneration, + }; + + if (voiceOptions?.allowInterruptions === false) { + turnHandling.interruption.mode = false; + } + + newAgentSessionOptions.turnHandling = turnHandling; + if (voiceOptions?.maxToolSteps) { + newAgentSessionOptions.maxToolSteps = voiceOptions.maxToolSteps; + } + return newAgentSessionOptions; +} From 07c5d710f895db9cc6f7c2e1c2932579c061ad67 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 27 Jan 2026 11:01:37 +0100 Subject: [PATCH 02/26] Add AdaptiveInterruptionDetector (#980) Co-authored-by: Brian Yin <57741529+Toubat@users.noreply.github.com> --- .changeset/config.json | 8 +- agents/package.json | 1 + agents/src/index.ts | 2 + .../AdaptiveInterruptionDetector.ts | 192 ++++++++ .../interruption/InterruptionCacheEntry.ts | 47 ++ .../interruption/InterruptionStream.ts | 423 ++++++++++++++++++ agents/src/inference/interruption/defaults.ts | 52 +++ agents/src/inference/interruption/errors.ts | 25 ++ .../inference/interruption/http_transport.ts | 182 ++++++++ agents/src/inference/interruption/types.ts | 89 ++++ .../src/inference/interruption/utils.test.ts | 31 ++ agents/src/inference/interruption/utils.ts | 140 ++++++ .../interruption/ws_transport.test.ts | 243 ++++++++++ .../inference/interruption/ws_transport.ts | 387 ++++++++++++++++ agents/src/stream/stream_channel.ts | 9 +- agents/src/telemetry/trace_types.ts | 7 + pnpm-lock.yaml | 27 ++ 17 files changed, 1856 insertions(+), 9 deletions(-) create mode 100644 agents/src/inference/interruption/AdaptiveInterruptionDetector.ts create mode 100644 agents/src/inference/interruption/InterruptionCacheEntry.ts create mode 100644 agents/src/inference/interruption/InterruptionStream.ts create mode 100644 agents/src/inference/interruption/defaults.ts create mode 100644 agents/src/inference/interruption/errors.ts create mode 100644 agents/src/inference/interruption/http_transport.ts create mode 100644 agents/src/inference/interruption/types.ts create mode 100644 agents/src/inference/interruption/utils.test.ts create mode 100644 agents/src/inference/interruption/utils.ts create mode 100644 agents/src/inference/interruption/ws_transport.test.ts create mode 100644 agents/src/inference/interruption/ws_transport.ts diff --git a/.changeset/config.json b/.changeset/config.json index af66336b2..29b38eb85 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -8,13 +8,7 @@ ], "commit": false, "ignore": ["livekit-agents-examples"], - "fixed": [ - [ - "@livekit/agents", - "@livekit/agents-plugin-*", - "@livekit/agents-plugins-test" - ] - ], + "fixed": [["@livekit/agents", "@livekit/agents-plugin-*", "@livekit/agents-plugins-test"]], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", diff --git a/agents/package.json b/agents/package.json index be6c82854..054c11881 100644 --- a/agents/package.json +++ b/agents/package.json @@ -69,6 +69,7 @@ "heap-js": "^2.6.0", "json-schema": "^0.4.0", "livekit-server-sdk": "^2.14.1", + "ofetch": "^1.5.1", "openai": "^6.8.1", "pidusage": "^4.0.1", "pino": "^8.19.0", diff --git a/agents/src/index.ts b/agents/src/index.ts index 57ace0c7a..e4fd2859b 100644 --- a/agents/src/index.ts +++ b/agents/src/index.ts @@ -36,4 +36,6 @@ export * from './vad.js'; export * from './version.js'; export * from './worker.js'; +export * from './inference/interruption/index.js'; + export { cli, inference, ipc, llm, metrics, stream, stt, telemetry, tokenize, tts, voice }; diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts 
b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts new file mode 100644 index 000000000..eb27a2482 --- /dev/null +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -0,0 +1,192 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { TypedEventEmitter } from '@livekit/typed-emitter'; +import EventEmitter from 'events'; +import { log } from '../../log.js'; +import { InterruptionStreamBase } from './InterruptionStream.js'; +import { + DEFAULT_BASE_URL, + FRAMES_PER_SECOND, + SAMPLE_RATE, + interruptionOptionDefaults, +} from './defaults.js'; +import type { InterruptionDetectionError } from './errors.js'; +import type { InterruptionEvent, InterruptionOptions } from './types.js'; + +type InterruptionCallbacks = { + userInterruptionDetected: (event: InterruptionEvent) => void; + userNonInterruptionDetected: (event: InterruptionEvent) => void; + overlapSpeechEnded: (event: InterruptionEvent) => void; + error: (error: InterruptionDetectionError) => void; +}; + +export type AdaptiveInterruptionDetectorOptions = Omit, 'useProxy'>; + +export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { + options: InterruptionOptions; + private readonly _label: string; + private logger = log(); + // Use Set instead of WeakSet to allow iteration for propagating option updates + private streams: Set = new Set(); + + constructor(options: AdaptiveInterruptionDetectorOptions = {}) { + super(); + + const { + maxAudioDurationInS, + baseUrl, + apiKey, + apiSecret, + audioPrefixDurationInS, + threshold, + detectionIntervalInS, + inferenceTimeout, + minInterruptionDurationInS, + } = { ...interruptionOptionDefaults, ...options }; + + if (maxAudioDurationInS > 3.0) { + throw new Error('maxAudioDurationInS must be less than or equal to 3.0 seconds'); + } + + const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? DEFAULT_BASE_URL; + let lkApiKey = apiKey ?? ''; + let lkApiSecret = apiSecret ?? ''; + let useProxy: boolean; + + // use LiveKit credentials if using the default base URL (inference) + if (lkBaseUrl === DEFAULT_BASE_URL) { + lkApiKey = + apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? ''; + if (!lkApiKey) { + throw new Error( + 'apiKey is required, either as argument or set LIVEKIT_API_KEY environmental variable', + ); + } + + lkApiSecret = + apiSecret ?? + process.env.LIVEKIT_INFERENCE_API_SECRET ?? + process.env.LIVEKIT_API_SECRET ?? 
+ ''; + if (!lkApiSecret) { + throw new Error( + 'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable', + ); + } + + useProxy = true; + } else { + // Force useProxy to false for custom URLs (matching Python behavior) + useProxy = false; + } + + this.options = { + sampleRate: SAMPLE_RATE, + threshold, + minFrames: Math.ceil(minInterruptionDurationInS * FRAMES_PER_SECOND), + maxAudioDurationInS, + audioPrefixDurationInS, + detectionIntervalInS, + inferenceTimeout, + baseUrl: lkBaseUrl, + apiKey: lkApiKey, + apiSecret: lkApiSecret, + useProxy, + minInterruptionDurationInS, + }; + + this._label = `${this.constructor.name}`; + + this.logger.debug( + { + baseUrl: this.options.baseUrl, + detectionIntervalInS: this.options.detectionIntervalInS, + audioPrefixDurationInS: this.options.audioPrefixDurationInS, + maxAudioDurationInS: this.options.maxAudioDurationInS, + minFrames: this.options.minFrames, + threshold: this.options.threshold, + inferenceTimeout: this.options.inferenceTimeout, + useProxy: this.options.useProxy, + }, + 'adaptive interruption detector initialized', + ); + } + + /** + * The model identifier for this detector. + */ + get model(): string { + return 'adaptive interruption'; + } + + /** + * The provider identifier for this detector. + */ + get provider(): string { + return 'livekit'; + } + + /** + * The label for this detector instance. + */ + get label(): string { + return this._label; + } + + /** + * The sample rate used for audio processing. + */ + get sampleRate(): number { + return this.options.sampleRate; + } + + /** + * Emit an error event from the detector. + */ + emitError(error: InterruptionDetectionError): void { + this.emit('error', error); + } + + /** + * Creates a new InterruptionStreamBase for internal use. + * The stream can receive audio frames and sentinels via pushFrame(). + * Use this when you need direct access to the stream for pushing frames. + */ + createStream(): InterruptionStreamBase { + const streamBase = new InterruptionStreamBase(this, {}); + this.streams.add(streamBase); + return streamBase; + } + + /** + * Remove a stream from tracking (called when stream is closed). + */ + removeStream(stream: InterruptionStreamBase): void { + this.streams.delete(stream); + } + + /** + * Update options for the detector and propagate to all active streams. + * For WebSocket streams, this triggers a reconnection with new settings. + */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + } + + // Propagate option updates to all active streams (matching Python behavior) + const updatePromises: Promise[] = []; + for (const stream of this.streams) { + updatePromises.push(stream.updateOptions(options)); + } + await Promise.all(updatePromises); + } +} diff --git a/agents/src/inference/interruption/InterruptionCacheEntry.ts b/agents/src/inference/interruption/InterruptionCacheEntry.ts new file mode 100644 index 000000000..e6da964d8 --- /dev/null +++ b/agents/src/inference/interruption/InterruptionCacheEntry.ts @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { estimateProbability } from './utils.js'; + +/** + * Typed cache entry for interruption inference results. + * Mutable to support setOrUpdate pattern from Python's _BoundedCache. + */ +export class InterruptionCacheEntry { + createdAt: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + speechInput?: Int16Array; + probabilities?: number[]; + isInterruption?: boolean; + + constructor(params: { + createdAt: number; + speechInput?: Int16Array; + totalDurationInS?: number; + predictionDurationInS?: number; + detectionDelayInS?: number; + probabilities?: number[]; + isInterruption?: boolean; + }) { + this.createdAt = params.createdAt; + this.totalDurationInS = params.totalDurationInS ?? 0; + this.predictionDurationInS = params.predictionDurationInS ?? 0; + this.detectionDelayInS = params.detectionDelayInS ?? 0; + this.speechInput = params.speechInput; + this.probabilities = params.probabilities; + this.isInterruption = params.isInterruption; + } + + /** + * The conservative estimated probability of the interruption event. + */ + get probability(): number { + return this.probabilities ? estimateProbability(this.probabilities) : 0; + } + + static default(): InterruptionCacheEntry { + return new InterruptionCacheEntry({ createdAt: 0 }); + } +} diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts new file mode 100644 index 000000000..bdd9b178c --- /dev/null +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -0,0 +1,423 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; +import type { Span } from '@opentelemetry/api'; +import { type ReadableStream, TransformStream } from 'stream/web'; +import { log } from '../../log.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { traceTypes } from '../../telemetry/index.js'; +import type { AdaptiveInterruptionDetector } from './AdaptiveInterruptionDetector.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; +import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; +import type { InterruptionDetectionError } from './errors.js'; +import { createHttpTransport } from './http_transport.js'; +import { + type AgentSpeechEnded, + type AgentSpeechStarted, + type ApiConnectOptions, + type Flush, + type InterruptionEvent, + InterruptionEventType, + type InterruptionOptions, + type InterruptionSentinel, + type OverlapSpeechEnded, + type OverlapSpeechStarted, +} from './types.js'; +import { BoundedCache } from './utils.js'; +import { createWsTransport } from './ws_transport.js'; + +// Re-export sentinel types for backwards compatibility +export type { + AgentSpeechEnded, + AgentSpeechStarted, + ApiConnectOptions, + Flush, + InterruptionSentinel, + OverlapSpeechEnded, + OverlapSpeechStarted, +}; + +export class InterruptionStreamSentinel { + static speechStarted(): AgentSpeechStarted { + return { type: 'agent-speech-started' }; + } + + static speechEnded(): AgentSpeechEnded { + return { type: 'agent-speech-ended' }; + } + + static overlapSpeechStarted( + speechDurationInS: number, + userSpeakingSpan: Span, + ): OverlapSpeechStarted { + return { type: 'overlap-speech-started', speechDurationInS, userSpeakingSpan }; + } + + static overlapSpeechEnded(): OverlapSpeechEnded { + return { type: 
'overlap-speech-ended' }; + } + + static flush(): Flush { + return { type: 'flush' }; + } +} + +function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { + span.setAttribute( + traceTypes.ATTR_IS_INTERRUPTION, + (entry.isInterruption ?? false).toString().toLowerCase(), + ); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelayInS); +} + +export class InterruptionStreamBase { + private inputStream: StreamChannel; + + private eventStream: ReadableStream; + + private resampler?: AudioResampler; + + private userSpeakingSpan: Span | undefined; + + private overlapSpeechStartedAt: number | undefined; + + private options: InterruptionOptions; + + private apiOptions: ApiConnectOptions; + + private model: AdaptiveInterruptionDetector; + + private logger = log(); + + // Store reconnect function for WebSocket transport + private wsReconnect?: () => Promise; + + // Mutable transport options that can be updated via updateOptions() + private transportOptions: { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries: number; + }; + + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { + this.inputStream = createStreamChannel< + InterruptionSentinel | AudioFrame, + InterruptionDetectionError + >(); + + this.model = model; + this.options = { ...model.options }; + this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + + // Initialize mutable transport options + this.transportOptions = { + baseUrl: this.options.baseUrl, + apiKey: this.options.apiKey, + apiSecret: this.options.apiSecret, + sampleRate: this.options.sampleRate, + threshold: this.options.threshold, + minFrames: this.options.minFrames, + timeout: this.options.inferenceTimeout, + maxRetries: this.apiOptions.maxRetries, + }; + + this.eventStream = this.setupTransform(); + } + + /** + * Update stream options. For WebSocket transport, this triggers a reconnection. 
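+   * Only `threshold` and `minInterruptionDurationInS` can be updated at runtime;
+   * `minFrames` is recomputed from `minInterruptionDurationInS`.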
+ */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + this.transportOptions.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + this.transportOptions.minFrames = this.options.minFrames; + } + // Trigger WebSocket reconnection if using proxy (WebSocket transport) + if (this.options.useProxy && this.wsReconnect) { + await this.wsReconnect(); + } + } + + private setupTransform(): ReadableStream { + let agentSpeechStarted = false; + let startIdx = 0; + let accumulatedSamples = 0; + let overlapSpeechStarted = false; + // Use BoundedCache with max_len=10 to prevent unbounded memory growth + const cache = new BoundedCache(10); + const inferenceS16Data = new Int16Array( + Math.ceil(this.options.maxAudioDurationInS * this.options.sampleRate), + ).fill(0); + + // State accessors for transport + const getState = () => ({ + overlapSpeechStarted, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + cache, + }); + const setState = (partial: { overlapSpeechStarted?: boolean }) => { + if (partial.overlapSpeechStarted !== undefined) { + overlapSpeechStarted = partial.overlapSpeechStarted; + } + }; + const handleSpanUpdate = (entry: InterruptionCacheEntry) => { + if (this.userSpeakingSpan) { + updateUserSpeakingSpan(this.userSpeakingSpan, entry); + this.userSpeakingSpan = undefined; + } + }; + + // First transform: process input frames/sentinels and output audio slices or events + const audioTransformer = new TransformStream< + InterruptionSentinel | AudioFrame, + Int16Array | InterruptionEvent + >( + { + transform: (chunk, controller) => { + if (chunk instanceof AudioFrame) { + if (!agentSpeechStarted) { + return; + } + if (this.options.sampleRate !== chunk.sampleRate) { + controller.error('the sample rate of the input frames must be consistent'); + return; + } + const result = writeToInferenceS16Data( + chunk, + startIdx, + inferenceS16Data, + this.options.maxAudioDurationInS, + ); + startIdx = result.startIdx; + accumulatedSamples += result.samplesWritten; + + // Send data for inference when enough samples accumulated during overlap + if ( + accumulatedSamples >= + Math.floor(this.options.detectionIntervalInS * this.options.sampleRate) && + overlapSpeechStarted + ) { + // Send a copy of the audio data up to startIdx for inference + const audioSlice = inferenceS16Data.slice(0, startIdx); + accumulatedSamples = 0; + controller.enqueue(audioSlice); + } + } else if (chunk.type === 'agent-speech-started') { + this.logger.debug('agent speech started'); + agentSpeechStarted = true; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'agent-speech-ended') { + this.logger.debug('agent speech ended'); + agentSpeechStarted = false; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { + this.userSpeakingSpan = chunk.userSpeakingSpan; + this.logger.debug('overlap speech started, starting interruption inference'); + overlapSpeechStarted = true; + accumulatedSamples = 0; + // Include both speech duration and audio prefix duration for context + const shiftSize = Math.min( + 
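+            // cap the shift at the number of samples currently buffered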
startIdx, + Math.round(chunk.speechDurationInS * this.options.sampleRate) + + Math.round(this.options.audioPrefixDurationInS * this.options.sampleRate), + ); + // Shift the buffer: copy the last `shiftSize` samples before startIdx + // to the beginning of the buffer. This preserves recent audio context + // (the user's speech that occurred just before overlap was detected). + inferenceS16Data.copyWithin(0, startIdx - shiftSize, startIdx); + startIdx = shiftSize; + cache.clear(); + } else if (chunk.type === 'overlap-speech-ended') { + this.logger.debug('overlap speech ended'); + if (overlapSpeechStarted) { + this.userSpeakingSpan = undefined; + // Use pop with predicate to get only completed requests (matching Python behavior) + // This ensures we don't return incomplete/in-flight requests as the "final" result + let latestEntry = cache.pop( + (entry) => entry.totalDurationInS !== undefined && entry.totalDurationInS > 0, + ); + if (!latestEntry) { + this.logger.debug('no request made for overlap speech'); + latestEntry = InterruptionCacheEntry.default(); + } + const event: InterruptionEvent = { + type: InterruptionEventType.OVERLAP_SPEECH_ENDED, + timestamp: Date.now(), + isInterruption: false, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + speechInput: latestEntry.speechInput, + probabilities: latestEntry.probabilities, + totalDurationInS: latestEntry.totalDurationInS, + detectionDelayInS: latestEntry.detectionDelayInS, + predictionDurationInS: latestEntry.predictionDurationInS, + probability: latestEntry.probability, + }; + controller.enqueue(event); + overlapSpeechStarted = false; + } + } else if (chunk.type === 'flush') { + // no-op + } + }, + }, + { highWaterMark: 32 }, + { highWaterMark: 32 }, + ); + + // Second transform: transport layer (HTTP or WebSocket based on useProxy) + const transportOptions = this.transportOptions; + + let transport: TransformStream; + if (this.options.useProxy) { + const wsResult = createWsTransport(transportOptions, getState, setState, handleSpanUpdate); + transport = wsResult.transport; + this.wsReconnect = wsResult.reconnect; + } else { + transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + } + + const eventEmitter = new TransformStream({ + transform: (chunk, controller) => { + if (chunk.type === InterruptionEventType.INTERRUPTION) { + this.model.emit('userInterruptionDetected', chunk); + } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { + this.model.emit('overlapSpeechEnded', chunk); + } + controller.enqueue(chunk); + }, + }); + + // Pipeline: input -> audioTransformer -> transport -> eventStream + return this.inputStream + .stream() + .pipeThrough(audioTransformer) + .pipeThrough(transport) + .pipeThrough(eventEmitter); + } + + private ensureInputNotEnded() { + if (this.inputStream.closed) { + throw new Error('input stream is closed'); + } + } + + private ensureStreamsNotEnded() { + this.ensureInputNotEnded(); + } + + private getResamplerFor(inputSampleRate: number): AudioResampler { + if (!this.resampler) { + this.resampler = new AudioResampler(inputSampleRate, this.options.sampleRate); + } + return this.resampler; + } + + stream(): ReadableStream { + return this.eventStream; + } + + async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { + this.ensureStreamsNotEnded(); + if (!(frame instanceof AudioFrame)) { + if (frame.type === 'overlap-speech-started') { + this.overlapSpeechStartedAt = Date.now() - frame.speechDurationInS * 1000; + } + return 
this.inputStream.write(frame); + } else if (this.options.sampleRate !== frame.sampleRate) { + const resampler = this.getResamplerFor(frame.sampleRate); + if (resampler.inputRate !== frame.sampleRate) { + throw new Error('the sample rate of the input frames must be consistent'); + } + for (const resampledFrame of resampler.push(frame)) { + await this.inputStream.write(resampledFrame); + } + } else { + await this.inputStream.write(frame); + } + } + + async flush(): Promise { + this.ensureStreamsNotEnded(); + await this.inputStream.write(InterruptionStreamSentinel.flush()); + } + + async endInput(): Promise { + await this.flush(); + await this.inputStream.close(); + } + + async close(): Promise { + if (!this.inputStream.closed) await this.inputStream.close(); + } +} + +/** + * Write the audio frame to the output data array and return the new start index + * and the number of samples written. + */ +function writeToInferenceS16Data( + frame: AudioFrame, + startIdx: number, + outData: Int16Array, + maxAudioDuration: number, +): { startIdx: number; samplesWritten: number } { + const maxWindowSize = Math.floor(maxAudioDuration * frame.sampleRate); + + if (frame.samplesPerChannel > outData.length) { + throw new Error('frame samples are greater than the max window size'); + } + + // Shift the data to the left if the window would overflow + const shift = startIdx + frame.samplesPerChannel - maxWindowSize; + if (shift > 0) { + outData.copyWithin(0, shift, startIdx); + startIdx -= shift; + } + + // Get the frame data as Int16Array + const frameData = new Int16Array( + frame.data.buffer, + frame.data.byteOffset, + frame.samplesPerChannel * frame.channels, + ); + + if (frame.channels > 1) { + // Mix down multiple channels to mono by averaging + for (let i = 0; i < frame.samplesPerChannel; i++) { + let sum = 0; + for (let ch = 0; ch < frame.channels; ch++) { + sum += frameData[i * frame.channels + ch] ?? 0; + } + outData[startIdx + i] = Math.floor(sum / frame.channels); + } + } else { + // Single channel - copy directly + outData.set(frameData, startIdx); + } + + startIdx += frame.samplesPerChannel; + return { startIdx, samplesWritten: frame.samplesPerChannel }; +} diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts new file mode 100644 index 000000000..cd7988f6a --- /dev/null +++ b/agents/src/inference/interruption/defaults.ts @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { ApiConnectOptions } from './InterruptionStream.js'; +import type { InterruptionOptions } from './types.js'; + +export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames +export const THRESHOLD = 0.65; +export const MAX_AUDIO_DURATION_IN_S = 3.0; +export const AUDIO_PREFIX_DURATION_IN_S = 0.5; +export const DETECTION_INTERVAL_IN_S = 0.1; +export const REMOTE_INFERENCE_TIMEOUT_IN_S = 1.0; +export const SAMPLE_RATE = 16000; +export const FRAMES_PER_SECOND = 40; +export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame +export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; + +export const apiConnectDefaults: ApiConnectOptions = { + maxRetries: 3, + retryInterval: 2_000, + timeout: 10_000, +} as const; + +/** + * Calculate the retry interval using exponential backoff with jitter. + * Matches the Python implementation's _interval_for_retry behavior. 
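+ * With the default 2s base interval this yields roughly 2s, 4s, 8s, ... for
+ * attempts 0, 1, 2, ..., each plus up to 25% random jitter.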
+ */
+export function intervalForRetry(
+  attempt: number,
+  baseInterval: number = apiConnectDefaults.retryInterval,
+): number {
+  // Exponential backoff: baseInterval * 2^attempt with some jitter
+  const exponentialDelay = baseInterval * Math.pow(2, attempt);
+  // Add jitter (0-25% of the delay)
+  const jitter = exponentialDelay * Math.random() * 0.25;
+  return exponentialDelay + jitter;
+}
+
+export const interruptionOptionDefaults: InterruptionOptions = {
+  sampleRate: SAMPLE_RATE,
+  threshold: THRESHOLD,
+  minFrames: Math.ceil(MIN_INTERRUPTION_DURATION_IN_S * FRAMES_PER_SECOND),
+  maxAudioDurationInS: MAX_AUDIO_DURATION_IN_S,
+  audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S,
+  detectionIntervalInS: DETECTION_INTERVAL_IN_S,
+  inferenceTimeout: 10_000,
+  baseUrl: DEFAULT_BASE_URL,
+  apiKey: process.env.LIVEKIT_API_KEY || '',
+  apiSecret: process.env.LIVEKIT_API_SECRET || '',
+  useProxy: false,
+  minInterruptionDurationInS: MIN_INTERRUPTION_DURATION_IN_S,
+} as const;
diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts
new file mode 100644
index 000000000..a346b7d28
--- /dev/null
+++ b/agents/src/inference/interruption/errors.ts
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Error thrown during interruption detection.
+ */
+export class InterruptionDetectionError extends Error {
+  readonly type = 'InterruptionDetectionError';
+
+  readonly timestamp: number;
+  readonly label: string;
+  readonly recoverable: boolean;
+
+  constructor(message: string, timestamp: number, label: string, recoverable: boolean) {
+    super(message);
+    this.name = 'InterruptionDetectionError';
+    this.timestamp = timestamp;
+    this.label = label;
+    this.recoverable = recoverable;
+  }
+
+  toString(): string {
+    return `${this.name}: ${this.message} (label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`;
+  }
+}
diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts
new file mode 100644
index 000000000..25f8b7c25
--- /dev/null
+++ b/agents/src/inference/interruption/http_transport.ts
@@ -0,0 +1,182 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+// +// SPDX-License-Identifier: Apache-2.0 +import { ofetch } from 'ofetch'; +import { TransformStream } from 'stream/web'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; +import { intervalForRetry } from './defaults.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; + +export interface PostOptions { + baseUrl: string; + token: string; + signal?: AbortSignal; + timeout?: number; + maxRetries?: number; +} + +export interface PredictOptions { + threshold: number; + minFrames: number; +} + +export interface PredictEndpointResponse { + created_at: number; + is_bargein: boolean; + probabilities: number[]; +} + +export interface PredictResponse { + createdAt: number; + isBargein: boolean; + probabilities: number[]; + predictionDurationInS: number; +} + +export async function predictHTTP( + data: Int16Array, + predictOptions: PredictOptions, + options: PostOptions, +): Promise { + const createdAt = performance.now(); + const url = new URL(`/bargein`, options.baseUrl); + url.searchParams.append('threshold', predictOptions.threshold.toString()); + url.searchParams.append('min_frames', predictOptions.minFrames.toFixed()); + url.searchParams.append('created_at', createdAt.toFixed()); + + let retryCount = 0; + const { created_at, is_bargein, probabilities } = await ofetch( + url.toString(), + { + retry: options.maxRetries ?? 3, + retryDelay: () => { + const delay = intervalForRetry(retryCount); + retryCount++; + return delay; + }, + headers: { + 'Content-Type': 'application/octet-stream', + Authorization: `Bearer ${options.token}`, + }, + signal: options.signal, + timeout: options.timeout, + method: 'POST', + body: data, + }, + ); + + return { + createdAt: created_at, + isBargein: is_bargein, + probabilities, + predictionDurationInS: (performance.now() - createdAt) / 1000, + }; +} + +export interface HttpTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface HttpTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +/** + * Creates an HTTP transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * Each audio slice triggers an HTTP POST request. + * + * @param options - Transport options object. This is read on each request, so mutations + * to threshold/minFrames will be picked up dynamically. 
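+ * @param getState - Accessor for the shared overlap-speech state and result cache.
+ * @param setState - Setter used to clear `overlapSpeechStarted` once an interruption fires.
+ * @param updateUserSpeakingSpan - Optional hook to annotate the active user-speaking span.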
+ */ +export function createHttpTransport( + options: HttpTransportOptions, + getState: () => HttpTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): TransformStream { + const logger = log(); + + return new TransformStream( + { + async transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + const resp = await predictHTTP( + chunk, + { threshold: options.threshold, minFrames: options.minFrames }, + { + baseUrl: options.baseUrl, + timeout: options.timeout, + maxRetries: options.maxRetries, + token: await createAccessToken(options.apiKey, options.apiSecret), + }, + ); + + const { createdAt, isBargein, probabilities, predictionDurationInS } = resp; + const entry = new InterruptionCacheEntry({ + createdAt, + probabilities, + isInterruption: isBargein, + speechInput: chunk, + totalDurationInS: (performance.now() - createdAt) / 1000, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + predictionDurationInS, + }); + state.cache.set(createdAt, entry); + + if (state.overlapSpeechStarted && entry.isInterruption) { + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + isInterruption: entry.isInterruption, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }; + logger.debug( + { + detectionDelayInS: entry.detectionDelayInS, + totalDurationInS: entry.totalDurationInS, + }, + 'interruption detected', + ); + setState({ overlapSpeechStarted: false }); + controller.enqueue(event); + } + } catch (err) { + logger.error({ err }, 'Failed to send audio data over HTTP'); + } + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); +} diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts new file mode 100644 index 000000000..f6f083f38 --- /dev/null +++ b/agents/src/inference/interruption/types.ts @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { Span } from '@opentelemetry/api'; + +/** + * Event types for interruption detection. + */ +export enum InterruptionEventType { + INTERRUPTION = 'interruption', + OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', +} + +/** + * Event emitted when an interruption is detected or overlap speech ends. + */ +export interface InterruptionEvent { + type: InterruptionEventType; + timestamp: number; + isInterruption: boolean; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + overlapSpeechStartedAt?: number; + speechInput?: Int16Array; + probabilities?: number[]; + probability: number; +} + +/** + * Configuration options for interruption detection. 
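+ * Fields suffixed with `InS` are durations in seconds; `sampleRate` is in Hz
+ * and `inferenceTimeout` is in milliseconds (see `interruptionOptionDefaults`).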
+ */ +export interface InterruptionOptions { + sampleRate: number; + threshold: number; + minFrames: number; + maxAudioDurationInS: number; + audioPrefixDurationInS: number; + detectionIntervalInS: number; + inferenceTimeout: number; + minInterruptionDurationInS: number; + baseUrl: string; + apiKey: string; + apiSecret: string; + useProxy: boolean; +} + +/** + * API connection options for transport layers. + */ +export interface ApiConnectOptions { + maxRetries: number; + retryInterval: number; + timeout: number; +} + +// Sentinel types for stream control signals + +export interface AgentSpeechStarted { + type: 'agent-speech-started'; +} + +export interface AgentSpeechEnded { + type: 'agent-speech-ended'; +} + +export interface OverlapSpeechStarted { + type: 'overlap-speech-started'; + speechDurationInS: number; + userSpeakingSpan: Span; +} + +export interface OverlapSpeechEnded { + type: 'overlap-speech-ended'; +} + +export interface Flush { + type: 'flush'; +} + +/** + * Union type for all stream control signals. + */ +export type InterruptionSentinel = + | AgentSpeechStarted + | AgentSpeechEnded + | OverlapSpeechStarted + | OverlapSpeechEnded + | Flush; diff --git a/agents/src/inference/interruption/utils.test.ts b/agents/src/inference/interruption/utils.test.ts new file mode 100644 index 000000000..762bc5ea3 --- /dev/null +++ b/agents/src/inference/interruption/utils.test.ts @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { slidingWindowMinMax } from './utils.js'; + +describe('slidingWindowMinMax', () => { + it('returns -Infinity when array is shorter than window size', () => { + expect(slidingWindowMinMax([0.5, 0.6], 3)).toBe(-Infinity); + expect(slidingWindowMinMax([], 1)).toBe(-Infinity); + }); + + it('returns the max value when window size is 1', () => { + // With window size 1, min of each window is the element itself, + // so max of mins is just the max of the array + expect(slidingWindowMinMax([0.1, 0.5, 0.3, 0.8, 0.2], 1)).toBe(0.8); + }); + + it('finds the best sustained probability across windows', () => { + // Windows of size 3: [0.2, 0.8, 0.7], [0.8, 0.7, 0.3], [0.7, 0.3, 0.9] + // Mins: 0.2, 0.3, 0.3 + // Max of mins: 0.3 + expect(slidingWindowMinMax([0.2, 0.8, 0.7, 0.3, 0.9], 3)).toBe(0.3); + }); + + it('returns the single element when array length equals window size', () => { + // Only one window covering the entire array, return min of that window + expect(slidingWindowMinMax([0.5, 0.9, 0.7], 3)).toBe(0.5); + expect(slidingWindowMinMax([0.8], 1)).toBe(0.8); + }); +}); diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts new file mode 100644 index 000000000..0c5a4bf40 --- /dev/null +++ b/agents/src/inference/interruption/utils.ts @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; + +/** + * A bounded cache that automatically evicts the oldest entries when the cache exceeds max size. + * Uses FIFO eviction strategy. 
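+ * For example, with `maxLen = 2`: `set(a)`, `set(b)`, then `set(c)` evicts `a`.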
+ */ +export class BoundedCache { + private cache: Map = new Map(); + private readonly maxLen: number; + + constructor(maxLen: number = 10) { + this.maxLen = maxLen; + } + + set(key: K, value: V): void { + this.cache.set(key, value); + if (this.cache.size > this.maxLen) { + // Remove the oldest entry (first inserted) + const firstKey = this.cache.keys().next().value as K; + this.cache.delete(firstKey); + } + } + + get(key: K): V | undefined { + return this.cache.get(key); + } + + has(key: K): boolean { + return this.cache.has(key); + } + + delete(key: K): boolean { + return this.cache.delete(key); + } + + /** + * Get existing entry and update it, or create a new one using factory. + * Updates the entry with the provided partial fields. + */ + setOrUpdate( + key: K, + factory: () => T, + updates: Partial<{ [P in keyof T]: T[P] }>, + ): T { + let entry = this.cache.get(key) as T | undefined; + if (entry === undefined) { + entry = factory(); + this.set(key, entry); + } + // Apply updates to the entry + for (const [field, value] of Object.entries(updates)) { + if (value !== undefined) { + (entry as Record)[field] = value; + } + } + return entry; + } + + /** + * Pop the last entry that matches the predicate, or return undefined. + * Only removes and returns the matching entry, preserving others. + */ + pop(predicate?: (value: V) => boolean): V | undefined { + if (predicate === undefined) { + // Pop the last (most recent) entry + const keys = Array.from(this.cache.keys()); + if (keys.length === 0) return undefined; + const lastKey = keys[keys.length - 1]!; + const value = this.cache.get(lastKey); + this.cache.delete(lastKey); + return value; + } + + // Find the last entry matching the predicate (iterating in reverse) + const keys = Array.from(this.cache.keys()); + for (let i = keys.length - 1; i >= 0; i--) { + const key = keys[i]!; + const value = this.cache.get(key)!; + if (predicate(value)) { + this.cache.delete(key); + return value; + } + } + return undefined; + } + + clear(): void { + this.cache.clear(); + } + + get size(): number { + return this.cache.size; + } + + values(): IterableIterator { + return this.cache.values(); + } + + keys(): IterableIterator { + return this.cache.keys(); + } + + entries(): IterableIterator<[K, V]> { + return this.cache.entries(); + } +} + +/** + * Estimate probability using sliding window min-max algorithm. + * Returns a conservative estimate based on the minimum window size. + */ +export function estimateProbability( + probabilities: number[], + windowSizeInS: number = MIN_INTERRUPTION_DURATION_IN_S, +): number { + const minWindow = Math.ceil(windowSizeInS / FRAME_DURATION_IN_S); + if (probabilities.length < minWindow) { + return 0; + } + + return slidingWindowMinMax(probabilities, minWindow); +} + +export function slidingWindowMinMax(probabilities: number[], minWindow: number): number { + if (probabilities.length < minWindow) { + return -Infinity; + } + + let maxOfMins = -Infinity; + + for (let i = 0; i <= probabilities.length - minWindow; i++) { + const windowMin = Math.min(...probabilities.slice(i, i + minWindow)); + maxOfMins = Math.max(maxOfMins, windowMin); + } + + return maxOfMins; +} diff --git a/agents/src/inference/interruption/ws_transport.test.ts b/agents/src/inference/interruption/ws_transport.test.ts new file mode 100644 index 000000000..e44f62fdb --- /dev/null +++ b/agents/src/inference/interruption/ws_transport.test.ts @@ -0,0 +1,243 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { WebSocket, WebSocketServer } from 'ws'; +import { webSocketToStream } from './ws_transport.js'; + +/** Helper to create a WebSocket server and return its port */ +async function createServer(): Promise<{ wss: WebSocketServer; port: number }> { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + const port = (wss.address() as { port: number }).port; + return { wss, port }; +} + +/** Helper to create a connected WebSocket client */ +async function createClient(port: number): Promise { + const ws = new WebSocket(`ws://localhost:${port}`); + // await new Promise((resolve, reject) => { + // ws.once('open', resolve); + // ws.once('error', reject); + // }); + return ws; +} + +describe('webSocketToStream', () => { + describe('readable stream', () => { + it('receives messages from the WebSocket', async () => { + const { wss, port } = await createServer(); + + wss.on('connection', (serverWs) => { + serverWs.send('hello'); + serverWs.send('world'); + serverWs.close(); + }); + + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); + const reader = readable.getReader(); + + const messages: string[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + messages.push(Buffer.from(value).toString()); + } + } finally { + reader.releaseLock(); + } + + expect(messages).toEqual(['hello', 'world']); + + wss.close(); + }); + + it('handles binary messages', async () => { + const { wss, port } = await createServer(); + + const binaryData = new Uint8Array([1, 2, 3, 4, 5]); + + wss.on('connection', (serverWs) => { + serverWs.send(binaryData); + serverWs.close(); + }); + + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); + const reader = readable.getReader(); + + const chunks: Uint8Array[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(new Uint8Array(value)); + } + } finally { + reader.releaseLock(); + } + + expect(chunks).toHaveLength(1); + expect(Array.from(chunks[0]!)).toEqual([1, 2, 3, 4, 5]); + + wss.close(); + }); + + it('handles empty stream when connection closes immediately', async () => { + const { wss, port } = await createServer(); + + wss.on('connection', (serverWs) => { + serverWs.close(); + }); + + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); + const reader = readable.getReader(); + + const chunks: Uint8Array[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(value); + } + } finally { + reader.releaseLock(); + } + + expect(chunks).toEqual([]); + + wss.close(); + }); + }); + + describe('writable stream', () => { + it('sends messages through the WebSocket', async () => { + const { wss, port } = await createServer(); + + const messagesReceived: string[] = []; + const serverClosed = new Promise((resolve) => { + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + messagesReceived.push(data.toString()); + }); + serverWs.on('close', resolve); + }); + }); + + const ws = await createClient(port); + const { writable } = webSocketToStream(ws); + const writer = writable.getWriter(); + + await writer.write(new TextEncoder().encode('hello')); + await writer.write(new TextEncoder().encode('world')); + await writer.close(); + + await 
serverClosed; + + expect(messagesReceived).toEqual(['hello', 'world']); + + wss.close(); + }); + + it('sends binary data through the WebSocket', async () => { + const { wss, port } = await createServer(); + + const chunksReceived: Buffer[] = []; + const serverClosed = new Promise((resolve) => { + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + chunksReceived.push(Buffer.from(data as Buffer)); + }); + serverWs.on('close', resolve); + }); + }); + + const ws = await createClient(port); + const { writable } = webSocketToStream(ws); + const writer = writable.getWriter(); + + const binaryData = new Uint8Array([10, 20, 30, 40, 50]); + await writer.write(binaryData); + await writer.close(); + + await serverClosed; + + expect(chunksReceived).toHaveLength(1); + expect(Array.from(chunksReceived[0]!)).toEqual([10, 20, 30, 40, 50]); + + wss.close(); + }); + }); + + describe('bidirectional communication', () => { + it('supports echo pattern with readable and writable', async () => { + const { wss, port } = await createServer(); + + // Server echoes messages back + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + serverWs.send(data); + }); + }); + + const ws = await createClient(port); + const { readable, writable } = webSocketToStream(ws); + const writer = writable.getWriter(); + const reader = readable.getReader(); + + // Send messages + await writer.write(new TextEncoder().encode('ping1')); + await writer.write(new TextEncoder().encode('ping2')); + + // Read echoed responses + const { value: response1 } = await reader.read(); + const { value: response2 } = await reader.read(); + + expect(Buffer.from(response1!).toString()).toBe('ping1'); + expect(Buffer.from(response2!).toString()).toBe('ping2'); + + reader.releaseLock(); + await writer.close(); + + wss.close(); + }); + }); + + describe('error handling', () => { + it('readable stream ends when WebSocket closes unexpectedly', async () => { + const { wss, port } = await createServer(); + + wss.on('connection', (serverWs) => { + serverWs.send('before close'); + // Terminate connection abruptly + serverWs.terminate(); + }); + + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); + const reader = readable.getReader(); + + const chunks: string[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(Buffer.from(value).toString()); + } + } catch { + // Connection terminated, stream may error + } finally { + reader.releaseLock(); + } + + // Should have received the message sent before termination + expect(chunks).toContain('before close'); + + wss.close(); + }); + }); +}); diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts new file mode 100644 index 000000000..663a9b08e --- /dev/null +++ b/agents/src/inference/interruption/ws_transport.ts @@ -0,0 +1,387 @@ +import { Readable, Writable } from 'node:stream'; +import { TransformStream } from 'stream/web'; +import WebSocket, { createWebSocketStream } from 'ws'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; +import { intervalForRetry } from './defaults.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; + +// WebSocket message types +const MSG_SESSION_CREATE = 'session.create'; +const MSG_SESSION_CLOSE = 'session.close'; 
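+// the two message types above are sent by the client; the types below are received from the server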
+const MSG_SESSION_CREATED = 'session.created'; +const MSG_SESSION_CLOSED = 'session.closed'; +const MSG_INTERRUPTION_DETECTED = 'bargein_detected'; +const MSG_INFERENCE_DONE = 'inference_done'; +const MSG_ERROR = 'error'; + +export interface WsTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface WsTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +interface WsMessage { + type: string; + created_at?: number; + probabilities?: number[]; + prediction_duration?: number; + is_bargein?: boolean; + error?: string; +} + +export function webSocketToStream(ws: WebSocket) { + const duplex = createWebSocketStream(ws); + duplex.on('error', (err) => log().error({ err }, 'WebSocket stream error')); + + // End the write side when the read side ends + duplex.on('end', () => duplex.end()); + + const writable = Writable.toWeb(duplex) as WritableStream; + const readable = Readable.toWeb(duplex) as ReadableStream; + + return { readable, writable }; +} + +/** + * Creates a WebSocket connection and returns web-standard streams. + */ +async function connectWebSocket(options: WsTransportOptions): Promise<{ + readable: ReadableStream; + writable: WritableStream; + ws: WebSocket; +}> { + const baseUrl = options.baseUrl.replace(/^http/, 'ws'); + const url = `${baseUrl}/bargein`; + const token = await createAccessToken(options.apiKey, options.apiSecret); + + const ws = new WebSocket(url, { + headers: { Authorization: `Bearer ${token}` }, + }); + + const { readable, writable } = webSocketToStream(ws); + + await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + ws.terminate(); + reject(new Error('WebSocket connection timeout')); + }, options.timeout); + ws.once('open', () => { + clearTimeout(timeout); + resolve(); + }); + ws.once('error', (err) => { + clearTimeout(timeout); + ws.terminate(); + reject(err); + }); + }); + + return { readable, writable, ws }; +} + +export interface WsTransportResult { + transport: TransformStream; + reconnect: () => Promise; +} + +/** + * Creates a WebSocket transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * It maintains a persistent WebSocket connection with automatic retry on failure. + * Returns both the transport and a reconnect function for option updates. + */ +export function createWsTransport( + options: WsTransportOptions, + getState: () => WsTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): WsTransportResult { + const logger = log(); + let ws: WebSocket | null = null; + let writer: WritableStreamDefaultWriter | null = null; + let readerTask: Promise | null = null; + let outputController: TransformStreamDefaultController | null = null; + + async function ensureConnection(): Promise { + if (ws && ws.readyState === WebSocket.OPEN) return; + + const maxRetries = options.maxRetries ?? 
3; + let lastError: Error | null = null; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + const conn = await connectWebSocket(options); + ws = conn.ws; + writer = conn.writable.getWriter(); + + // Send session.create message + const sessionCreateMsg = JSON.stringify({ + type: MSG_SESSION_CREATE, + settings: { + sample_rate: options.sampleRate, + num_channels: 1, + threshold: options.threshold, + min_frames: options.minFrames, + encoding: 's16le', + }, + }); + await writer.write(new TextEncoder().encode(sessionCreateMsg)); + + // Start reading responses + readerTask = processResponses(conn.readable); + return; + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + if (attempt < maxRetries) { + const delay = intervalForRetry(attempt); + logger.warn( + { attempt, delay, err: lastError.message }, + 'WebSocket connection failed, retrying', + ); + await new Promise((resolve) => setTimeout(resolve, delay)); + } + } + } + + throw lastError ?? new Error('Failed to connect to WebSocket after retries'); + } + + async function processResponses(readable: ReadableStream): Promise { + const reader = readable.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + + // Process complete JSON messages (newline-delimited or single messages) + const lines = buffer.split('\n'); + buffer = lines.pop() ?? ''; + + for (const line of lines) { + if (line.trim()) { + try { + const message: WsMessage = JSON.parse(line); + handleMessage(message); + } catch { + logger.warn({ line }, 'Failed to parse WebSocket message'); + } + } + } + + // Also try parsing buffer as complete message (for non-newline-delimited) + if (buffer.trim()) { + try { + const message: WsMessage = JSON.parse(buffer); + handleMessage(message); + buffer = ''; + } catch { + // Incomplete message, keep buffering + } + } + } + } finally { + reader.releaseLock(); + } + } + + function handleMessage(message: WsMessage): void { + const state = getState(); + + switch (message.type) { + case MSG_SESSION_CREATED: + logger.debug('WebSocket session created'); + break; + + case MSG_INTERRUPTION_DETECTED: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStarted && state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDurationInS: (performance.now() - createdAt) / 1000, + probabilities: message.probabilities, + isInterruption: true, + predictionDurationInS: message.prediction_duration ?? 
0, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + }); + state.cache.set(createdAt, entry); + + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + + logger.debug( + { + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }, + 'interruption detected', + ); + + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + isInterruption: true, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }; + + outputController?.enqueue(event); + setState({ overlapSpeechStarted: false }); + } + break; + } + + case MSG_INFERENCE_DONE: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDurationInS: (performance.now() - createdAt) / 1000, + predictionDurationInS: message.prediction_duration ?? 0, + probabilities: message.probabilities, + isInterruption: message.is_bargein ?? false, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + }); + state.cache.set(createdAt, entry); + + logger.trace( + { + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + }, + 'interruption inference done', + ); + } + break; + } + + case MSG_SESSION_CLOSED: + logger.debug('WebSocket session closed'); + break; + + case MSG_ERROR: + logger.error({ error: message.error }, 'WebSocket error message received'); + outputController?.error(new Error(`LiveKit Interruption error: ${message.error}`)); + break; + + default: + logger.warn({ type: message.type }, 'Received unexpected WebSocket message type'); + } + } + + async function sendAudioData(audioSlice: Int16Array): Promise { + await ensureConnection(); + if (!writer) throw new Error('WebSocket not connected'); + + const state = getState(); + const createdAt = performance.now(); + + // Store the audio data in cache + state.cache.set(createdAt, new InterruptionCacheEntry({ createdAt, speechInput: audioSlice })); + + // Create header: 8-byte little-endian uint64 timestamp (milliseconds as integer) + const header = new ArrayBuffer(8); + const view = new DataView(header); + const createdAtInt = Math.floor(createdAt); + view.setUint32(0, createdAtInt >>> 0, true); + view.setUint32(4, Math.floor(createdAtInt / 0x100000000) >>> 0, true); + + // Combine header and audio data + const audioBytes = new Uint8Array( + audioSlice.buffer, + audioSlice.byteOffset, + audioSlice.byteLength, + ); + const combined = new Uint8Array(8 + audioBytes.length); + combined.set(new Uint8Array(header), 0); + combined.set(audioBytes, 8); + + await writer.write(combined); + } + + async function close(): Promise { + if (writer && ws?.readyState === WebSocket.OPEN) { + const closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); + await writer.write(new TextEncoder().encode(closeMsg)); + writer.releaseLock(); + writer = null; + } + ws?.close(1000); + ws = null; + await readerTask; + readerTask = null; + } + + /** + * Reconnect the WebSocket with updated options. 
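+   * The connection is torn down immediately and re-established lazily by the
+   * next sendAudioData() call.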
+ * This is called when options are updated via updateOptions(). + */ + async function reconnect(): Promise { + await close(); + // Connection will be re-established on next sendAudioData call + } + + const transport = new TransformStream( + { + start(controller) { + outputController = controller; + }, + + async transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + await sendAudioData(chunk); + } catch (err) { + logger.error({ err }, 'Failed to send audio data over WebSocket'); + } + }, + + async flush() { + await close(); + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); + + return { transport, reconnect }; +} diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 1fb68bab2..75fcfd6c7 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -4,14 +4,15 @@ import type { ReadableStream } from 'node:stream/web'; import { IdentityTransform } from './identity_transform.js'; -export interface StreamChannel { +export interface StreamChannel { write(chunk: T): Promise; close(): Promise; stream(): ReadableStream; + abort(error: E): Promise; readonly closed: boolean; } -export function createStreamChannel(): StreamChannel { +export function createStreamChannel(): StreamChannel { const transform = new IdentityTransform(); const writer = transform.writable.getWriter(); let isClosed = false; @@ -19,6 +20,10 @@ export function createStreamChannel(): StreamChannel { return { write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, + abort: (error: E) => { + isClosed = true; + return writer.abort(error); + }, close: async () => { try { const result = await writer.close(); diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index db76f7bc1..7220ec03a 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -51,6 +51,13 @@ export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence'; export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay'; export const ATTR_END_OF_TURN_DELAY = 'lk.end_of_turn_delay'; +// Adaptive Interruption attributes +export const ATTR_IS_INTERRUPTION = 'lk.is_interruption'; +export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability'; +export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration'; +export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration'; +export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay'; + // metrics export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 935381ff8..f0ba5db02 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -169,6 +169,9 @@ importers: livekit-server-sdk: specifier: ^2.14.1 version: 2.14.1 + ofetch: + specifier: ^1.5.1 + version: 1.5.1 openai: specifier: ^6.8.1 version: 6.8.1(ws@8.18.3)(zod@3.25.76) @@ -3281,6 +3284,9 @@ packages: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} + destr@2.0.5: + resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-indent@6.1.0: resolution: {integrity: 
sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -4448,6 +4454,9 @@ packages: engines: {node: '>=10.5.0'} deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: + resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4507,6 +4516,9 @@ packages: obug@2.1.1: resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + ofetch@1.5.1: + resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} + on-exit-leak-free@2.1.2: resolution: {integrity: sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==} engines: {node: '>=14.0.0'} @@ -5378,6 +5390,9 @@ packages: ufo@1.5.3: resolution: {integrity: sha512-Y7HYmWaFwPUmkoQCUIAYpKqkOf+SbVj/2fJJZ4RJMCfZp0rTGwRbzQD+HghfnhKOjL9E01okqz+ncJskGYfBNw==} + ufo@1.6.3: + resolution: {integrity: sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==} + unbox-primitive@1.0.2: resolution: {integrity: sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==} @@ -7919,6 +7934,8 @@ snapshots: dequal@2.0.3: {} + destr@2.0.5: {} + detect-indent@6.1.0: {} detect-libc@2.0.4: {} @@ -9298,6 +9315,8 @@ snapshots: node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: dependencies: whatwg-url: 5.0.0 @@ -9360,6 +9379,12 @@ snapshots: obug@2.1.1: {} + ofetch@1.5.1: + dependencies: + destr: 2.0.5 + node-fetch-native: 1.6.7 + ufo: 1.6.3 + on-exit-leak-free@2.1.2: {} once@1.4.0: @@ -10409,6 +10434,8 @@ snapshots: ufo@1.5.3: {} + ufo@1.6.3: {} + unbox-primitive@1.0.2: dependencies: call-bind: 1.0.7 From c861f504b6684608559ffab321916cc47000e421 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 29 Jan 2026 10:47:53 +0100 Subject: [PATCH 03/26] Add agent activity interruption detector integration (#991) Co-authored-by: Brian Yin <57741529+Toubat@users.noreply.github.com> --- .github/workflows/test.yml | 2 +- agents/src/index.ts | 2 - .../AdaptiveInterruptionDetector.ts | 7 +- .../interruption/InterruptionStream.ts | 10 +- .../inference/interruption/http_transport.ts | 47 +-- agents/src/inference/interruption/types.ts | 2 +- .../inference/interruption/ws_transport.ts | 187 +++++------ agents/src/stream/stream_channel.ts | 15 + agents/src/voice/agent.ts | 37 ++- agents/src/voice/agent_activity.ts | 130 +++++++- agents/src/voice/agent_session.ts | 78 +++-- agents/src/voice/audio_recognition.ts | 314 +++++++++++++++++- agents/src/voice/events.ts | 2 + agents/src/voice/turn_config/utils.ts | 31 +- examples/src/basic_agent.ts | 22 +- 15 files changed, 674 insertions(+), 212 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5a577688..9c09430b3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name == 'push' run: pnpm test agents - name: Test examples - if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') && secrets.OPENAI_API_KEY != '' + if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') 
env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: pnpm test:examples diff --git a/agents/src/index.ts b/agents/src/index.ts index e4fd2859b..57ace0c7a 100644 --- a/agents/src/index.ts +++ b/agents/src/index.ts @@ -36,6 +36,4 @@ export * from './vad.js'; export * from './version.js'; export * from './worker.js'; -export * from './inference/interruption/index.js'; - export { cli, inference, ipc, llm, metrics, stream, stt, telemetry, tokenize, tts, voice }; diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index eb27a2482..33f526fcc 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -15,9 +15,8 @@ import type { InterruptionDetectionError } from './errors.js'; import type { InterruptionEvent, InterruptionOptions } from './types.js'; type InterruptionCallbacks = { - userInterruptionDetected: (event: InterruptionEvent) => void; - userNonInterruptionDetected: (event: InterruptionEvent) => void; - overlapSpeechEnded: (event: InterruptionEvent) => void; + user_interruption_detected: (event: InterruptionEvent) => void; + user_non_interruption_detected: (event: InterruptionEvent) => void; error: (error: InterruptionDetectionError) => void; }; @@ -74,10 +73,8 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ 'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable', ); } - useProxy = true; } else { - // Force useProxy to false for custom URLs (matching Python behavior) useProxy = false; } diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index bdd9b178c..a91cbe1f6 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -39,17 +39,17 @@ export type { }; export class InterruptionStreamSentinel { - static speechStarted(): AgentSpeechStarted { + static agentSpeechStarted(): AgentSpeechStarted { return { type: 'agent-speech-started' }; } - static speechEnded(): AgentSpeechEnded { + static agentSpeechEnded(): AgentSpeechEnded { return { type: 'agent-speech-ended' }; } static overlapSpeechStarted( speechDurationInS: number, - userSpeakingSpan: Span, + userSpeakingSpan?: Span, ): OverlapSpeechStarted { return { type: 'overlap-speech-started', speechDurationInS, userSpeakingSpan }; } @@ -302,9 +302,9 @@ export class InterruptionStreamBase { const eventEmitter = new TransformStream({ transform: (chunk, controller) => { if (chunk.type === InterruptionEventType.INTERRUPTION) { - this.model.emit('userInterruptionDetected', chunk); + this.model.emit('user_interruption_detected', chunk); } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { - this.model.emit('overlapSpeechEnded', chunk); + this.model.emit('user_non_interruption_detected', chunk); } controller.enqueue(chunk); }, diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 25f8b7c25..82ee8b2a0 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 import { ofetch } from 'ofetch'; import { TransformStream } from 'stream/web'; +import { z } from 'zod'; import { log } from '../../log.js'; import { createAccessToken } from 
'../utils.js';
 import { InterruptionCacheEntry } from './InterruptionCacheEntry.js';
@@ -23,11 +24,13 @@ export interface PredictOptions {
   minFrames: number;
 }
 
-export interface PredictEndpointResponse {
-  created_at: number;
-  is_bargein: boolean;
-  probabilities: number[];
-}
+export const predictEndpointResponseSchema = z.object({
+  created_at: z.number(),
+  is_bargein: z.boolean(),
+  probabilities: z.array(z.number()),
+});
+
+export type PredictEndpointResponse = z.infer<typeof predictEndpointResponseSchema>;
 
 export interface PredictResponse {
   createdAt: number;
@@ -48,25 +51,23 @@ export async function predictHTTP(
   url.searchParams.append('created_at', createdAt.toFixed());
 
   let retryCount = 0;
-  const { created_at, is_bargein, probabilities } = await ofetch<PredictEndpointResponse>(
-    url.toString(),
-    {
-      retry: options.maxRetries ?? 3,
-      retryDelay: () => {
-        const delay = intervalForRetry(retryCount);
-        retryCount++;
-        return delay;
-      },
-      headers: {
-        'Content-Type': 'application/octet-stream',
-        Authorization: `Bearer ${options.token}`,
-      },
-      signal: options.signal,
-      timeout: options.timeout,
-      method: 'POST',
-      body: data,
+  const response = await ofetch(url.toString(), {
+    retry: options.maxRetries ?? 3,
+    retryDelay: () => {
+      const delay = intervalForRetry(retryCount);
+      retryCount++;
+      return delay;
     },
-  );
+    headers: {
+      'Content-Type': 'application/octet-stream',
+      Authorization: `Bearer ${options.token}`,
+    },
+    signal: options.signal,
+    timeout: options.timeout,
+    method: 'POST',
+    body: data,
+  });
+  const { created_at, is_bargein, probabilities } = predictEndpointResponseSchema.parse(response);
 
   return {
     createdAt: created_at,
diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts
index f6f083f38..0bc17dd6f 100644
--- a/agents/src/inference/interruption/types.ts
+++ b/agents/src/inference/interruption/types.ts
@@ -67,7 +67,7 @@ export interface AgentSpeechEnded {
 export interface OverlapSpeechStarted {
   type: 'overlap-speech-started';
   speechDurationInS: number;
-  userSpeakingSpan: Span;
+  userSpeakingSpan?: Span;
 }
 
 export interface OverlapSpeechEnded {
diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts
index 663a9b08e..94137a622 100644
--- a/agents/src/inference/interruption/ws_transport.ts
+++ b/agents/src/inference/interruption/ws_transport.ts
@@ -1,6 +1,6 @@
-import { Readable, Writable } from 'node:stream';
 import { TransformStream } from 'stream/web';
-import WebSocket, { createWebSocketStream } from 'ws';
+import WebSocket from 'ws';
+import { z } from 'zod';
 import { log } from '../../log.js';
 import { createAccessToken } from '../utils.js';
 import { InterruptionCacheEntry } from './InterruptionCacheEntry.js';
@@ -34,46 +34,37 @@ export interface WsTransportState {
   cache: BoundedCache;
 }
 
-interface WsMessage {
-  type: string;
-  created_at?: number;
-  probabilities?: number[];
-  prediction_duration?: number;
-  is_bargein?: boolean;
-  error?: string;
-}
-
-export function webSocketToStream(ws: WebSocket) {
-  const duplex = createWebSocketStream(ws);
-  duplex.on('error', (err) => log().error({ err }, 'WebSocket stream error'));
-
-  // End the write side when the read side ends
-  duplex.on('end', () => duplex.end());
-
-  const writable = Writable.toWeb(duplex) as WritableStream;
-  const readable = Readable.toWeb(duplex) as ReadableStream;
-
-  return { readable, writable };
-}
+const wsMessageSchema = z.union([
+  z.object({
+    type: z.literal(MSG_SESSION_CREATED).or(z.literal(MSG_SESSION_CLOSED)),
+  }),
+  z.object({
+    
type: z.literal(MSG_INTERRUPTION_DETECTED).or(z.literal(MSG_INFERENCE_DONE)), + created_at: z.number().optional(), + probabilities: z.array(z.number()).optional(), + prediction_duration: z.number().optional(), + is_bargein: z.boolean().optional(), + }), + z.object({ + type: z.literal('error'), + message: z.string(), + }), +]); + +type WsMessage = z.infer; /** - * Creates a WebSocket connection and returns web-standard streams. + * Creates a WebSocket connection and waits for it to open. */ -async function connectWebSocket(options: WsTransportOptions): Promise<{ - readable: ReadableStream; - writable: WritableStream; - ws: WebSocket; -}> { +async function connectWebSocket(options: WsTransportOptions): Promise { const baseUrl = options.baseUrl.replace(/^http/, 'ws'); - const url = `${baseUrl}/bargein`; const token = await createAccessToken(options.apiKey, options.apiSecret); + const url = `${baseUrl}/bargein`; const ws = new WebSocket(url, { headers: { Authorization: `Bearer ${token}` }, }); - const { readable, writable } = webSocketToStream(ws); - await new Promise((resolve, reject) => { const timeout = setTimeout(() => { ws.terminate(); @@ -83,14 +74,14 @@ async function connectWebSocket(options: WsTransportOptions): Promise<{ clearTimeout(timeout); resolve(); }); - ws.once('error', (err) => { + ws.once('error', (err: Error) => { clearTimeout(timeout); ws.terminate(); reject(err); }); }); - return { readable, writable, ws }; + return ws; } export interface WsTransportResult { @@ -113,10 +104,27 @@ export function createWsTransport( ): WsTransportResult { const logger = log(); let ws: WebSocket | null = null; - let writer: WritableStreamDefaultWriter | null = null; - let readerTask: Promise | null = null; let outputController: TransformStreamDefaultController | null = null; + function setupMessageHandler(socket: WebSocket): void { + socket.on('message', (data: WebSocket.Data) => { + try { + const message = wsMessageSchema.parse(JSON.parse(data.toString())); + handleMessage(message); + } catch { + logger.warn({ data: data.toString() }, 'Failed to parse WebSocket message'); + } + }); + + socket.on('error', (err: Error) => { + logger.error({ err }, 'WebSocket error'); + }); + + socket.on('close', (code: number, reason: Buffer) => { + logger.debug({ code, reason: reason.toString() }, 'WebSocket closed'); + }); + } + async function ensureConnection(): Promise { if (ws && ws.readyState === WebSocket.OPEN) return; @@ -125,9 +133,8 @@ export function createWsTransport( for (let attempt = 0; attempt <= maxRetries; attempt++) { try { - const conn = await connectWebSocket(options); - ws = conn.ws; - writer = conn.writable.getWriter(); + ws = await connectWebSocket(options); + setupMessageHandler(ws); // Send session.create message const sessionCreateMsg = JSON.stringify({ @@ -140,16 +147,13 @@ export function createWsTransport( encoding: 's16le', }, }); - await writer.write(new TextEncoder().encode(sessionCreateMsg)); - - // Start reading responses - readerTask = processResponses(conn.readable); + ws.send(sessionCreateMsg); return; } catch (err) { lastError = err instanceof Error ? err : new Error(String(err)); if (attempt < maxRetries) { const delay = intervalForRetry(attempt); - logger.warn( + logger.debug( { attempt, delay, err: lastError.message }, 'WebSocket connection failed, retrying', ); @@ -161,49 +165,6 @@ export function createWsTransport( throw lastError ?? 
new Error('Failed to connect to WebSocket after retries'); } - async function processResponses(readable: ReadableStream): Promise { - const reader = readable.getReader(); - const decoder = new TextDecoder(); - let buffer = ''; - - try { - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - buffer += decoder.decode(value, { stream: true }); - - // Process complete JSON messages (newline-delimited or single messages) - const lines = buffer.split('\n'); - buffer = lines.pop() ?? ''; - - for (const line of lines) { - if (line.trim()) { - try { - const message: WsMessage = JSON.parse(line); - handleMessage(message); - } catch { - logger.warn({ line }, 'Failed to parse WebSocket message'); - } - } - } - - // Also try parsing buffer as complete message (for non-newline-delimited) - if (buffer.trim()) { - try { - const message: WsMessage = JSON.parse(buffer); - handleMessage(message); - buffer = ''; - } catch { - // Incomplete message, keep buffering - } - } - } - } finally { - reader.releaseLock(); - } - } - function handleMessage(message: WsMessage): void { const state = getState(); @@ -275,7 +236,7 @@ export function createWsTransport( }); state.cache.set(createdAt, entry); - logger.trace( + logger.debug( { totalDurationInS: entry.totalDurationInS, predictionDurationInS: entry.predictionDurationInS, @@ -291,18 +252,15 @@ export function createWsTransport( break; case MSG_ERROR: - logger.error({ error: message.error }, 'WebSocket error message received'); - outputController?.error(new Error(`LiveKit Interruption error: ${message.error}`)); + outputController?.error(new Error(`LiveKit Interruption error: ${message.message}`)); break; - - default: - logger.warn({ type: message.type }, 'Received unexpected WebSocket message type'); } } - async function sendAudioData(audioSlice: Int16Array): Promise { - await ensureConnection(); - if (!writer) throw new Error('WebSocket not connected'); + function sendAudioData(audioSlice: Int16Array): void { + if (!ws || ws.readyState !== WebSocket.OPEN) { + throw new Error('WebSocket not connected'); + } const state = getState(); const createdAt = performance.now(); @@ -327,20 +285,24 @@ export function createWsTransport( combined.set(new Uint8Array(header), 0); combined.set(audioBytes, 8); - await writer.write(combined); + try { + ws.send(combined); + } catch (e: unknown) { + logger.error(e, `failed to send audio via websocket`); + } } - async function close(): Promise { - if (writer && ws?.readyState === WebSocket.OPEN) { + function close(): void { + if (ws?.readyState === WebSocket.OPEN) { const closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); - await writer.write(new TextEncoder().encode(closeMsg)); - writer.releaseLock(); - writer = null; + try { + ws.send(closeMsg); + } catch (e: unknown) { + logger.error(e, 'failed to send close message'); + } } - ws?.close(1000); + ws?.close(1000); // signal normal websocket closure ws = null; - await readerTask; - readerTask = null; } /** @@ -348,17 +310,18 @@ export function createWsTransport( * This is called when options are updated via updateOptions(). 
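+   *
+   * A rough usage sketch (all names are from this file; the option update
+   * mechanism itself lives outside this function):
+   * @example
+   * const { transport, reconnect } = createWsTransport(options, getState, setState);
+   * options.threshold = 0.8; // options are captured by reference
+   * await reconnect(); // the socket reopens with the new settings on the next ensureConnection()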
*/ async function reconnect(): Promise { - await close(); - // Connection will be re-established on next sendAudioData call + close(); + // Connection will be re-established on next ensureConnection call } const transport = new TransformStream( { - start(controller) { + async start(controller) { outputController = controller; + await ensureConnection(); }, - async transform(chunk, controller) { + transform(chunk, controller) { // Pass through InterruptionEvents unchanged if (!(chunk instanceof Int16Array)) { controller.enqueue(chunk); @@ -369,14 +332,14 @@ export function createWsTransport( if (!state.overlapSpeechStartedAt) return; try { - await sendAudioData(chunk); + sendAudioData(chunk); } catch (err) { logger.error({ err }, 'Failed to send audio data over WebSocket'); } }, - async flush() { - await close(); + flush() { + close(); }, }, { highWaterMark: 2 }, diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 75fcfd6c7..67364e201 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -10,6 +10,7 @@ export interface StreamChannel { stream(): ReadableStream; abort(error: E): Promise; readonly closed: boolean; + addStreamInput(stream: ReadableStream): void; } export function createStreamChannel(): StreamChannel { @@ -24,6 +25,20 @@ export function createStreamChannel(): StreamChannel isClosed = true; return writer.abort(error); }, + addStreamInput: (newInputStream) => { + const reader = newInputStream.getReader(); + (async () => { + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + await writer.write(value); + } + } finally { + reader.releaseLock(); + } + })(); + }, close: async () => { try { const result = await writer.close(); diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index ae116ae73..f587adbab 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -29,6 +29,8 @@ import { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/inde import type { VAD } from '../vad.js'; import type { AgentActivity } from './agent_activity.js'; import type { AgentSession, TurnDetectionMode } from './agent_session.js'; +import type { InterruptionConfig } from './turn_config/interruption.js'; +import type { TurnHandlingConfig } from './turn_config/turnHandling.js'; export const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>(); export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse'); @@ -63,6 +65,7 @@ export interface AgentOptions { instructions: string; chatCtx?: ChatContext; tools?: ToolContext; + /** @deprecated use turnHandling instead */ turnDetection?: TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; @@ -70,6 +73,7 @@ export interface AgentOptions { tts?: TTS | TTSModelString; allowInterruptions?: boolean; minConsecutiveSpeechDelay?: number; + turnHandling?: TurnHandlingConfig; } export class Agent { @@ -79,6 +83,9 @@ export class Agent { private _vad?: VAD; private _llm?: LLM | RealtimeModel; private _tts?: TTS; + private turnHandling?: TurnHandlingConfig; + private _interruptionDetection: InterruptionConfig['mode']; + private _allowInterruptions?: boolean; /** @internal */ _agentActivity?: AgentActivity; @@ -92,17 +99,8 @@ export class Agent { /** @internal */ _tools?: ToolContext; - constructor({ - id, - instructions, - chatCtx, - tools, - turnDetection, - stt, - vad, - llm, - tts, - }: AgentOptions) { + constructor(options: AgentOptions) { + const { id, instructions, chatCtx, tools, 
stt, vad, llm, tts, turnHandling } = options;
     if (id) {
       this._id = id;
     } else {
@@ -126,7 +124,9 @@ export class Agent {
       })
     : ChatContext.empty();
 
-    this.turnDetection = turnDetection;
+    this.turnHandling = turnHandling; // TODO migrate legacy options to new turn handling config when turnConfig is unset
+
+    this.turnDetection = this.turnHandling?.turnDetection;
 
     this._vad = vad;
 
     if (typeof stt === 'string') {
@@ -147,6 +147,11 @@ export class Agent {
       this._tts = tts;
     }
 
+    this._interruptionDetection = this.turnHandling?.interruption.mode;
+    if (this.turnHandling?.interruption.mode !== undefined) {
+      this._allowInterruptions = !!this.turnHandling.interruption.mode;
+    }
+
     this._agentActivity = undefined;
   }
 
@@ -186,6 +191,14 @@ export class Agent {
     return this.getActivityOrThrow().agentSession as AgentSession;
   }
 
+  get interruptionDetection(): InterruptionConfig['mode'] {
+    return this._interruptionDetection;
+  }
+
+  get allowInterruptions(): boolean | undefined {
+    return this._allowInterruptions;
+  }
+
   async onEnter(): Promise<void> {}
 
   async onExit(): Promise<void> {}
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index 2cc66449a..e9be7e90c 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -8,6 +8,8 @@ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api'
 import { Heap } from 'heap-js';
 import { AsyncLocalStorage } from 'node:async_hooks';
 import { ReadableStream } from 'node:stream/web';
+import { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
+import type { InterruptionEvent } from '../inference/interruption/types.js';
 import { type ChatContext, ChatMessage } from '../llm/chat_context.js';
 import {
   type ChatItem,
@@ -86,13 +88,14 @@ interface PreemptiveGeneration {
   createdAt: number;
 }
 
+// TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes
 export class AgentActivity implements RecognitionHooks {
   private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000;
 
   private started = false;
   private audioRecognition?: AudioRecognition;
   private realtimeSession?: RealtimeSession;
   private realtimeSpans?: Map<string, Span>; // Maps response_id to OTEL span for metrics recording
-  private turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
+  private turnDetectionMode?: TurnDetectionMode;
   private logger = log();
   private _draining = false;
   private _currentSpeech?: SpeechHandle;
@@ -104,6 +107,10 @@ export class AgentActivity implements RecognitionHooks {
   // default to null as None, which maps to the default provider tool choice value
   private toolChoice: ToolChoice | null = null;
   private _preemptiveGeneration?: PreemptiveGeneration;
+  private interruptionDetector?: AdaptiveInterruptionDetector;
+  private isInterruptionDetectionEnabled: boolean;
+  private isInterruptionByAudioActivityEnabled: boolean;
+  private isDefaultInterruptionByAudioActivityEnabled: boolean;
 
   agent: Agent;
   agentSession: AgentSession;
@@ -204,6 +211,16 @@ export class AgentActivity implements RecognitionHooks {
           'for more responsive interruption handling.',
       );
     }
+
+    this.interruptionDetector = this.resolveInterruptionDetector();
+    this.isInterruptionDetectionEnabled = !!this.interruptionDetector;
+
+    // this allows audio activity to take over interruption handling temporarily until an
+    // interruption is detected; by default it is true unless turnDetection is manual or realtime_llm
+    this.isInterruptionByAudioActivityEnabled =
+      this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 
'realtime_llm'; + + this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled; } async start(): Promise { @@ -295,6 +312,7 @@ export class AgentActivity implements RecognitionHooks { vad: this.vad, turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, + interruptionDetection: this.interruptionDetector, minEndpointingDelay: this.agentSession.options.minEndpointingDelay, maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, rootSpanContext: this.agentSession.rootSpanContext, @@ -385,7 +403,13 @@ export class AgentActivity implements RecognitionHooks { } } - updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void { + updateOptions({ + toolChoice, + turnDetection, + }: { + toolChoice?: ToolChoice | null; + turnDetection?: TurnDetectionMode; + }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; } @@ -393,6 +417,10 @@ export class AgentActivity implements RecognitionHooks { if (this.realtimeSession) { this.realtimeSession.updateOptions({ toolChoice: this.toolChoice }); } + + this.turnDetectionMode = turnDetection; // TODO fix types + this.isDefaultInterruptionByAudioActivityEnabled = + this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm'; } attachAudioInput(audioStream: ReadableStream): void { @@ -549,6 +577,9 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfOverlapSpeech(0, this.agentSession._userSpeakingSpan); + } } // this.interrupt() is going to raise when allow_interruptions is False, @@ -567,6 +598,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfOverlapSpeech(this.agentSession._userSpeakingSpan); + } this.agentSession._updateUserState('listening'); } @@ -644,6 +678,12 @@ export class AgentActivity implements RecognitionHooks { speechStartTime = speechStartTime - ev.speechDuration; } this.agentSession._updateUserState('speaking', speechStartTime); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfOverlapSpeech( + ev.speechDuration, + this.agentSession._userSpeakingSpan, + ); + } } onEndOfSpeech(ev: VADEvent): void { @@ -651,6 +691,9 @@ export class AgentActivity implements RecognitionHooks { if (ev) { speechEndTime = speechEndTime - ev.silenceDuration; } + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfOverlapSpeech(this.agentSession._userSpeakingSpan); + } this.agentSession._updateUserState('listening', speechEndTime); } @@ -666,6 +709,10 @@ export class AgentActivity implements RecognitionHooks { } private interruptByAudioActivity(): void { + if (!this.isInterruptionByAudioActivityEnabled) { + return; + } + if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) { // skip speech handle interruption if server side turn detection is enabled return; @@ -706,6 +753,14 @@ export class AgentActivity implements RecognitionHooks { } } + onInterruption(ev: InterruptionEvent) { + this.restoreInterruptionByAudioActivity(); + this.interruptByAudioActivity(); + if (this.audioRecognition) { + 
this.audioRecognition.onEndOfAgentSpeech(ev.overlapSpeechStartedAt || ev.timestamp);
+    }
+  }
+
   onInterimTranscript(ev: SpeechEvent): void {
     if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
       // skip stt transcription if userTranscription is enabled on the realtime model
@@ -1246,6 +1301,10 @@ export class AgentActivity implements RecognitionHooks {
         startTime: startedSpeakingAt,
         otelContext: speechHandle._agentTurnContext,
       });
+      if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
+        this.audioRecognition.onStartOfAgentSpeech();
+        this.isInterruptionByAudioActivityEnabled = false;
+      }
     };
 
     if (!audioOutput) {
@@ -1315,6 +1374,10 @@
 
     if (this.agentSession.agentState === 'speaking') {
       this.agentSession._updateAgentState('listening');
+      if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
+        this.audioRecognition.onEndOfAgentSpeech(Date.now());
+      }
+      this.restoreInterruptionByAudioActivity();
     }
   }
 
@@ -1445,6 +1508,10 @@
         startTime: startedSpeakingAt,
         otelContext: speechHandle._agentTurnContext,
       });
+      if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
+        this.audioRecognition.onStartOfAgentSpeech();
+        this.isInterruptionByAudioActivityEnabled = false;
+      }
     };
 
     let audioOut: _AudioOut | null = null;
@@ -1568,6 +1635,10 @@
 
     if (this.agentSession.agentState === 'speaking') {
       this.agentSession._updateAgentState('listening');
+      if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
+        this.audioRecognition.onEndOfAgentSpeech(Date.now());
+        this.restoreInterruptionByAudioActivity();
+      }
     }
 
     this.logger.info(
@@ -1602,6 +1673,10 @@ export class AgentActivity implements RecognitionHooks {
       this.agentSession._updateAgentState('thinking');
     } else if (this.agentSession.agentState === 'speaking') {
       this.agentSession._updateAgentState('listening');
+      if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
+        this.audioRecognition.onEndOfAgentSpeech(Date.now());
+        this.restoreInterruptionByAudioActivity();
+      }
     }
 
     // mark the playout done before waiting for the tool execution
@@ -2375,6 +2450,55 @@ export class AgentActivity implements RecognitionHooks {
       unlock();
     }
   }
+
+  private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined {
+    const interruptionDetection =
+      this.agent.interruptionDetection ?? 
this.agentSession.interruptionDetection; + if ( + !( + this.stt && + this.stt.capabilities.alignedTranscript && + this.stt.capabilities.streaming && + this.vad && + this.turnDetection !== 'manual' && + this.turnDetection !== 'realtime_llm' && + !(this.llm instanceof RealtimeModel) + ) + ) { + if ( + typeof interruptionDetection === 'string' && + ['adaptive', 'vad'].includes(interruptionDetection) + ) { + this.logger.warn( + "interruption_detection is provided, but it's not compatible with the current configuration and will be disabled", + ); + return undefined; + } + } + + if ( + (interruptionDetection !== undefined && interruptionDetection === false) || + interruptionDetection === 'vad' + ) { + return undefined; + } + + const detector = new AdaptiveInterruptionDetector(); + + // TODO cleanup these listeners + detector.on('user_interruption_detected', (ev) => + this.agentSession.emit(AgentSessionEventTypes.UserInterruptionDetected, ev), + ); + detector.on('user_non_interruption_detected', (ev) => + this.agentSession.emit(AgentSessionEventTypes.UserNonInterruptionDetected, ev), + ); + + return detector; + } + + private restoreInterruptionByAudioActivity(): void { + this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled; + } } function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined { diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 29eae5a3f..20c989849 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -15,6 +15,7 @@ import { type STTModelString, type TTSModelString, } from '../inference/index.js'; +import type { InterruptionEvent } from '../inference/interruption/types.js'; import { type JobContext, getJobContext } from '../job.js'; import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; @@ -62,6 +63,9 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; import { RunResult } from './testing/run_result.js'; +import type { InterruptionConfig } from './turn_config/interruption.js'; +import type { TurnHandlingConfig } from './turn_config/turnHandling.js'; +import { migrateLegacyOptions } from './turn_config/utils.js'; export interface VoiceOptions { allowInterruptions: boolean; @@ -75,17 +79,17 @@ export interface VoiceOptions { userAwayTimeout?: number | null; } -const defaultVoiceOptions: VoiceOptions = { - allowInterruptions: true, - discardAudioIfUninterruptible: true, - minInterruptionDuration: 500, - minInterruptionWords: 0, - minEndpointingDelay: 500, - maxEndpointingDelay: 6000, - maxToolSteps: 3, - preemptiveGeneration: false, - userAwayTimeout: 15.0, -} as const; +// const defaultVoiceOptions: VoiceOptions = { +// allowInterruptions: true, +// discardAudioIfUninterruptible: true, +// minInterruptionDuration: 500, +// minInterruptionWords: 0, +// minEndpointingDelay: 500, +// maxEndpointingDelay: 6000, +// maxToolSteps: 3, +// preemptiveGeneration: false, +// userAwayTimeout: 15.0, +// } as const; export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; @@ -99,6 +103,8 @@ export type AgentSessionCallbacks = { [AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void; [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void; 
[AgentSessionEventTypes.Close]: (ev: CloseEvent) => void; + [AgentSessionEventTypes.UserInterruptionDetected]: (ev: InterruptionEvent) => void; + [AgentSessionEventTypes.UserNonInterruptionDetected]: (ev: InterruptionEvent) => void; }; export type AgentSessionOptions = { @@ -110,6 +116,8 @@ export type AgentSessionOptions = { userData?: UserData; voiceOptions?: Partial; connOptions?: SessionConnectOptions; + turnHandling?: Partial; + maxToolSteps?: number; }; export class AgentSession< @@ -150,9 +158,10 @@ export class AgentSession< private ttsErrorCounts = 0; private sessionSpan?: Span; - private userSpeakingSpan?: Span; private agentSpeakingSpan?: Span; + private _interruptionDetection?: InterruptionConfig['mode']; + /** @internal */ _recorderIO?: RecorderIO; @@ -171,19 +180,15 @@ export class AgentSession< /** @internal - Current run state for testing */ _globalRunState?: RunResult; - constructor(opts: AgentSessionOptions) { + /** @internal */ + _userSpeakingSpan?: Span; + + constructor(options: AgentSessionOptions) { super(); - const { - vad, - stt, - llm, - tts, - turnDetection, - userData, - voiceOptions = defaultVoiceOptions, - connOptions, - } = opts; + const opts = migrateLegacyOptions(options); + + const { vad, stt, llm, tts, userData, connOptions, turnHandling } = opts; // Merge user-provided connOptions with defaults this._connOptions = { sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions }, @@ -214,7 +219,8 @@ export class AgentSession< this.tts = tts; } - this.turnDetection = turnDetection; + this.turnDetection = turnHandling?.turnDetection; + this._interruptionDetection = turnHandling?.interruption?.mode; this._userData = userData; // configurable IO @@ -223,7 +229,9 @@ export class AgentSession< // This is the "global" chat context, it holds the entire conversation history this._chatCtx = ChatContext.empty(); - this.options = { ...defaultVoiceOptions, ...voiceOptions }; + + // @ts-ignore FIXME the return type of the migration util has all defaults filled + this.options = opts.voiceOptions; this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this); this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed); @@ -263,6 +271,10 @@ export class AgentSession< return this._connOptions; } + get interruptionDetection() { + return this._interruptionDetection; + } + set userData(value: UserData) { this._userData = value; } @@ -724,8 +736,8 @@ export class AgentSession< return; } - if (state === 'speaking' && this.userSpeakingSpan === undefined) { - this.userSpeakingSpan = tracer.startSpan({ + if (state === 'speaking' && this._userSpeakingSpan === undefined) { + this._userSpeakingSpan = tracer.startSpan({ name: 'user_speaking', context: this.rootSpanContext, startTime: lastSpeakingTime, @@ -733,9 +745,9 @@ export class AgentSession< // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available // (Ref: Python agent_session.py line 1192-1195) - } else if (this.userSpeakingSpan !== undefined) { - this.userSpeakingSpan.end(lastSpeakingTime); - this.userSpeakingSpan = undefined; + } else if (this._userSpeakingSpan !== undefined) { + this._userSpeakingSpan.end(lastSpeakingTime); + this._userSpeakingSpan = undefined; } const oldState = this.userState; @@ -866,9 +878,9 @@ export class AgentSession< this.sessionSpan = undefined; } - if (this.userSpeakingSpan) { - this.userSpeakingSpan.end(); - this.userSpeakingSpan = undefined; + if (this._userSpeakingSpan) { + this._userSpeakingSpan.end(); + 
this._userSpeakingSpan = undefined; } if (this.agentSpeakingSpan) { diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 25d430684..805130ee1 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -5,14 +5,22 @@ import { AudioFrame } from '@livekit/rtc-node'; import type { Context, Span } from '@opentelemetry/api'; import type { WritableStreamDefaultWriter } from 'node:stream/web'; import { ReadableStream } from 'node:stream/web'; +import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js'; +import { InterruptionStreamSentinel } from '../inference/interruption/InterruptionStream.js'; +import { + type InterruptionEvent, + InterruptionEventType, + type InterruptionSentinel, +} from '../inference/interruption/types.js'; import { type ChatContext } from '../llm/chat_context.js'; import { log } from '../log.js'; import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js'; import { IdentityTransform } from '../stream/identity_transform.js'; import { mergeReadableStreams } from '../stream/merge_readable_streams.js'; +import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; -import { Task, delay } from '../utils.js'; +import { Task, delay, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; import type { STTNode } from './io.js'; @@ -32,6 +40,7 @@ export interface PreemptiveGenerationInfo { } export interface RecognitionHooks { + onInterruption: (ev: InterruptionEvent) => void; onStartOfSpeech: (ev: VADEvent) => void; onVADInferenceDone: (ev: VADEvent) => void; onEndOfSpeech: (ev: VADEvent) => void; @@ -54,18 +63,20 @@ export interface AudioRecognitionOptions { stt?: STTNode; vad?: VAD; turnDetector?: _TurnDetector; - turnDetectionMode?: Exclude; + turnDetectionMode?: TurnDetectionMode; + interruptionDetection?: AdaptiveInterruptionDetector; minEndpointingDelay: number; maxEndpointingDelay: number; rootSpanContext?: Context; } +// TODO add ability to update stt/vad/interruption-detection export class AudioRecognition { private hooks: RecognitionHooks; private stt?: STTNode; private vad?: VAD; private turnDetector?: _TurnDetector; - private turnDetectionMode?: Exclude; + private turnDetectionMode?: TurnDetectionMode; private minEndpointingDelay: number; private maxEndpointingDelay: number; private lastLanguage?: string; @@ -96,6 +107,16 @@ export class AudioRecognition { private commitUserTurnTask?: Task; private vadTask?: Task; private sttTask?: Task; + private interruptionTask?: Task; + + // interruption detection + private interruptionDetection?: AdaptiveInterruptionDetector; + private inputStartedAt?: number; + private ignoreUserTranscriptUntil?: number; + private transcriptBuffer: SpeechEvent[]; + private isInterruptionEnabled: boolean; + private isAgentSpeaking: boolean; + private interruptionStreamChannel: StreamChannel; constructor(opts: AudioRecognitionOptions) { this.hooks = opts.recognitionHooks; @@ -109,10 +130,18 @@ export class AudioRecognition { this.rootSpanContext = opts.rootSpanContext; this.deferredInputStream = new DeferredReadableStream(); - const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee(); + const 
[vadInputStream, teedInput] = this.deferredInputStream.stream.tee(); + const [inputStream, sttInputStream] = teedInput.tee(); this.vadInputStream = vadInputStream; this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable); this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter(); + + this.interruptionDetection = opts.interruptionDetection; + this.transcriptBuffer = []; + this.isInterruptionEnabled = !!(opts.interruptionDetection || opts.vad); + this.isAgentSpeaking = false; + this.interruptionStreamChannel = createStreamChannel(); + this.interruptionStreamChannel.addStreamInput(inputStream); } /** @@ -135,6 +164,184 @@ export class AudioRecognition { this.sttTask.result.catch((err) => { this.logger.error(`Error running STT task: ${err}`); }); + + this.interruptionTask = Task.from(({ signal }) => + this.createInterruptionTask(this.interruptionDetection, signal), + ); + this.interruptionTask.result.catch((err) => { + this.logger.error(`Error running interruption task: ${err}`); + }); + } + + async stop() { + await this.sttTask?.cancelAndWait(); + await this.vadTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); + } + + async onStartOfAgentSpeech() { + this.isAgentSpeaking = true; + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); + } + + async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (!this.isInterruptionEnabled) { + this.isAgentSpeaking = false; + return; + } + + const inputOpen = await this.trySendInterruptionSentinel( + InterruptionStreamSentinel.agentSpeechEnded(), + ); + if (!inputOpen) { + this.isAgentSpeaking = false; + return; + } + + if (this.isAgentSpeaking) { + if (this.ignoreUserTranscriptUntil === undefined) { + this.onEndOfOverlapSpeech(); + } + this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil + ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil) + : ignoreUserTranscriptUntil; + + // flush held transcripts if possible + await this.flushHeldTranscripts(); + } + this.isAgentSpeaking = false; + } + + /** Start interruption inference when agent is speaking and overlap speech starts. */ + async onStartOfOverlapSpeech(speechDurationInS: number, userSpeakingSpan?: Span) { + if (this.isAgentSpeaking) { + this.trySendInterruptionSentinel( + InterruptionStreamSentinel.overlapSpeechStarted(speechDurationInS, userSpeakingSpan), + ); + } + } + + async onEndOfOverlapSpeech(userSpeakingSpan?: Span) { + if (!this.isInterruptionEnabled) { + return; + } + if (userSpeakingSpan && userSpeakingSpan.isRecording()) { + userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false'); + } + + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded()); + } + + /** + * Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp. + * If the event has no timestamps, we assume it is the same as the next valid event. 
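+   *
+   * Worked example with hypothetical timings (all values on the same clock): with
+   * `inputStartedAt = 100` and `ignoreUserTranscriptUntil = 106`, a held event whose
+   * first alternative has `endTime = 4` (i.e. 104 absolute) stays dropped, while one
+   * with `endTime = 8` (108 absolute) and every event held after it is re-emitted.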
+   */
+  private async flushHeldTranscripts() {
+    if (
+      !this.isInterruptionEnabled ||
+      this.ignoreUserTranscriptUntil === undefined ||
+      this.transcriptBuffer.length === 0
+    ) {
+      return;
+    }
+
+    if (!this.inputStartedAt) {
+      this.transcriptBuffer = [];
+      this.ignoreUserTranscriptUntil = undefined;
+      return;
+    }
+
+    let emitFromIndex: number | null = null;
+    let shouldFlush = false;
+
+    for (let i = 0; i < this.transcriptBuffer.length; i++) {
+      const ev = this.transcriptBuffer[i];
+      if (!ev || !ev.alternatives || ev.alternatives.length === 0) {
+        emitFromIndex = Math.min(emitFromIndex ?? i, i);
+        continue;
+      }
+      const firstAlternative = ev.alternatives[0];
+      if (
+        firstAlternative.startTime === firstAlternative.endTime &&
+        firstAlternative.startTime === 0
+      ) {
+        this.transcriptBuffer = [];
+        this.ignoreUserTranscriptUntil = undefined;
+        return;
+      }
+
+      if (
+        firstAlternative.endTime > 0 &&
+        firstAlternative.endTime + this.inputStartedAt < this.ignoreUserTranscriptUntil
+      ) {
+        emitFromIndex = null;
+      } else {
+        emitFromIndex = Math.min(emitFromIndex ?? i, i);
+        shouldFlush = true;
+        break;
+      }
+    }
+
+    // `emitFromIndex` can legitimately be 0, so compare against null rather than
+    // relying on truthiness; `shouldFlush` is a boolean, so test it directly.
+    const eventsToEmit =
+      emitFromIndex !== null && shouldFlush ? this.transcriptBuffer.slice(emitFromIndex) : [];
+
+    this.transcriptBuffer = [];
+    this.ignoreUserTranscriptUntil = undefined;
+
+    for (const event of eventsToEmit) {
+      this.logger.trace(
+        {
+          event: event.type,
+        },
+        're-emitting held user transcript',
+      );
+      this.onSTTEvent(event);
+    }
+  }
+
+  private shouldHoldSttEvent(ev: SpeechEvent): boolean {
+    if (!this.isInterruptionEnabled) {
+      return false;
+    }
+    if (this.isAgentSpeaking) {
+      return true;
+    }
+
+    if (this.ignoreUserTranscriptUntil === undefined) {
+      return false;
+    }
+    // sentinel events are always held until we have something concrete to release them
+    if (!ev.alternatives || ev.alternatives.length === 0) {
+      return true;
+    }
+
+    const alternative = ev.alternatives[0];
+
+    if (
+      this.inputStartedAt &&
+      alternative.startTime !== alternative.endTime &&
+      alternative.endTime > 0 &&
+      alternative.endTime + this.inputStartedAt < this.ignoreUserTranscriptUntil
+    ) {
+      return true;
+    }
+    return false;
+  }
+
+  private async trySendInterruptionSentinel(
+    frame: AudioFrame | InterruptionSentinel,
+  ): Promise<boolean> {
+    if (this.isInterruptionEnabled && !this.interruptionStreamChannel.closed) {
+      try {
+        await this.interruptionStreamChannel.write(frame);
+        return true;
+      } catch (e: unknown) {
+        this.logger.warn(
+          `could not forward interruption sentinel: ${e instanceof Error ?
e.message : String(e)}`,
+        );
+      }
+    }
+    return false;
+  }

   private async onSTTEvent(ev: SpeechEvent) {
@@ -159,6 +366,25 @@
       return;
     }

+    // handle interruption detection
+    // - hold the event until the ignore_user_transcript_until expires
+    // - release only relevant events
+    // - allow RECOGNITION_USAGE to pass through immediately
+
+    if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) {
+      if (this.shouldHoldSttEvent(ev)) {
+        this.logger.trace(
+          { event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil },
+          'holding STT event until ignore_user_transcript_until expires',
+        );
+        this.transcriptBuffer.push(ev);
+        return;
+      } else {
+        await this.flushHeldTranscripts();
+        // no return here to allow the new event to be processed normally
+      }
+    }
+
     switch (ev.type) {
       case SpeechEventType.FINAL_TRANSCRIPT:
         this.hooks.onFinalTranscript(ev);
@@ -329,6 +555,12 @@
     }
   }

+  private onInterruptionEvent(ev: InterruptionEvent) {
+    if (ev.type === InterruptionEventType.INTERRUPTION) {
+      this.hooks.onInterruption(ev);
+    }
+  }
+
   private runEOUDetection(chatCtx: ChatContext) {
     this.logger.debug(
       {
@@ -616,6 +848,71 @@
     }
   }

+  private async createInterruptionTask(
+    interruptionDetection: AdaptiveInterruptionDetector | undefined,
+    signal: AbortSignal,
+  ) {
+    if (!interruptionDetection) return;
+
+    const stream = interruptionDetection.createStream();
+    const inputReader = this.interruptionStreamChannel.stream().getReader();
+
+    const cleanup = async () => {
+      try {
+        signal.removeEventListener('abort', abortHandler);
+        eventReader.releaseLock();
+        await stream.close();
+      } catch (e) {
+        this.logger.debug('createInterruptionTask: error during abort handler:', e);
+      }
+    };
+
+    // Forward input frames/sentinels to the interruption stream
+    const forwardTask = (async () => {
+      try {
+        const abortPromise = waitForAbort(signal);
+        while (!signal.aborted) {
+          const res = await Promise.race([inputReader.read(), abortPromise]);
+          if (!res) break;
+          const { value, done } = res;
+          if (done) break;
+          await stream.pushFrame(value);
+        }
+      } finally {
+        inputReader.releaseLock();
+      }
+    })();
+
+    // Read output events from the interruption stream
+    const eventReader = stream.stream().getReader();
+    const abortHandler = async () => {
+      await cleanup();
+    };
+    signal.addEventListener('abort', abortHandler);
+
+    try {
+      const abortPromise = waitForAbort(signal);
+
+      while (!signal.aborted) {
+        this.logger.debug('waiting for interruption event');
+        const res = await Promise.race([eventReader.read(), abortPromise]);
+        if (!res) break;
+        const { done, value: ev } = res;
+        if (done) break;
+        this.logger.debug('got interruption event');
+        this.onInterruptionEvent(ev);
+      }
+    } catch (e) {
+      if (!signal.aborted) {
+        this.logger.error(e, 'Error in interruption task');
+      }
+    } finally {
+      await cleanup();
+      await forwardTask;
+      this.logger.debug('Interruption task closed');
+    }
+  }
+
   setInputAudioStream(audioStream: ReadableStream) {
     this.deferredInputStream.setSource(audioStream);
   }
@@ -688,6 +985,7 @@
     await this.sttTask?.cancelAndWait();
     await this.vadTask?.cancelAndWait();
     await this.bounceEOUTask?.cancelAndWait();
+    await this.interruptionStreamChannel.close();
   }

   private _endUserTurnSpan({
@@ -714,6 +1012,13 @@
   }

   private get vadBaseTurnDetection() {
-    return ['vad', undefined].includes(this.turnDetectionMode);
+    if (typeof this.turnDetectionMode === 'object') {
+      return false;
+    }
+
+    if (this.turnDetectionMode === undefined || this.turnDetectionMode === 'vad') {
+      return true;
+    }
+
+    // explicit fallthrough for 'stt' / 'manual' / 'realtime_llm'
+    return false;
   }
 }
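For context, a sketch of how this recognizer is expected to be wired up (hypothetical `hooks`, `myVad`, and `detector` values; `detector` is an `AdaptiveInterruptionDetector` and `hooks` a `RecognitionHooks` implementation):

    const recognition = new AudioRecognition({
      recognitionHooks: hooks, // hooks.onInterruption receives INTERRUPTION events
      vad: myVad,
      interruptionDetection: detector, // enables the transcript hold/flush logic above
      minEndpointingDelay: 0.5,
      maxEndpointingDelay: 3.0,
    });
    recognition.start();
    await recognition.onStartOfAgentSpeech(); // agent audio begins
    await recognition.onStartOfOverlapSpeech(0.6); // user speaks over the agent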
diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts
index 7d8ff325f..b184ff85a 100644
--- a/agents/src/voice/events.ts
+++ b/agents/src/voice/events.ts
@@ -25,6 +25,8 @@ export enum AgentSessionEventTypes {
   FunctionToolsExecuted = 'function_tools_executed',
   MetricsCollected = 'metrics_collected',
   SpeechCreated = 'speech_created',
+  UserInterruptionDetected = 'user_interruption_detected',
+  UserNonInterruptionDetected = 'user_non_interruption_detected',
   Error = 'error',
   Close = 'close',
 }
diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts
index a4773df1a..7a65500a5 100644
--- a/agents/src/voice/turn_config/utils.ts
+++ b/agents/src/voice/turn_config/utils.ts
@@ -1,16 +1,12 @@
 import type { AgentSessionOptions } from '../agent_session.js';
-import {
-  type TurnHandlingConfig,
-  defaultEndpointingConfig,
-  defaultInterruptionConfig,
-  defaultTurnHandlingConfig,
-} from './index.js';
+import { defaultEndpointingConfig } from './endpointing.js';
+import { defaultInterruptionConfig } from './interruption.js';
+import { type TurnHandlingConfig, defaultTurnHandlingConfig } from './turnHandling.js';

-export function migrateLegacyOptions(
-  legacyOptions: AgentSessionOptions,
-): Omit<AgentSessionOptions, 'voiceOptions' | 'turnDetection'> {
+export function migrateLegacyOptions(
+  legacyOptions: AgentSessionOptions,
+): AgentSessionOptions {
   const { voiceOptions, turnDetection, ...rest } = legacyOptions;
-  const newAgentSessionOptions = rest;

   const turnHandling: TurnHandlingConfig = {
     turnDetection: turnDetection ?? defaultTurnHandlingConfig.turnDetection,
     interruption: {
@@ -29,8 +25,12 @@ export function migrateLegacyOptions(
     userAwayTimeout: voiceOptions?.userAwayTimeout ?? defaultTurnHandlingConfig.userAwayTimeout,
     preemptiveGeneration: voiceOptions?.preemptiveGeneration ??
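// For illustration, the legacy-to-new mapping this function performs, sketched
// with hypothetical inputs:
//
//   migrateLegacyOptions({
//     turnDetection: 'vad',
//     voiceOptions: { minEndpointingDelay: 1.0, allowInterruptions: false },
//   })
//   // -> turnHandling: {
//   //      turnDetection: 'vad',
//   //      endpointing: { minDelay: 1.0, maxDelay: 3.0 }, // maxDelay from defaults
//   //      interruption: { mode: false, ... },            // mode: false <- allowInterruptions: false
//   //      ... },
//   //    plus a voiceOptions mirror rebuilt from turnHandling (see below)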
defaultTurnHandlingConfig.preemptiveGeneration, + + ...rest.turnHandling, }; + const newAgentSessionOptions: AgentSessionOptions = { ...rest, turnHandling }; + if (voiceOptions?.allowInterruptions === false) { turnHandling.interruption.mode = false; } @@ -39,5 +39,16 @@ export function migrateLegacyOptions( if (voiceOptions?.maxToolSteps) { newAgentSessionOptions.maxToolSteps = voiceOptions.maxToolSteps; } + + newAgentSessionOptions.voiceOptions = { + maxToolSteps: newAgentSessionOptions.maxToolSteps, + maxEndpointingDelay: turnHandling.endpointing.maxDelay, + minEndpointingDelay: turnHandling.endpointing.minDelay, + minInterruptionDuration: turnHandling.interruption.minDuration, + minInterruptionWords: turnHandling.interruption.minWords, + allowInterruptions: turnHandling.interruption.mode !== false, + discardAudioIfUninterruptible: turnHandling.interruption.discardAudioIfUninterruptible, + userAwayTimeout: turnHandling.userAwayTimeout, + }; return newAgentSessionOptions; } diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 91e549bff..5fa94153e 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -8,6 +8,7 @@ import { cli, defineAgent, llm, + log, metrics, voice, } from '@livekit/agents'; @@ -39,6 +40,8 @@ export default defineAgent({ }, }); + const logger = log(); + const session = new voice.AgentSession({ // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand // See all available models at https://docs.livekit.io/agents/models/stt/ @@ -55,7 +58,16 @@ export default defineAgent({ // VAD and turn detection are used to determine when the user is speaking and when the agent should respond // See more at https://docs.livekit.io/agents/build/turns vad: ctx.proc.userData.vad! 
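// The `turnHandling` block below opts into adaptive interruption detection.
// `interruption.mode` also accepts 'vad' (VAD-triggered only) or `false`
// (uninterruptible); a hypothetical VAD-only variant of the config below:
//
//   interruption: { mode: 'vad', minDuration: 0.8, minWords: 2 },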
as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), + + turnHandling: { + turnDetection: new livekit.turnDetector.MultilingualModel(), + interruption: { + resumeFalseInterruption: true, + falseInterruptionTimeout: 1, + mode: 'adaptive', + }, + preemptiveGeneration: true, + }, // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), voiceOptions: { @@ -79,6 +91,14 @@ export default defineAgent({ usageCollector.collect(ev.metrics); }); + session.on(voice.AgentSessionEventTypes.UserInterruptionDetected, (ev) => { + logger.warn({ type: ev.type }, 'interruption detected'); + }); + + session.on(voice.AgentSessionEventTypes.UserNonInterruptionDetected, (ev) => { + logger.warn({ type: ev.type }, 'non interruption detected'); + }); + await session.start({ agent, room: ctx.room, From 1862dc33ef2fe6a22fb668a53bf394da2db98edf Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 29 Jan 2026 10:52:26 +0100 Subject: [PATCH 04/26] remove aic --- examples/package.json | 1 - examples/src/basic_agent.ts | 7 +- pnpm-lock.yaml | 333 +----------------------------------- 3 files changed, 3 insertions(+), 338 deletions(-) diff --git a/examples/package.json b/examples/package.json index c9db0d91e..858259219 100644 --- a/examples/package.json +++ b/examples/package.json @@ -40,7 +40,6 @@ "@livekit/agents-plugin-silero": "workspace:*", "@livekit/agents-plugin-xai": "workspace:*", "@livekit/noise-cancellation-node": "^0.1.9", - "@livekit/plugins-ai-coustics": "0.1.7", "@livekit/rtc-node": "catalog:", "@opentelemetry/api": "^1.9.0", "@opentelemetry/api-logs": "^0.54.0", diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 5fa94153e..8c7c36826 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -14,8 +14,7 @@ import { } from '@livekit/agents'; import * as livekit from '@livekit/agents-plugin-livekit'; import * as silero from '@livekit/agents-plugin-silero'; -// import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; -import * as aic from '@livekit/plugins-ai-coustics'; +import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -103,9 +102,7 @@ export default defineAgent({ agent, room: ctx.room, inputOptions: { - noiseCancellation: aic.audioEnhancement(), - // or for krisp use - // noiseCancellation: BackgroundVoiceCancellation(), + noiseCancellation: BackgroundVoiceCancellation(), }, }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index aea4b40ed..a7b882c2c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -278,9 +278,6 @@ importers: '@livekit/noise-cancellation-node': specifier: ^0.1.9 version: 0.1.9 - '@livekit/plugins-ai-coustics': - specifier: 0.1.7 - version: 0.1.7(@livekit/rtc-node@0.13.24) '@livekit/rtc-node': specifier: 'catalog:' version: 0.13.24 @@ -1933,12 +1930,6 @@ packages: cpu: [x64] os: [win32] - '@livekit/plugins-ai-coustics@0.1.7': - resolution: {integrity: sha512-jScAdBttVdazsXvzK8v9lQdcBNZGCNM67kldtdpuXdGaT2X+aLqz4dTwRqnSSio99GfobGz/MMZ5H+3KLdy/9A==} - engines: {node: '>= 18'} - peerDependencies: - '@livekit/rtc-node': '*' - '@livekit/protocol@1.43.0': resolution: {integrity: sha512-WCJ97fa4CBqPDh8pzdszOm/2xmelJ3Dx2vjKBlyb9BzmPQx1LjzVciP6uYFFMCMdrq2l1mjFQBXEz8Z20UCkyw==} @@ -2647,9 +2638,6 @@ packages: '@types/tapable@1.0.6': resolution: {integrity: sha512-W+bw9ds02rAQaMvaLYxAbJ6cvguW/iJXNT6lTssS1ps6QdrMKttqEAMEG/b5CR8TZl3/L7/lH0ZV5nNR1LXikA==} 
- '@types/unzipper@0.10.11': - resolution: {integrity: sha512-D25im2zjyMCcgL9ag6N46+wbtJBnXIr7SI4zHf9eJD2Dw2tEB5e+p5MYkrxKIVRscs5QV0EhtU9rgXSPx90oJg==} - '@types/ws@8.5.10': resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} @@ -2796,72 +2784,6 @@ packages: '@vitest/utils@4.0.17': resolution: {integrity: sha512-RG6iy+IzQpa9SB8HAFHJ9Y+pTzI+h8553MrciN9eC6TFBErqrQaTas4vG+MVj8S4uKk8uTT2p0vgZPnTdxd96w==} - '@yuuang/ffi-rs-android-arm64@1.3.1': - resolution: {integrity: sha512-V4nmlXdOYZEa7GOxSExVG95SLp8FE0iTq2yKeN54UlfNMr3Sik+1Ff57LcCv7qYcn4TBqnBAt5rT3FAM6T6caQ==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [android] - - '@yuuang/ffi-rs-darwin-arm64@1.3.1': - resolution: {integrity: sha512-YlnTMIyzfW3mAULC5ZA774nzQfFlYXM0rrfq/8ZzWt+IMbYk55a++jrI+6JeKV+1EqlDS3TFBEFtjdBNG94KzQ==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [darwin] - - '@yuuang/ffi-rs-darwin-x64@1.3.1': - resolution: {integrity: sha512-sI3LpQQ34SX4nyOHc5yxA7FSqs9qPEUMqW/y/wWo9cuyPpaHMFsi/BeOVYsnC0syp3FrY7gzn6RnD6PlXCktXg==} - engines: {node: '>= 12'} - cpu: [x64] - os: [darwin] - - '@yuuang/ffi-rs-linux-arm-gnueabihf@1.3.1': - resolution: {integrity: sha512-1WkcGkJTlwh4ZA59htKI+RXhiL3oKiYwLv7PO8LUf6FuADK73s5GcXp67iakKu243uYu+qGYr4RHco4ySddYhQ==} - engines: {node: '>= 12'} - cpu: [arm] - os: [linux] - - '@yuuang/ffi-rs-linux-arm64-gnu@1.3.1': - resolution: {integrity: sha512-J2PwqviycZxaEVA0Bwv38LqGDGSB9A1DPN4iYginYJZSvTvKW8kh7Tis0HbZrX1YDKnY8hi3lt0N0tCTNPDH5Q==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [linux] - - '@yuuang/ffi-rs-linux-arm64-musl@1.3.1': - resolution: {integrity: sha512-Hn1W1hBPssTaqikU1Bqp1XUdDdOgbnYVIOtR++LVx66hhrtjf/xrIUQOhTm+NmOFDG16JUKXe1skfM4gpaqYwg==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [linux] - - '@yuuang/ffi-rs-linux-x64-gnu@1.3.1': - resolution: {integrity: sha512-kW6e+oCYZPvpH2ppPsffA18e1aLowtmWTRjVlyHtY04g/nQDepQvDUkkcvInh9fW5jLna7PjHvktW1tVgYIj2A==} - engines: {node: '>= 12'} - cpu: [x64] - os: [linux] - - '@yuuang/ffi-rs-linux-x64-musl@1.3.1': - resolution: {integrity: sha512-HTwblAzruUS16nQPrez3ozvEHm1Xxh8J8w7rZYrpmAcNl1hzyOT8z/hY70M9Rt9fOqQ4Ovgor9qVy/U3ZJo0ZA==} - engines: {node: '>= 12'} - cpu: [x64] - os: [linux] - - '@yuuang/ffi-rs-win32-arm64-msvc@1.3.1': - resolution: {integrity: sha512-WeZkGl2BP1U4tRhEQH+FXLQS52N8obp74smK5AAGOfzPAT1pHkq6+dVkC1QCSIt7dHJs7SPtlnQw+5DkdZYlWA==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [win32] - - '@yuuang/ffi-rs-win32-ia32-msvc@1.3.1': - resolution: {integrity: sha512-rNGgMeCH5mdeHiMiJgt7wWXovZ+FHEfXhU9p4zZBH4n8M1/QnEsRUwlapISPLpILSGpoYS6iBuq9/fUlZY8Mhg==} - engines: {node: '>= 12'} - cpu: [x64, ia32] - os: [win32] - - '@yuuang/ffi-rs-win32-x64-msvc@1.3.1': - resolution: {integrity: sha512-dr2LcLD2CXo2a7BktlOpV68QhayqiI112KxIJC9tBgQO/Dkdg4CPsdqmvzzLhFo64iC5RLl2BT7M5lJImrfUWw==} - engines: {node: '>= 12'} - cpu: [x64] - os: [win32] - abort-controller@3.0.0: resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} engines: {node: '>=6.5'} @@ -3022,19 +2944,9 @@ packages: resolution: {integrity: sha512-pbnl5XzGBdrFU/wT4jqmJVPn2B6UHPBOhzMQkY/SPUPB6QtUXtmBHBIwCbXJol93mOpGMnQyP/+BB19q04xj7g==} engines: {node: '>=4'} - big-integer@1.6.52: - resolution: {integrity: sha512-QxD8cf2eVqJOOz63z6JIN9BzvVs/dlySa5HGSBH5xtR8dPteIRQnBxxKqkNTiT6jbDTF6jAfrd4oMcND9RGbQg==} - engines: {node: '>=0.6'} - bignumber.js@9.3.1: resolution: {integrity: 
sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} - binary@0.3.0: - resolution: {integrity: sha512-D4H1y5KYwpJgK8wk1Cue5LLPgmwHKYSChkbspQg5JtVuR5ulGckxfR62H3AE9UDkdMC8yyXlqYihuz3Aqg2XZg==} - - bluebird@3.4.7: - resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} - boolean@3.2.0: resolution: {integrity: sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==} deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. @@ -3056,17 +2968,9 @@ packages: buffer-equal-constant-time@1.0.1: resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} - buffer-indexof-polyfill@1.0.2: - resolution: {integrity: sha512-I7wzHwA3t1/lwXQh+A5PbNvJxgfo5r3xulgpYDB5zckTu/Z9oUK9biouBKQUjEqzaz3HnAT6TYoovmE+GqSf7A==} - engines: {node: '>=0.10'} - buffer@6.0.3: resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} - buffers@0.1.1: - resolution: {integrity: sha512-9q/rDEGSb/Qsvv2qvzIzdluL5k7AaJOTrw23z9reQthrbF7is4CtlT0DXyO1oei2DCp4uojjzQ7igaSHp1kAEQ==} - engines: {node: '>=0.2.0'} - builtin-modules@3.3.0: resolution: {integrity: sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==} engines: {node: '>=6'} @@ -3116,9 +3020,6 @@ packages: resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} engines: {node: '>=18'} - chainsaw@0.1.0: - resolution: {integrity: sha512-75kWfWt6MEKNC8xYXIdRpDehRYY/tNSgwKaJq+dbbDcxORuVrrQ+SEHoWsniVn9XPYfP4gmdWIeDk/4YNp1rNQ==} - chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -3194,9 +3095,6 @@ packages: resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==} engines: {node: ^14.18.0 || >=16.10.0} - core-util-is@1.0.3: - resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} - cross-spawn@7.0.3: resolution: {integrity: sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==} engines: {node: '>= 8'} @@ -3338,9 +3236,6 @@ packages: resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} engines: {node: '>= 0.4'} - duplexer2@0.1.4: - resolution: {integrity: sha512-asLFVfWWtJ90ZyOUHMqk7/S2w2guQKxUI2itj3d92ADHhxUSbCMGi1f1cBcJ7xM1To+pE/Khbwo1yuNbMEPKeA==} - eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} @@ -3707,9 +3602,6 @@ packages: resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==} engines: {node: ^12.20 || >= 14.13} - ffi-rs@1.3.1: - resolution: {integrity: sha512-ZyNXL9fnclnZV+waQmWB9JrfbIEyxQa1OWtMrHOrAgcC04PgP5hBMG5TdhVN8N4uT/eul8zCFMVnJUukAFFlXA==} - file-entry-cache@6.0.1: resolution: {integrity: sha512-7Gps/XWymbLk2QLYK4NzpMOrYjMhdIxXuIvy2QBsLE6ljuodKvdkWs/cpyJJ3CVIVpH0Oi1Hvg1ovbMzLdFBBg==} engines: {node: ^10.12.0 || >=12.0.0} @@ -3776,11 +3668,6 @@ packages: engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] - fstream@1.0.12: - resolution: 
{integrity: sha512-WvJ193OHa0GHPEL+AycEJgxvBEwyfRkN1vhjca23OaPVMCaLCXTd5qAu82AjTcgP1UJmytkOKb63Ypde7raDIg==} - engines: {node: '>=0.6'} - deprecated: This package is no longer supported. - function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} @@ -3822,9 +3709,6 @@ packages: resolution: {integrity: sha512-g0QYk1dZBxGwk+Ngc+ltRH2IBp2f7zBkBMBJZCDerh6EhlhSR6+9irMCuT/09zD6qkarHUSn529sK/yL4S27mg==} engines: {node: '>= 0.4'} - get-symbol-from-current-process-h@1.0.2: - resolution: {integrity: sha512-syloC6fsCt62ELLrr1VKBM1ggOpMdetX9hTrdW77UQdcApPHLmf7CI7OKcN1c9kYuNxKcDe4iJ4FY9sX3aw2xw==} - get-tsconfig@4.7.5: resolution: {integrity: sha512-ZCuZCnlqNzjb4QprAzXKdpp/gh6KTxSJuw3IBsPnV/7fV4NxC9ckB+vPTt8w7fJA0TaSD7c55BR47JD6MEDyDw==} @@ -4121,9 +4005,6 @@ packages: resolution: {integrity: sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==} engines: {node: '>=0.10.0'} - isarray@1.0.0: - resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} - isarray@2.0.5: resolution: {integrity: sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==} @@ -4250,9 +4131,6 @@ packages: lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} - listenercount@1.0.1: - resolution: {integrity: sha512-3mk/Zag0+IJxeDrxSgaDPy4zZ3w05PRZeJNnlWhzFz5OkX49J4krc+A8X2d2M69vGMBEX0uyl8M+W+8gH+kBqQ==} - livekit-server-sdk@2.13.3: resolution: {integrity: sha512-ItSQ2gE1oz/Ev9mfBRdAw+P05rt/BaYRkldggKz0+3rh/Yt0ag0BLID3VrgCVFVRAQ2YEJKcJJyj5p4epIJ8QA==} engines: {node: '>=18'} @@ -4410,10 +4288,6 @@ packages: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} - mkdirp@0.5.6: - resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} - hasBin: true - mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -4451,9 +4325,6 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} - node-addon-api@3.2.1: - resolution: {integrity: sha512-mmcei9JghVNDYydghQmeDX8KoAm0FAiYyIcUt/N4nhyAipB17pllZQDOJD2fotxABnt4Mdz+dKTO7eftLg4d0A==} - node-domexception@1.0.0: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} @@ -4475,10 +4346,6 @@ packages: resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} - node-gyp-build@4.8.4: - resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==} - hasBin: true - npm-run-path@5.3.0: resolution: {integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -4765,9 +4632,6 @@ packages: resolution: {integrity: sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - 
process-nextick-args@2.0.1: - resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} - process-warning@3.0.0: resolution: {integrity: sha512-mqn0kFRl0EoqhnL0GQ0veqFHyIN1yig9RHh/InzORTUiZHFRAur+aMtRkELNwGs9aNwKS6tg/An4NYBPGwvtzQ==} @@ -4815,9 +4679,6 @@ packages: resolution: {integrity: sha512-VIMnQi/Z4HT2Fxuwg5KrY174U1VdUIASQVWXXyqtNRtxSr9IYkn1rsI6Tb6HsrHCmB7gVpNwX6JxPTHcH6IoTA==} engines: {node: '>=6'} - readable-stream@2.3.8: - resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} - readable-stream@4.5.2: resolution: {integrity: sha512-yjavECdqeZ3GLXNgRXgeQEdz9fvDDkNKyHnbHRFtOr7/LcfgBcmct7t/ET+HaCTqfh06OzoAxrkN/IfjJBVe+g==} engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} @@ -4830,10 +4691,6 @@ packages: resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==} engines: {node: '>= 12.13.0'} - ref-napi@3.0.3: - resolution: {integrity: sha512-LiMq/XDGcgodTYOMppikEtJelWsKQERbLQsYm0IOOnzhwE9xYZC7x8txNnFC9wJNOkPferQI4vD4ZkC0mDyrOA==} - engines: {node: '>= 10.0'} - reflect.getprototypeof@1.0.6: resolution: {integrity: sha512-fmfw4XgoDke3kdI6h4xcUz1dG8uaiv5q9gcEwLS4Pnth2kxT+GZ7YehS1JTMGBQmtV7Y4GFGbs2re2NqhdozUg==} engines: {node: '>= 0.4'} @@ -4875,11 +4732,6 @@ packages: resolution: {integrity: sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} - rimraf@2.7.1: - resolution: {integrity: sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w==} - deprecated: Rimraf versions prior to v4 are no longer supported - hasBin: true - rimraf@3.0.2: resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} deprecated: Rimraf versions prior to v4 are no longer supported @@ -4915,9 +4767,6 @@ packages: resolution: {integrity: sha512-vj6RsCsWBCf19jIeHEfkRMw8DPiBb+DMXklQ/1SGDHOMlHdPUkZXFQ2YdplS23zESTijAcurb1aSgJA3AgMu1Q==} engines: {node: '>=0.4'} - safe-buffer@5.1.2: - resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} - safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -4979,9 +4828,6 @@ packages: resolution: {integrity: sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==} engines: {node: '>= 0.4'} - setimmediate@1.0.5: - resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==} - sharp@0.34.5: resolution: {integrity: sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} @@ -5094,9 +4940,6 @@ packages: resolution: {integrity: sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==} engines: {node: '>= 0.4'} - string_decoder@1.1.1: - resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} - string_decoder@1.3.0: resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} @@ -5243,9 +5086,6 @@ packages: tr46@1.0.1: resolution: {integrity: 
sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} - traverse@0.3.9: - resolution: {integrity: sha512-iawgk0hLP3SxGKDfnDJf8wTz4p2qImnyihM5Hh/sGvQ3K37dPi/w8sRhdNIxYA1TwFwc5mDhIJq+O0RsvXBKdQ==} - tree-kill@1.2.2: resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} hasBin: true @@ -5406,23 +5246,13 @@ packages: undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} - uniffi-bindgen-react-native@0.29.3-1: - resolution: {integrity: sha512-o6gXZsAh55yuvhwF2WSFdIHV4phyfWcCmg4DuyfJWJ7CvUz1UcIz2S4u9SmXAz1jsuqvu6Xc9hexrRBB0a5osg==} - hasBin: true - universalify@0.1.2: resolution: {integrity: sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==} engines: {node: '>= 4.0.0'} - unzipper@0.10.11: - resolution: {integrity: sha512-+BrAq2oFqWod5IESRjL3S8baohbevGcVA+teAIOYWM3pDVdseogqbzhhvvmiyQrUNKFUnDMtELW3X8ykbyDCJw==} - uri-js@4.4.1: resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==} - util-deprecate@1.0.2: - resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - uuid@11.1.0: resolution: {integrity: sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==} hasBin: true @@ -6559,21 +6389,6 @@ snapshots: '@livekit/noise-cancellation-win32-x64@0.1.9': optional: true - '@livekit/plugins-ai-coustics@0.1.7(@livekit/rtc-node@0.13.24)': - dependencies: - '@livekit/rtc-node': 0.13.24 - '@types/unzipper': 0.10.11 - ffi-rs: 1.3.1 - node-fetch: 3.3.2 - pino: 9.6.0 - pino-pretty: 13.0.0 - ref-napi: 3.0.3 - tsx: 4.21.0 - uniffi-bindgen-react-native: 0.29.3-1 - unzipper: 0.10.11 - transitivePeerDependencies: - - supports-color - '@livekit/protocol@1.43.0': dependencies: '@bufbuild/protobuf': 1.10.1 @@ -7278,10 +7093,6 @@ snapshots: '@types/tapable@1.0.6': {} - '@types/unzipper@0.10.11': - dependencies: - '@types/node': 22.19.1 - '@types/ws@8.5.10': dependencies: '@types/node': 22.19.1 @@ -7497,39 +7308,6 @@ snapshots: '@vitest/pretty-format': 4.0.17 tinyrainbow: 3.0.3 - '@yuuang/ffi-rs-android-arm64@1.3.1': - optional: true - - '@yuuang/ffi-rs-darwin-arm64@1.3.1': - optional: true - - '@yuuang/ffi-rs-darwin-x64@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-arm-gnueabihf@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-arm64-gnu@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-arm64-musl@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-x64-gnu@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-x64-musl@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-arm64-msvc@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-ia32-msvc@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-x64-msvc@1.3.1': - optional: true - abort-controller@3.0.0: dependencies: event-target-shim: 5.0.1 @@ -7699,17 +7477,8 @@ snapshots: dependencies: is-windows: 1.0.2 - big-integer@1.6.52: {} - bignumber.js@9.3.1: {} - binary@0.3.0: - dependencies: - buffers: 0.1.1 - chainsaw: 0.1.0 - - bluebird@3.4.7: {} - boolean@3.2.0: {} brace-expansion@1.1.11: @@ -7731,15 +7500,11 @@ snapshots: buffer-equal-constant-time@1.0.1: {} - buffer-indexof-polyfill@1.0.2: {} - buffer@6.0.3: dependencies: base64-js: 1.5.1 ieee754: 1.2.1 - buffers@0.1.1: {} - builtin-modules@3.3.0: {} builtins@5.1.0: @@ -7797,10 +7562,6 @@ snapshots: chai@6.2.2: {} - 
chainsaw@0.1.0: - dependencies: - traverse: 0.3.9 - chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7861,8 +7622,6 @@ snapshots: consola@3.4.2: {} - core-util-is@1.0.3: {} - cross-spawn@7.0.3: dependencies: path-key: 3.1.1 @@ -7975,10 +7734,6 @@ snapshots: es-errors: 1.3.0 gopd: 1.2.0 - duplexer2@0.1.4: - dependencies: - readable-stream: 2.3.8 - eastasianwidth@0.2.0: {} ecdsa-sig-formatter@1.0.11: @@ -8560,20 +8315,6 @@ snapshots: node-domexception: 1.0.0 web-streams-polyfill: 3.3.3 - ffi-rs@1.3.1: - optionalDependencies: - '@yuuang/ffi-rs-android-arm64': 1.3.1 - '@yuuang/ffi-rs-darwin-arm64': 1.3.1 - '@yuuang/ffi-rs-darwin-x64': 1.3.1 - '@yuuang/ffi-rs-linux-arm-gnueabihf': 1.3.1 - '@yuuang/ffi-rs-linux-arm64-gnu': 1.3.1 - '@yuuang/ffi-rs-linux-arm64-musl': 1.3.1 - '@yuuang/ffi-rs-linux-x64-gnu': 1.3.1 - '@yuuang/ffi-rs-linux-x64-musl': 1.3.1 - '@yuuang/ffi-rs-win32-arm64-msvc': 1.3.1 - '@yuuang/ffi-rs-win32-ia32-msvc': 1.3.1 - '@yuuang/ffi-rs-win32-x64-msvc': 1.3.1 - file-entry-cache@6.0.1: dependencies: flat-cache: 3.2.0 @@ -8649,13 +8390,6 @@ snapshots: fsevents@2.3.3: optional: true - fstream@1.0.12: - dependencies: - graceful-fs: 4.2.11 - inherits: 2.0.4 - mkdirp: 0.5.6 - rimraf: 2.7.1 - function-bind@1.1.2: {} function.prototype.name@1.1.6: @@ -8720,8 +8454,6 @@ snapshots: es-errors: 1.3.0 get-intrinsic: 1.2.4 - get-symbol-from-current-process-h@1.0.2: {} - get-tsconfig@4.7.5: dependencies: resolve-pkg-maps: 1.0.0 @@ -9012,8 +8744,6 @@ snapshots: is-windows@1.0.2: {} - isarray@1.0.0: {} - isarray@2.0.5: {} isexe@2.0.0: {} @@ -9139,8 +8869,6 @@ snapshots: lines-and-columns@1.2.4: {} - listenercount@1.0.1: {} - livekit-server-sdk@2.13.3: dependencies: '@bufbuild/protobuf': 1.10.1 @@ -9283,10 +9011,6 @@ snapshots: minipass: 7.1.2 rimraf: 5.0.10 - mkdirp@0.5.6: - dependencies: - minimist: 1.2.8 - mkdirp@3.0.1: {} mlly@1.7.0: @@ -9316,8 +9040,6 @@ snapshots: natural-compare@1.4.0: {} - node-addon-api@3.2.1: {} - node-domexception@1.0.0: {} node-fetch-native@1.6.7: {} @@ -9332,8 +9054,6 @@ snapshots: fetch-blob: 3.2.0 formdata-polyfill: 4.0.10 - node-gyp-build@4.8.4: {} - npm-run-path@5.3.0: dependencies: path-key: 4.0.0 @@ -9640,8 +9360,6 @@ snapshots: ansi-styles: 5.2.0 react-is: 18.3.1 - process-nextick-args@2.0.1: {} - process-warning@3.0.0: {} process-warning@4.0.1: {} @@ -9695,16 +9413,6 @@ snapshots: pify: 4.0.1 strip-bom: 3.0.0 - readable-stream@2.3.8: - dependencies: - core-util-is: 1.0.3 - inherits: 2.0.4 - isarray: 1.0.0 - process-nextick-args: 2.0.1 - safe-buffer: 5.1.2 - string_decoder: 1.1.1 - util-deprecate: 1.0.2 - readable-stream@4.5.2: dependencies: abort-controller: 3.0.0 @@ -9717,15 +9425,6 @@ snapshots: real-require@0.2.0: {} - ref-napi@3.0.3: - dependencies: - debug: 4.4.1 - get-symbol-from-current-process-h: 1.0.2 - node-addon-api: 3.2.1 - node-gyp-build: 4.8.4 - transitivePeerDependencies: - - supports-color - reflect.getprototypeof@1.0.6: dependencies: call-bind: 1.0.7 @@ -9778,10 +9477,6 @@ snapshots: reusify@1.0.4: {} - rimraf@2.7.1: - dependencies: - glob: 7.2.3 - rimraf@3.0.2: dependencies: glob: 7.2.3 @@ -9889,8 +9584,6 @@ snapshots: has-symbols: 1.0.3 isarray: 2.0.5 - safe-buffer@5.1.2: {} - safe-buffer@5.2.1: {} safe-regex-test@1.0.3: @@ -9943,8 +9636,6 @@ snapshots: functions-have-names: 1.2.3 has-property-descriptors: 1.0.2 - setimmediate@1.0.5: {} - sharp@0.34.5: dependencies: '@img/colour': 1.0.0 @@ -10091,10 +9782,6 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.0.0 - string_decoder@1.1.1: - dependencies: - safe-buffer: 5.1.2 - 
string_decoder@1.3.0: dependencies: safe-buffer: 5.2.1 @@ -10226,8 +9913,6 @@ snapshots: dependencies: punycode: 2.3.1 - traverse@0.3.9: {} - tree-kill@1.2.2: {} true-case-path@2.2.1: {} @@ -10351,6 +10036,7 @@ snapshots: get-tsconfig: 4.7.5 optionalDependencies: fsevents: 2.3.3 + optional: true turbo-darwin-64@1.13.3: optional: true @@ -10450,29 +10136,12 @@ snapshots: undici-types@6.21.0: {} - uniffi-bindgen-react-native@0.29.3-1: {} - universalify@0.1.2: {} - unzipper@0.10.11: - dependencies: - big-integer: 1.6.52 - binary: 0.3.0 - bluebird: 3.4.7 - buffer-indexof-polyfill: 1.0.2 - duplexer2: 0.1.4 - fstream: 1.0.12 - graceful-fs: 4.2.11 - listenercount: 1.0.1 - readable-stream: 2.3.8 - setimmediate: 1.0.5 - uri-js@4.4.1: dependencies: punycode: 2.3.1 - util-deprecate@1.0.2: {} - uuid@11.1.0: {} validator@13.12.0: {} From 705ed3304a2c5fab537ef70290ca453622833766 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 29 Jan 2026 10:56:02 +0100 Subject: [PATCH 05/26] reuse --- .../inference/interruption/ws_transport.ts | 3 ++ agents/src/voice/turn_config/endpointing.ts | 3 ++ agents/src/voice/turn_config/interruption.ts | 3 ++ agents/src/voice/turn_config/turnHandling.ts | 3 ++ agents/src/voice/turn_config/utils.test.ts | 36 +++++++++---------- agents/src/voice/turn_config/utils.ts | 3 ++ 6 files changed, 32 insertions(+), 19 deletions(-) diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 94137a622..0c313c282 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import { TransformStream } from 'stream/web'; import WebSocket from 'ws'; import { z } from 'zod'; diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index 28873acd3..4ac0e4dd8 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 /** * Configuration for endpointing, which determines when the user's turn is complete. */ diff --git a/agents/src/voice/turn_config/interruption.ts b/agents/src/voice/turn_config/interruption.ts index 813fd191a..ed391dda3 100644 --- a/agents/src/voice/turn_config/interruption.ts +++ b/agents/src/voice/turn_config/interruption.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 /** * Configuration for interruption handling. */ diff --git a/agents/src/voice/turn_config/turnHandling.ts b/agents/src/voice/turn_config/turnHandling.ts index 6baa05444..de498ef62 100644 --- a/agents/src/voice/turn_config/turnHandling.ts +++ b/agents/src/voice/turn_config/turnHandling.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 import type { TurnDetectionMode } from '../agent_session.js'; import { type EndpointingConfig, defaultEndpointingConfig } from './endpointing.js'; import { type InterruptionConfig, defaultInterruptionConfig } from './interruption.js'; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 20b9f9087..39c486ae0 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from 'vitest'; import type { AgentSessionOptions } from '../agent_session.js'; -import { - defaultEndpointingConfig, - defaultInterruptionConfig, - defaultTurnHandlingConfig, -} from './index.js'; +import { defaultEndpointingConfig } from './endpointing.js'; +import { defaultInterruptionConfig } from './interruption.js'; +import { defaultTurnHandlingConfig } from './turnHandling.js'; import { migrateLegacyOptions } from './utils.js'; describe('migrateLegacyOptions', () => { @@ -43,7 +41,7 @@ describe('migrateLegacyOptions', () => { }; const result = migrateLegacyOptions(input); - expect(result.turnHandling!.interruption.mode).toBe(false); + expect(result.turnHandling!.interruption!.mode).toBe(false); expect('voiceOptions' in result).toBe(false); }); @@ -56,7 +54,7 @@ describe('migrateLegacyOptions', () => { const result = migrateLegacyOptions(input); // mode should remain undefined (the default) when allowInterruptions is true - expect(result.turnHandling!.interruption.mode).toBe(defaultInterruptionConfig.mode); + expect(result.turnHandling!.interruption!.mode).toBe(defaultInterruptionConfig.mode); }); it('should migrate voiceOptions interruption settings', () => { @@ -69,9 +67,9 @@ describe('migrateLegacyOptions', () => { }; const result = migrateLegacyOptions(input); - expect(result.turnHandling!.interruption.minDuration).toBe(0.8); - expect(result.turnHandling!.interruption.minWords).toBe(3); - expect(result.turnHandling!.interruption.discardAudioIfUninterruptible).toBe(false); + expect(result.turnHandling!.interruption!.minDuration).toBe(0.8); + expect(result.turnHandling!.interruption!.minWords).toBe(3); + expect(result.turnHandling!.interruption!.discardAudioIfUninterruptible).toBe(false); }); it('should migrate voiceOptions endpointing settings', () => { @@ -83,8 +81,8 @@ describe('migrateLegacyOptions', () => { }; const result = migrateLegacyOptions(input); - expect(result.turnHandling!.endpointing.minDelay).toBe(1.0); - expect(result.turnHandling!.endpointing.maxDelay).toBe(5.0); + expect(result.turnHandling!.endpointing!.minDelay).toBe(1.0); + expect(result.turnHandling!.endpointing!.maxDelay).toBe(5.0); }); it('should migrate voiceOptions.preemptiveGeneration', () => { @@ -126,12 +124,12 @@ describe('migrateLegacyOptions', () => { const result = migrateLegacyOptions(input); expect(result.turnHandling!.turnDetection).toBe('stt'); - expect(result.turnHandling!.interruption.mode).toBe(false); - expect(result.turnHandling!.interruption.discardAudioIfUninterruptible).toBe(false); - expect(result.turnHandling!.interruption.minDuration).toBe(1.0); - expect(result.turnHandling!.interruption.minWords).toBe(2); - expect(result.turnHandling!.endpointing.minDelay).toBe(0.8); - expect(result.turnHandling!.endpointing.maxDelay).toBe(4.0); + expect(result.turnHandling!.interruption!.mode).toBe(false); + expect(result.turnHandling!.interruption!.discardAudioIfUninterruptible).toBe(false); + 
expect(result.turnHandling!.interruption!.minDuration).toBe(1.0); + expect(result.turnHandling!.interruption!.minWords).toBe(2); + expect(result.turnHandling!.endpointing!.minDelay).toBe(0.8); + expect(result.turnHandling!.endpointing!.maxDelay).toBe(4.0); expect(result.turnHandling!.preemptiveGeneration).toBe(true); expect(result.turnHandling!.userAwayTimeout).toBe(20.0); @@ -161,6 +159,6 @@ describe('migrateLegacyOptions', () => { expect('turnDetection' in result).toBe(false); expect('voiceOptions' in result).toBe(false); expect(result.turnHandling!.turnDetection).toBe('vad'); - expect(result.turnHandling!.endpointing.minDelay).toBe(1.0); + expect(result.turnHandling!.endpointing!.minDelay).toBe(1.0); }); }); diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 7a65500a5..d22655b6a 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import type { AgentSessionOptions } from '../agent_session.js'; import { defaultEndpointingConfig } from './endpointing.js'; import { defaultInterruptionConfig } from './interruption.js'; From 8f5388935041c8986f69e32a651ab8de285c549e Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 29 Jan 2026 11:14:04 +0100 Subject: [PATCH 06/26] remove tests for legacy stream approach --- .../interruption/ws_transport.test.ts | 243 ------------------ 1 file changed, 243 deletions(-) delete mode 100644 agents/src/inference/interruption/ws_transport.test.ts diff --git a/agents/src/inference/interruption/ws_transport.test.ts b/agents/src/inference/interruption/ws_transport.test.ts deleted file mode 100644 index e44f62fdb..000000000 --- a/agents/src/inference/interruption/ws_transport.test.ts +++ /dev/null @@ -1,243 +0,0 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it } from 'vitest'; -import { WebSocket, WebSocketServer } from 'ws'; -import { webSocketToStream } from './ws_transport.js'; - -/** Helper to create a WebSocket server and return its port */ -async function createServer(): Promise<{ wss: WebSocketServer; port: number }> { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - const port = (wss.address() as { port: number }).port; - return { wss, port }; -} - -/** Helper to create a connected WebSocket client */ -async function createClient(port: number): Promise { - const ws = new WebSocket(`ws://localhost:${port}`); - // await new Promise((resolve, reject) => { - // ws.once('open', resolve); - // ws.once('error', reject); - // }); - return ws; -} - -describe('webSocketToStream', () => { - describe('readable stream', () => { - it('receives messages from the WebSocket', async () => { - const { wss, port } = await createServer(); - - wss.on('connection', (serverWs) => { - serverWs.send('hello'); - serverWs.send('world'); - serverWs.close(); - }); - - const ws = await createClient(port); - const { readable } = webSocketToStream(ws); - const reader = readable.getReader(); - - const messages: string[] = []; - try { - while (true) { - const { done, value } = await reader.read(); - if (done) break; - messages.push(Buffer.from(value).toString()); - } - } finally { - reader.releaseLock(); - } - - expect(messages).toEqual(['hello', 'world']); - - wss.close(); - }); - - it('handles binary messages', async () => { - const { wss, port } = await createServer(); - - const binaryData = new Uint8Array([1, 2, 3, 4, 5]); - - wss.on('connection', (serverWs) => { - serverWs.send(binaryData); - serverWs.close(); - }); - - const ws = await createClient(port); - const { readable } = webSocketToStream(ws); - const reader = readable.getReader(); - - const chunks: Uint8Array[] = []; - try { - while (true) { - const { done, value } = await reader.read(); - if (done) break; - chunks.push(new Uint8Array(value)); - } - } finally { - reader.releaseLock(); - } - - expect(chunks).toHaveLength(1); - expect(Array.from(chunks[0]!)).toEqual([1, 2, 3, 4, 5]); - - wss.close(); - }); - - it('handles empty stream when connection closes immediately', async () => { - const { wss, port } = await createServer(); - - wss.on('connection', (serverWs) => { - serverWs.close(); - }); - - const ws = await createClient(port); - const { readable } = webSocketToStream(ws); - const reader = readable.getReader(); - - const chunks: Uint8Array[] = []; - try { - while (true) { - const { done, value } = await reader.read(); - if (done) break; - chunks.push(value); - } - } finally { - reader.releaseLock(); - } - - expect(chunks).toEqual([]); - - wss.close(); - }); - }); - - describe('writable stream', () => { - it('sends messages through the WebSocket', async () => { - const { wss, port } = await createServer(); - - const messagesReceived: string[] = []; - const serverClosed = new Promise((resolve) => { - wss.on('connection', (serverWs) => { - serverWs.on('message', (data) => { - messagesReceived.push(data.toString()); - }); - serverWs.on('close', resolve); - }); - }); - - const ws = await createClient(port); - const { writable } = webSocketToStream(ws); - const writer = writable.getWriter(); - - await writer.write(new TextEncoder().encode('hello')); - await writer.write(new TextEncoder().encode('world')); - await writer.close(); - - await 
serverClosed; - - expect(messagesReceived).toEqual(['hello', 'world']); - - wss.close(); - }); - - it('sends binary data through the WebSocket', async () => { - const { wss, port } = await createServer(); - - const chunksReceived: Buffer[] = []; - const serverClosed = new Promise((resolve) => { - wss.on('connection', (serverWs) => { - serverWs.on('message', (data) => { - chunksReceived.push(Buffer.from(data as Buffer)); - }); - serverWs.on('close', resolve); - }); - }); - - const ws = await createClient(port); - const { writable } = webSocketToStream(ws); - const writer = writable.getWriter(); - - const binaryData = new Uint8Array([10, 20, 30, 40, 50]); - await writer.write(binaryData); - await writer.close(); - - await serverClosed; - - expect(chunksReceived).toHaveLength(1); - expect(Array.from(chunksReceived[0]!)).toEqual([10, 20, 30, 40, 50]); - - wss.close(); - }); - }); - - describe('bidirectional communication', () => { - it('supports echo pattern with readable and writable', async () => { - const { wss, port } = await createServer(); - - // Server echoes messages back - wss.on('connection', (serverWs) => { - serverWs.on('message', (data) => { - serverWs.send(data); - }); - }); - - const ws = await createClient(port); - const { readable, writable } = webSocketToStream(ws); - const writer = writable.getWriter(); - const reader = readable.getReader(); - - // Send messages - await writer.write(new TextEncoder().encode('ping1')); - await writer.write(new TextEncoder().encode('ping2')); - - // Read echoed responses - const { value: response1 } = await reader.read(); - const { value: response2 } = await reader.read(); - - expect(Buffer.from(response1!).toString()).toBe('ping1'); - expect(Buffer.from(response2!).toString()).toBe('ping2'); - - reader.releaseLock(); - await writer.close(); - - wss.close(); - }); - }); - - describe('error handling', () => { - it('readable stream ends when WebSocket closes unexpectedly', async () => { - const { wss, port } = await createServer(); - - wss.on('connection', (serverWs) => { - serverWs.send('before close'); - // Terminate connection abruptly - serverWs.terminate(); - }); - - const ws = await createClient(port); - const { readable } = webSocketToStream(ws); - const reader = readable.getReader(); - - const chunks: string[] = []; - try { - while (true) { - const { done, value } = await reader.read(); - if (done) break; - chunks.push(Buffer.from(value).toString()); - } - } catch { - // Connection terminated, stream may error - } finally { - reader.releaseLock(); - } - - // Should have received the message sent before termination - expect(chunks).toContain('before close'); - - wss.close(); - }); - }); -}); From 7d24bf0380b73b52a193c34a2e1c3c68f090b2a2 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 29 Jan 2026 11:22:52 +0100 Subject: [PATCH 07/26] fix util migration tests --- agents/src/voice/turn_config/utils.test.ts | 16 ++++++++-------- agents/src/voice/turn_config/utils.ts | 6 +++++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 39c486ae0..02e93fc01 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -30,7 +30,7 @@ describe('migrateLegacyOptions', () => { const result = migrateLegacyOptions(input); expect(result.turnHandling!.turnDetection).toBe('vad'); - expect('turnDetection' in result).toBe(false); + expect(result.turnDetection).toBe('vad'); }); it('should set interruption.mode to 
false when allowInterruptions is false', () => {
@@ -42,7 +42,7 @@ describe('migrateLegacyOptions', () => {
     const result = migrateLegacyOptions(input);

     expect(result.turnHandling!.interruption!.mode).toBe(false);
-    expect('voiceOptions' in result).toBe(false);
+    expect(result.voiceOptions?.allowInterruptions).toBe(false);
   });

   it('should not set interruption.mode when allowInterruptions is true', () => {
@@ -133,9 +133,9 @@ describe('migrateLegacyOptions', () => {
     expect(result.turnHandling!.preemptiveGeneration).toBe(true);
     expect(result.turnHandling!.userAwayTimeout).toBe(20.0);

-    // Legacy options should be stripped
-    expect('turnDetection' in result).toBe(false);
-    expect('voiceOptions' in result).toBe(false);
+    // Legacy options should still be available
+    expect(result.turnDetection).toBeDefined();
+    expect(result.voiceOptions).toBeDefined();
   });

   it('should preserve non-legacy options in the result', () => {
@@ -155,9 +155,9 @@ describe('migrateLegacyOptions', () => {
     expect(result.maxToolSteps).toBe(5);
     expect(result.connOptions).toEqual({ maxUnrecoverableErrors: 10 });

-    // Legacy options should be stripped and migrated
-    expect('turnDetection' in result).toBe(false);
-    expect('voiceOptions' in result).toBe(false);
+    // Legacy options should still be available and mirror the new options
+    expect(result.turnDetection).toBe('vad');
+    expect(result.voiceOptions?.minEndpointingDelay).toBe(1.0);
     expect(result.turnHandling!.turnDetection).toBe('vad');
     expect(result.turnHandling!.endpointing!.minDelay).toBe(1.0);
   });
 });
diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts
index d22655b6a..117a3257f 100644
--- a/agents/src/voice/turn_config/utils.ts
+++ b/agents/src/voice/turn_config/utils.ts
@@ -32,7 +32,11 @@ export function migrateLegacyOptions(
     ...rest.turnHandling,
   };

-  const newAgentSessionOptions: AgentSessionOptions = { ...rest, turnHandling };
+  const newAgentSessionOptions: AgentSessionOptions = {
+    ...rest,
+    turnDetection: turnHandling.turnDetection,
+    turnHandling,
+  };

   if (voiceOptions?.allowInterruptions === false) {
     turnHandling.interruption.mode = false;

From c78cf58aa38cccb4f9dde72d64af1986c4a014eb Mon Sep 17 00:00:00 2001
From: lukasIO
Date: Thu, 29 Jan 2026 11:31:42 +0100
Subject: [PATCH 08/26] comment out example tests

---
 .github/workflows/test.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9c09430b3..b4472c81b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -46,11 +46,11 @@ jobs:
       - name: Test agents
         if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name == 'push'
         run: pnpm test agents
-      - name: Test examples
-        if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push')
-        env:
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        run: pnpm test:examples
+      # - name: Test examples
+      #   if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push')
+      #   env:
+      #     OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      #   run: pnpm test:examples
       # TODO (AJS-83) Re-enable once plugins are refactored with abort controllers
       # - name: Test all plugins
       #   if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name != 'pull_request'

From d5b271c421288ce85e7a1efce0baef9d5e5bb8c3 Mon Sep 17 00:00:00 2001
From: Brian Yin
Date: Fri, 30 Jan 2026 17:01:27 +0800
Subject: [PATCH 09/26] Rename files to underscore cases (#1007)

---
 agents/src/inference/interruption/defaults.ts |  2
+- agents/src/inference/interruption/http_transport.ts | 2 +- ...{InterruptionCacheEntry.ts => interruption_cache_entry.ts} | 0 ...aptiveInterruptionDetector.ts => interruption_detector.ts} | 2 +- .../{InterruptionStream.ts => interruption_stream.ts} | 4 ++-- agents/src/inference/interruption/ws_transport.ts | 2 +- agents/src/voice/agent.ts | 2 +- agents/src/voice/agent_activity.ts | 3 +-- agents/src/voice/agent_session.ts | 2 +- agents/src/voice/audio_recognition.ts | 4 ++-- .../voice/turn_config/{turnHandling.ts => turn_handling.ts} | 0 agents/src/voice/turn_config/utils.test.ts | 2 +- agents/src/voice/turn_config/utils.ts | 2 +- 13 files changed, 13 insertions(+), 14 deletions(-) rename agents/src/inference/interruption/{InterruptionCacheEntry.ts => interruption_cache_entry.ts} (100%) rename agents/src/inference/interruption/{AdaptiveInterruptionDetector.ts => interruption_detector.ts} (98%) rename agents/src/inference/interruption/{InterruptionStream.ts => interruption_stream.ts} (98%) rename agents/src/voice/turn_config/{turnHandling.ts => turn_handling.ts} (100%) diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index cd7988f6a..2f1613f43 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import type { ApiConnectOptions } from './InterruptionStream.js'; +import type { ApiConnectOptions } from './interruption_stream.js'; import type { InterruptionOptions } from './types.js'; export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 82ee8b2a0..2b1bc1089 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -6,8 +6,8 @@ import { TransformStream } from 'stream/web'; import { z } from 'zod'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; -import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { intervalForRetry } from './defaults.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; import { type InterruptionEvent, InterruptionEventType } from './types.js'; import type { BoundedCache } from './utils.js'; diff --git a/agents/src/inference/interruption/InterruptionCacheEntry.ts b/agents/src/inference/interruption/interruption_cache_entry.ts similarity index 100% rename from agents/src/inference/interruption/InterruptionCacheEntry.ts rename to agents/src/inference/interruption/interruption_cache_entry.ts diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/interruption_detector.ts similarity index 98% rename from agents/src/inference/interruption/AdaptiveInterruptionDetector.ts rename to agents/src/inference/interruption/interruption_detector.ts index 33f526fcc..2722ecf39 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/interruption_detector.ts @@ -4,7 +4,6 @@ import type { TypedEventEmitter } from '@livekit/typed-emitter'; import EventEmitter from 'events'; import { log } from '../../log.js'; -import { InterruptionStreamBase } from './InterruptionStream.js'; import { DEFAULT_BASE_URL, FRAMES_PER_SECOND, @@ -12,6 +11,7 @@ import { 
interruptionOptionDefaults, } from './defaults.js'; import type { InterruptionDetectionError } from './errors.js'; +import { InterruptionStreamBase } from './interruption_stream.js'; import type { InterruptionEvent, InterruptionOptions } from './types.js'; type InterruptionCallbacks = { diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/interruption_stream.ts similarity index 98% rename from agents/src/inference/interruption/InterruptionStream.ts rename to agents/src/inference/interruption/interruption_stream.ts index a91cbe1f6..36c68da62 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/interruption_stream.ts @@ -7,11 +7,11 @@ import { type ReadableStream, TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; import { traceTypes } from '../../telemetry/index.js'; -import type { AdaptiveInterruptionDetector } from './AdaptiveInterruptionDetector.js'; -import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; import type { InterruptionDetectionError } from './errors.js'; import { createHttpTransport } from './http_transport.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import type { AdaptiveInterruptionDetector } from './interruption_detector.js'; import { type AgentSpeechEnded, type AgentSpeechStarted, diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 0c313c282..d85a488dd 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -6,8 +6,8 @@ import WebSocket from 'ws'; import { z } from 'zod'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; -import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { intervalForRetry } from './defaults.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; import { type InterruptionEvent, InterruptionEventType } from './types.js'; import type { BoundedCache } from './utils.js'; diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index f587adbab..cc94f9e71 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -30,7 +30,7 @@ import type { VAD } from '../vad.js'; import type { AgentActivity } from './agent_activity.js'; import type { AgentSession, TurnDetectionMode } from './agent_session.js'; import type { InterruptionConfig } from './turn_config/interruption.js'; -import type { TurnHandlingConfig } from './turn_config/turnHandling.js'; +import type { TurnHandlingConfig } from './turn_config/turn_handling.js'; export const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>(); export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse'); diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index e9be7e90c..906c624d9 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -8,7 +8,7 @@ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api' import { Heap } from 'heap-js'; import { AsyncLocalStorage } from 'node:async_hooks'; import { ReadableStream } from 'node:stream/web'; -import { AdaptiveInterruptionDetector } from 
'../inference/interruption/AdaptiveInterruptionDetector.js'; +import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; import type { InterruptionEvent } from '../inference/interruption/types.js'; import { type ChatContext, ChatMessage } from '../llm/chat_context.js'; import { @@ -52,7 +52,6 @@ import { type EndOfTurnInfo, type PreemptiveGenerationInfo, type RecognitionHooks, - type _TurnDetector, } from './audio_recognition.js'; import { AgentSessionEventTypes, diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 20c989849..62537d566 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -64,7 +64,7 @@ import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; import { RunResult } from './testing/run_result.js'; import type { InterruptionConfig } from './turn_config/interruption.js'; -import type { TurnHandlingConfig } from './turn_config/turnHandling.js'; +import type { TurnHandlingConfig } from './turn_config/turn_handling.js'; import { migrateLegacyOptions } from './turn_config/utils.js'; export interface VoiceOptions { diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 805130ee1..b6aa67887 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -5,8 +5,8 @@ import { AudioFrame } from '@livekit/rtc-node'; import type { Context, Span } from '@opentelemetry/api'; import type { WritableStreamDefaultWriter } from 'node:stream/web'; import { ReadableStream } from 'node:stream/web'; -import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js'; -import { InterruptionStreamSentinel } from '../inference/interruption/InterruptionStream.js'; +import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; +import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js'; import { type InterruptionEvent, InterruptionEventType, diff --git a/agents/src/voice/turn_config/turnHandling.ts b/agents/src/voice/turn_config/turn_handling.ts similarity index 100% rename from agents/src/voice/turn_config/turnHandling.ts rename to agents/src/voice/turn_config/turn_handling.ts diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 02e93fc01..ff8960432 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -5,7 +5,7 @@ import { describe, expect, it } from 'vitest'; import type { AgentSessionOptions } from '../agent_session.js'; import { defaultEndpointingConfig } from './endpointing.js'; import { defaultInterruptionConfig } from './interruption.js'; -import { defaultTurnHandlingConfig } from './turnHandling.js'; +import { defaultTurnHandlingConfig } from './turn_handling.js'; import { migrateLegacyOptions } from './utils.js'; describe('migrateLegacyOptions', () => { diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 117a3257f..a5b1c48dd 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -4,7 +4,7 @@ import type { AgentSessionOptions } from '../agent_session.js'; import { defaultEndpointingConfig } from './endpointing.js'; import { defaultInterruptionConfig } from './interruption.js'; -import { type TurnHandlingConfig, defaultTurnHandlingConfig } from 
'./turnHandling.js'; +import { type TurnHandlingConfig, defaultTurnHandlingConfig } from './turn_handling.js'; export function migrateLegacyOptions( legacyOptions: AgentSessionOptions, From b020180c3b0383c22a620635ed072fce81352554 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 13:49:45 +0100 Subject: [PATCH 10/26] update date --- agents/src/inference/interruption/defaults.ts | 2 +- agents/src/inference/interruption/errors.ts | 2 +- agents/src/inference/interruption/http_transport.ts | 2 +- agents/src/inference/interruption/interruption_cache_entry.ts | 2 +- agents/src/inference/interruption/interruption_stream.ts | 2 +- agents/src/inference/interruption/types.ts | 2 +- agents/src/inference/interruption/utils.ts | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index 2f1613f43..2a2174558 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import type { ApiConnectOptions } from './interruption_stream.js'; diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts index a346b7d28..30842fc0f 100644 --- a/agents/src/inference/interruption/errors.ts +++ b/agents/src/inference/interruption/errors.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 /** diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 2b1bc1089..43a7f4e05 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { ofetch } from 'ofetch'; diff --git a/agents/src/inference/interruption/interruption_cache_entry.ts b/agents/src/inference/interruption/interruption_cache_entry.ts index e6da964d8..600e25da6 100644 --- a/agents/src/inference/interruption/interruption_cache_entry.ts +++ b/agents/src/inference/interruption/interruption_cache_entry.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { estimateProbability } from './utils.js'; diff --git a/agents/src/inference/interruption/interruption_stream.ts b/agents/src/inference/interruption/interruption_stream.ts index 36c68da62..6409f77fb 100644 --- a/agents/src/inference/interruption/interruption_stream.ts +++ b/agents/src/inference/interruption/interruption_stream.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts index 0bc17dd6f..85c771646 100644 --- a/agents/src/inference/interruption/types.ts +++ b/agents/src/inference/interruption/types.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 import type { Span } from '@opentelemetry/api'; diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts index 0c5a4bf40..71037bc03 100644 --- a/agents/src/inference/interruption/utils.ts +++ b/agents/src/inference/interruption/utils.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; From dbad1e4bd673d88e97986e0092bcbee102cb3ff5 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 13:50:59 +0100 Subject: [PATCH 11/26] update date --- agents/src/voice/turn_config/endpointing.ts | 2 +- agents/src/voice/turn_config/interruption.ts | 2 +- agents/src/voice/turn_config/turn_handling.ts | 2 +- agents/src/voice/turn_config/utils.test.ts | 2 +- agents/src/voice/turn_config/utils.ts | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index 4ac0e4dd8..1c0747ce3 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 /** diff --git a/agents/src/voice/turn_config/interruption.ts b/agents/src/voice/turn_config/interruption.ts index ed391dda3..50616a649 100644 --- a/agents/src/voice/turn_config/interruption.ts +++ b/agents/src/voice/turn_config/interruption.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 /** diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts index de498ef62..4e92d3789 100644 --- a/agents/src/voice/turn_config/turn_handling.ts +++ b/agents/src/voice/turn_config/turn_handling.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import type { TurnDetectionMode } from '../agent_session.js'; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index ff8960432..4dc0e1f5d 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from 'vitest'; diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index a5b1c48dd..8b572ae3e 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { AgentSessionOptions } from '../agent_session.js';

From d88201222bd2ec3d34a314eb816d041b687376fc Mon Sep 17 00:00:00 2001
From: lukasIO
Date: Fri, 30 Jan 2026 13:52:29 +0100
Subject: [PATCH 12/26] update defaults

---
 agents/src/inference/interruption/defaults.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts
index 2a2174558..a7f36e5b9 100644
--- a/agents/src/inference/interruption/defaults.ts
+++ b/agents/src/inference/interruption/defaults.ts
@@ -43,7 +43,7 @@ export const interruptionOptionDefaults: InterruptionOptions = {
   maxAudioDurationInS: MAX_AUDIO_DURATION_IN_S,
   audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S,
   detectionIntervalInS: DETECTION_INTERVAL_IN_S,
-  inferenceTimeout: 10_000,
+  inferenceTimeout: 1_000,
   baseUrl: DEFAULT_BASE_URL,
   apiKey: process.env.LIVEKIT_API_KEY || '',
   apiSecret: process.env.LIVEKIT_API_SECRET || '',

From 67e8f6c449e37604921c96fa1d935815c66b167e Mon Sep 17 00:00:00 2001
From: lukasIO
Date: Fri, 30 Jan 2026 15:14:20 +0100
Subject: [PATCH 13/26] deprecate legacy options and update tests

---
 agents/src/voice/agent_session.ts             |  79 +++++---
 agents/src/voice/turn_config/turn_handling.ts |  17 --
 agents/src/voice/turn_config/utils.test.ts    | 186 ++++++------------
 agents/src/voice/turn_config/utils.ts         |  87 ++++----
 4 files changed, 162 insertions(+), 207 deletions(-)

diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
index 62537d566..e581a8ce0 100644
--- a/agents/src/voice/agent_session.ts
+++ b/agents/src/voice/agent_session.ts
@@ -67,29 +67,51 @@ import type { InterruptionConfig } from './turn_config/interruption.js';
 import type { TurnHandlingConfig } from './turn_config/turn_handling.js';
 import { migrateLegacyOptions } from './turn_config/utils.js';
 
-export interface VoiceOptions {
-  allowInterruptions: boolean;
-  discardAudioIfUninterruptible: boolean;
-  minInterruptionDuration: number;
-  minInterruptionWords: number;
-  minEndpointingDelay: number;
-  maxEndpointingDelay: number;
+export interface SessionOptions {
   maxToolSteps: number;
+  /**
+   * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected.
+   * When `true`, the agent sends inference calls as soon as a user transcript is received rather
+   * than waiting for a definitive turn boundary. This can reduce response latency by overlapping
+   * model inference with user audio, but may incur extra compute if the user interrupts or
+   * revises mid-utterance.
+   * @defaultValue false
+   */
   preemptiveGeneration: boolean;
-  userAwayTimeout?: number | null;
+  /**
+   * If set, the user state is set to "away" after this amount of time during which both user
+   * and agent are silent. Set to `null` to disable.
+   * @defaultValue 15.0
+   */
+  userAwayTimeout: number | null;
+  /**
+   * Configuration for turn handling.
+   */
+  turnHandling: Partial<TurnHandlingConfig>;
+
+  /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.mode instead. */
+  allowInterruptions?: boolean;
+  /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */
+  discardAudioIfUninterruptible?: boolean;
+  /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.minDuration instead. */
+  minInterruptionDuration?: number;
+  /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.minWords instead.
*/ + minInterruptionWords?: number; + /** @deprecated Use {@link VoiceOptions.turnHandling}.endpointing.minDelay instead. */ + minEndpointingDelay?: number; + /** @deprecated Use {@link VoiceOptions.turnHandling}.endpointing.maxDelay instead. */ + maxEndpointingDelay?: number; } -// const defaultVoiceOptions: VoiceOptions = { -// allowInterruptions: true, -// discardAudioIfUninterruptible: true, -// minInterruptionDuration: 500, -// minInterruptionWords: 0, -// minEndpointingDelay: 500, -// maxEndpointingDelay: 6000, -// maxToolSteps: 3, -// preemptiveGeneration: false, -// userAwayTimeout: 15.0, -// } as const; +export const defaultSessionOptions = { + maxToolSteps: 3, + preemptiveGeneration: false, + userAwayTimeout: 15.0, + turnHandling: {}, +} as const satisfies SessionOptions; + +/** @deprecated {@link VoiceOptions} has been renamed to {@link SessionOptions} */ +export type VoiceOptions = SessionOptions; export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; @@ -108,16 +130,18 @@ export type AgentSessionCallbacks = { }; export type AgentSessionOptions = { - turnDetection?: TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; userData?: UserData; - voiceOptions?: Partial; + options?: Partial; connOptions?: SessionConnectOptions; - turnHandling?: Partial; - maxToolSteps?: number; + + /** @deprecated use {@link AgentSessionOptions.options}.turnHandling.turnDetection instead */ + turnDetection?: TurnDetectionMode; + /** @deprecated use {@link AgentSessionOptions.options} instead */ + voiceOptions?: Partial; }; export class AgentSession< @@ -188,7 +212,7 @@ export class AgentSession< const opts = migrateLegacyOptions(options); - const { vad, stt, llm, tts, userData, connOptions, turnHandling } = opts; + const { vad, stt, llm, tts, userData, connOptions, options: sessionOptions } = opts; // Merge user-provided connOptions with defaults this._connOptions = { sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions }, @@ -219,8 +243,8 @@ export class AgentSession< this.tts = tts; } - this.turnDetection = turnHandling?.turnDetection; - this._interruptionDetection = turnHandling?.interruption?.mode; + this.turnDetection = sessionOptions?.turnHandling?.turnDetection; + this._interruptionDetection = sessionOptions?.turnHandling?.interruption?.mode; this._userData = userData; // configurable IO @@ -230,8 +254,7 @@ export class AgentSession< // This is the "global" chat context, it holds the entire conversation history this._chatCtx = ChatContext.empty(); - // @ts-ignore FIXME the return type of the migration util has all defaults filled - this.options = opts.voiceOptions; + this.options = opts.options; this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this); this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed); diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts index 4e92d3789..8505701de 100644 --- a/agents/src/voice/turn_config/turn_handling.ts +++ b/agents/src/voice/turn_config/turn_handling.ts @@ -31,27 +31,10 @@ export interface TurnHandlingConfig { * Configuration for interruption handling. */ interruption: InterruptionConfig; - /** - * If set, set the user state as "away" after this amount of time after user and agent are - * silent. Set to `undefined` to disable. 
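
To make the new shape concrete, here is a sketch of what the migration utility introduced in this patch produces for a legacy configuration; the values are illustrative, and the expectations in the comments mirror the rewritten tests further down:

```ts
import { migrateLegacyOptions } from './turn_config/utils.js';

// Legacy flat fields passed via the deprecated `voiceOptions`...
const migrated = migrateLegacyOptions({
  voiceOptions: { minInterruptionDuration: 1.0, maxEndpointingDelay: 5.0 },
});

// ...come back nested under `options.turnHandling`, with defaults filled in:
migrated.options.turnHandling.interruption?.minDuration; // 1.0
migrated.options.turnHandling.endpointing?.maxDelay; // 5.0
migrated.options.turnHandling.endpointing?.minDelay; // 0.5 (default)
migrated.options.maxToolSteps; // 3 (default)
```
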
- * @defaultValue 15.0 - */ - userAwayTimeout: number; - /** - * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected. - * When `true`, the agent sends inference calls as soon as a user transcript is received rather - * than waiting for a definitive turn boundary. This can reduce response latency by overlapping - * model inference with user audio, but may incur extra compute if the user interrupts or - * revises mid-utterance. - * @defaultValue false - */ - preemptiveGeneration: boolean; } export const defaultTurnHandlingConfig: TurnHandlingConfig = { turnDetection: undefined, interruption: defaultInterruptionConfig, endpointing: defaultEndpointingConfig, - userAwayTimeout: 15, - preemptiveGeneration: false, }; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts index 4dc0e1f5d..1b0d1381c 100644 --- a/agents/src/voice/turn_config/utils.test.ts +++ b/agents/src/voice/turn_config/utils.test.ts @@ -1,164 +1,100 @@ // SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it } from 'vitest'; -import type { AgentSessionOptions } from '../agent_session.js'; +import { beforeAll, describe, expect, it } from 'vitest'; +import { initializeLogger } from '../../log.js'; import { defaultEndpointingConfig } from './endpointing.js'; import { defaultInterruptionConfig } from './interruption.js'; import { defaultTurnHandlingConfig } from './turn_handling.js'; import { migrateLegacyOptions } from './utils.js'; -describe('migrateLegacyOptions', () => { - it('should return default turn handling config when no legacy options provided', () => { - const input: AgentSessionOptions = {}; - const result = migrateLegacyOptions(input); - - expect(result.turnHandling).toBeDefined(); - expect(result.turnHandling!.turnDetection).toBe(defaultTurnHandlingConfig.turnDetection); - expect(result.turnHandling!.userAwayTimeout).toBe(defaultTurnHandlingConfig.userAwayTimeout); - expect(result.turnHandling!.preemptiveGeneration).toBe( - defaultTurnHandlingConfig.preemptiveGeneration, - ); - expect(result.turnHandling!.interruption).toMatchObject(defaultInterruptionConfig); - expect(result.turnHandling!.endpointing).toMatchObject(defaultEndpointingConfig); - }); - - it('should migrate legacy turnDetection to turnHandling.turnDetection', () => { - const input: AgentSessionOptions = { - turnDetection: 'vad', - }; - const result = migrateLegacyOptions(input); - - expect(result.turnHandling!.turnDetection).toBe('vad'); - expect(result.turnDetection).toBe('vad'); - }); - - it('should set interruption.mode to false when allowInterruptions is false', () => { - const input: AgentSessionOptions = { - voiceOptions: { - allowInterruptions: false, - }, - }; - const result = migrateLegacyOptions(input); - - expect(result.turnHandling!.interruption!.mode).toBe(false); - expect(result.voiceOptions?.allowInterruptions).toBe(false); - }); - - it('should not set interruption.mode when allowInterruptions is true', () => { - const input: AgentSessionOptions = { - voiceOptions: { - allowInterruptions: true, - }, - }; - const result = migrateLegacyOptions(input); +beforeAll(() => { + initializeLogger({ pretty: true, level: 'info' }); +}); - // mode should remain undefined (the default) when allowInterruptions is true - expect(result.turnHandling!.interruption!.mode).toBe(defaultInterruptionConfig.mode); +describe('migrateLegacyOptions', () => { + it('should return all defaults when no options are provided', () => 
{ + const result = migrateLegacyOptions({}); + + expect(result.options.turnHandling).toEqual({ + turnDetection: defaultTurnHandlingConfig.turnDetection, + endpointing: defaultEndpointingConfig, + interruption: defaultInterruptionConfig, + }); + expect(result.options.maxToolSteps).toBe(3); + expect(result.options.preemptiveGeneration).toBe(false); + expect(result.options.userAwayTimeout).toBe(15.0); }); - it('should migrate voiceOptions interruption settings', () => { - const input: AgentSessionOptions = { + it('should migrate legacy flat fields into nested turnHandling config', () => { + const result = migrateLegacyOptions({ voiceOptions: { - minInterruptionDuration: 0.8, + minInterruptionDuration: 1.0, minInterruptionWords: 3, discardAudioIfUninterruptible: false, - }, - }; - const result = migrateLegacyOptions(input); - - expect(result.turnHandling!.interruption!.minDuration).toBe(0.8); - expect(result.turnHandling!.interruption!.minWords).toBe(3); - expect(result.turnHandling!.interruption!.discardAudioIfUninterruptible).toBe(false); - }); - - it('should migrate voiceOptions endpointing settings', () => { - const input: AgentSessionOptions = { - voiceOptions: { - minEndpointingDelay: 1.0, + minEndpointingDelay: 0.8, maxEndpointingDelay: 5.0, }, - }; - const result = migrateLegacyOptions(input); + }); - expect(result.turnHandling!.endpointing!.minDelay).toBe(1.0); - expect(result.turnHandling!.endpointing!.maxDelay).toBe(5.0); + expect(result.options.turnHandling.interruption!.minDuration).toBe(1.0); + expect(result.options.turnHandling.interruption!.minWords).toBe(3); + expect(result.options.turnHandling.interruption!.discardAudioIfUninterruptible).toBe(false); + expect(result.options.turnHandling.endpointing!.minDelay).toBe(0.8); + expect(result.options.turnHandling.endpointing!.maxDelay).toBe(5.0); }); - it('should migrate voiceOptions.preemptiveGeneration', () => { - const input: AgentSessionOptions = { - voiceOptions: { - preemptiveGeneration: true, + it('should set interruption.mode to false when allowInterruptions is false', () => { + const result = migrateLegacyOptions({ + options: { + allowInterruptions: false, }, - }; - const result = migrateLegacyOptions(input); + }); - expect(result.turnHandling!.preemptiveGeneration).toBe(true); + expect(result.options.turnHandling.interruption!.mode).toBe(false); }); - it('should migrate voiceOptions.userAwayTimeout', () => { - const input: AgentSessionOptions = { + it('should give options precedence over voiceOptions when both are provided', () => { + const result = migrateLegacyOptions({ voiceOptions: { - userAwayTimeout: 30.0, + minInterruptionDuration: 1.0, + maxEndpointingDelay: 5.0, + maxToolSteps: 10, + }, + options: { + minInterruptionDuration: 2.0, + maxEndpointingDelay: 8.0, + maxToolSteps: 5, }, - }; - const result = migrateLegacyOptions(input); + }); - expect(result.turnHandling!.userAwayTimeout).toBe(30.0); + expect(result.options.turnHandling.interruption!.minDuration).toBe(2.0); + expect(result.options.turnHandling.endpointing!.maxDelay).toBe(8.0); + expect(result.options.maxToolSteps).toBe(5); }); - it('should migrate all legacy options together', () => { - const input: AgentSessionOptions = { - turnDetection: 'stt', - voiceOptions: { - allowInterruptions: false, - discardAudioIfUninterruptible: false, + it('should let explicit turnHandling override legacy flat fields', () => { + const result = migrateLegacyOptions({ + options: { minInterruptionDuration: 1.0, - minInterruptionWords: 2, minEndpointingDelay: 0.8, - 
maxEndpointingDelay: 4.0, - preemptiveGeneration: true, - userAwayTimeout: 20.0, + turnHandling: { + interruption: { minDuration: 3.0 }, + endpointing: { minDelay: 2.0 }, + }, }, - }; - const result = migrateLegacyOptions(input); + }); - expect(result.turnHandling!.turnDetection).toBe('stt'); - expect(result.turnHandling!.interruption!.mode).toBe(false); - expect(result.turnHandling!.interruption!.discardAudioIfUninterruptible).toBe(false); - expect(result.turnHandling!.interruption!.minDuration).toBe(1.0); - expect(result.turnHandling!.interruption!.minWords).toBe(2); - expect(result.turnHandling!.endpointing!.minDelay).toBe(0.8); - expect(result.turnHandling!.endpointing!.maxDelay).toBe(4.0); - expect(result.turnHandling!.preemptiveGeneration).toBe(true); - expect(result.turnHandling!.userAwayTimeout).toBe(20.0); - - // Legacy options should still be available - expect('turnDetection' in result).toBeDefined(); - expect('voiceOptions' in result).toBeDefined(); + expect(result.options.turnHandling.interruption!.minDuration).toBe(3.0); + expect(result.options.turnHandling.endpointing!.minDelay).toBe(2.0); }); - it('should preserve non-legacy options in the result', () => { - const input: AgentSessionOptions = { + it('should preserve top-level turnDetection in the result', () => { + const result = migrateLegacyOptions({ turnDetection: 'vad', - voiceOptions: { - minEndpointingDelay: 1.0, - }, - maxToolSteps: 5, - connOptions: { - maxUnrecoverableErrors: 10, - }, - }; - const result = migrateLegacyOptions(input); - - // Non-legacy options should be preserved - expect(result.maxToolSteps).toBe(5); - expect(result.connOptions).toEqual({ maxUnrecoverableErrors: 10 }); + }); - // Legacy options should still be available and mirror the new options expect(result.turnDetection).toBe('vad'); - expect(result.voiceOptions?.minEndpointingDelay).toBe(1.0); - expect(result.turnHandling!.turnDetection).toBe('vad'); - expect(result.turnHandling!.endpointing!.minDelay).toBe(1.0); + expect(result.options.turnHandling.turnDetection).toBe('vad'); }); }); diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 8b572ae3e..816038255 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -1,61 +1,74 @@ // SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 -import type { AgentSessionOptions } from '../agent_session.js'; +import { log } from '../../log.js'; +import { + type AgentSessionOptions, + type SessionOptions, + defaultSessionOptions, +} from '../agent_session.js'; import { defaultEndpointingConfig } from './endpointing.js'; import { defaultInterruptionConfig } from './interruption.js'; import { type TurnHandlingConfig, defaultTurnHandlingConfig } from './turn_handling.js'; export function migrateLegacyOptions( legacyOptions: AgentSessionOptions, -): AgentSessionOptions { - const { voiceOptions, turnDetection, ...rest } = legacyOptions; +): AgentSessionOptions & { options: SessionOptions } { + const logger = log(); + const { voiceOptions, turnDetection, options: sessionOptions, ...rest } = legacyOptions; + + if (voiceOptions !== undefined && sessionOptions !== undefined) { + logger.warn( + 'Both voiceOptions and options have been supplied as part of the AgentSessionOptions, voiceOptions will be merged with options taking precedence', + ); + } + + const mergedOptions = structuredClone({ ...voiceOptions, ...sessionOptions }); + const turnHandling: TurnHandlingConfig = { - turnDetection: turnDetection ?? defaultTurnHandlingConfig.turnDetection, + turnDetection: turnDetection, interruption: { - ...defaultInterruptionConfig, - discardAudioIfUninterruptible: - voiceOptions?.discardAudioIfUninterruptible ?? - defaultInterruptionConfig.discardAudioIfUninterruptible, - minDuration: voiceOptions?.minInterruptionDuration ?? defaultInterruptionConfig.minDuration, - minWords: voiceOptions?.minInterruptionWords ?? defaultInterruptionConfig.minWords, + discardAudioIfUninterruptible: mergedOptions?.discardAudioIfUninterruptible, + minDuration: mergedOptions?.minInterruptionDuration, + minWords: mergedOptions?.minInterruptionWords, }, endpointing: { - ...defaultEndpointingConfig, - minDelay: voiceOptions?.minEndpointingDelay ?? defaultEndpointingConfig.minDelay, - maxDelay: voiceOptions?.maxEndpointingDelay ?? defaultEndpointingConfig.maxDelay, + minDelay: mergedOptions?.minEndpointingDelay, + maxDelay: mergedOptions?.maxEndpointingDelay, }, - userAwayTimeout: voiceOptions?.userAwayTimeout ?? defaultTurnHandlingConfig.userAwayTimeout, - preemptiveGeneration: - voiceOptions?.preemptiveGeneration ?? defaultTurnHandlingConfig.preemptiveGeneration, - ...rest.turnHandling, + ...mergedOptions.turnHandling, + } as const; + + if (mergedOptions?.allowInterruptions === false) { + turnHandling.interruption.mode = false; + } + + const optionsWithDefaults = { + ...defaultSessionOptions, + ...mergedOptions, + turnHandling: mergeWithDefaults(turnHandling), }; - const newAgentSessionOptions: AgentSessionOptions = { + const newAgentSessionOptions: AgentSessionOptions & { options: SessionOptions } = { ...rest, + options: optionsWithDefaults, + voiceOptions: optionsWithDefaults, turnDetection: turnHandling.turnDetection, - turnHandling, }; - if (voiceOptions?.allowInterruptions === false) { - turnHandling.interruption.mode = false; - } + return newAgentSessionOptions; +} - newAgentSessionOptions.turnHandling = turnHandling; - if (voiceOptions?.maxToolSteps) { - newAgentSessionOptions.maxToolSteps = voiceOptions.maxToolSteps; - } +/** Remove keys whose value is `undefined` so they don't shadow defaults when spread. 
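
As a quick illustration of why stripping `undefined`-valued keys matters before spreading over defaults (the pitfall the helper below guards against), here is a self-contained sketch that is not part of the patch itself:

```ts
const defaults = { minDelay: 0.5, maxDelay: 3.0 };
const userConfig = { minDelay: undefined, maxDelay: 5.0 };

// A naive spread lets the explicit `undefined` clobber the default:
const naive = { ...defaults, ...userConfig };
// naive.minDelay === undefined

// Dropping undefined-valued keys first preserves the default:
const cleaned = Object.fromEntries(
  Object.entries(userConfig).filter(([, v]) => v !== undefined),
);
const merged = { ...defaults, ...cleaned };
// merged.minDelay === 0.5, merged.maxDelay === 5.0
```
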
*/ +export function stripUndefined(obj: T): Partial { + return Object.fromEntries(Object.entries(obj).filter(([, v]) => v !== undefined)) as Partial; +} - newAgentSessionOptions.voiceOptions = { - maxToolSteps: newAgentSessionOptions.maxToolSteps, - maxEndpointingDelay: turnHandling.endpointing.maxDelay, - minEndpointingDelay: turnHandling.endpointing.minDelay, - minInterruptionDuration: turnHandling.interruption.minDuration, - minInterruptionWords: turnHandling.interruption.minWords, - allowInterruptions: turnHandling.interruption.mode !== false, - discardAudioIfUninterruptible: turnHandling.interruption.discardAudioIfUninterruptible, - userAwayTimeout: turnHandling.userAwayTimeout, - }; - return newAgentSessionOptions; +export function mergeWithDefaults(config: TurnHandlingConfig) { + return { + turnDetection: config.turnDetection ?? defaultTurnHandlingConfig.turnDetection, + endpointing: { ...defaultEndpointingConfig, ...stripUndefined(config.endpointing) }, + interruption: { ...defaultInterruptionConfig, ...stripUndefined(config.interruption) }, + } as const; } From 96d6b578e0a4540507d9c0f956995669885cefb0 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 15:29:26 +0100 Subject: [PATCH 14/26] fix internal types --- agents/src/voice/agent_activity.ts | 20 +++++++++++-------- agents/src/voice/agent_session.ts | 11 ++++++++-- agents/src/voice/turn_config/endpointing.ts | 4 ++-- agents/src/voice/turn_config/interruption.ts | 12 +++++------ agents/src/voice/turn_config/turn_handling.ts | 9 +++++++-- agents/src/voice/turn_config/utils.ts | 8 +++++--- 6 files changed, 41 insertions(+), 23 deletions(-) diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 906c624d9..9e29e233e 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -312,8 +312,8 @@ export class AgentActivity implements RecognitionHooks { turnDetector: typeof this.turnDetection === 'string' ? 
undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, interruptionDetection: this.interruptionDetector, - minEndpointingDelay: this.agentSession.options.minEndpointingDelay, - maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, + minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay, + maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay, rootSpanContext: this.agentSession.rootSpanContext, }); this.audioRecognition.start(); @@ -373,7 +373,7 @@ export class AgentActivity implements RecognitionHooks { get allowInterruptions(): boolean { // TODO(AJS-51): Allow options to be defined in Agent class - return this.agentSession.options.allowInterruptions; + return this.agentSession.options.turnHandling.interruption?.mode !== false; } get turnDetection(): TurnDetectionMode | undefined { @@ -702,7 +702,7 @@ export class AgentActivity implements RecognitionHooks { return; } - if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) { + if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) { this.interruptByAudioActivity(); } } @@ -721,7 +721,11 @@ export class AgentActivity implements RecognitionHooks { // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0 // - Apply check to all STT results: empty string, undefined, or any length // - This ensures consistent behavior across all interruption scenarios - if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) { + if ( + this.stt && + this.agentSession.options.turnHandling.interruption?.minWords > 0 && + this.audioRecognition + ) { const text = this.audioRecognition.currentTranscript; // TODO(shubhra): better word splitting for multi-language @@ -731,7 +735,7 @@ export class AgentActivity implements RecognitionHooks { // Only allow interruption if word count meets or exceeds minInterruptionWords // This applies to all cases: empty strings, partial speech, and full speech - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) { return; } } @@ -906,10 +910,10 @@ export class AgentActivity implements RecognitionHooks { this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && - this.agentSession.options.minInterruptionWords > 0 + this.agentSession.options.turnHandling.interruption?.minWords > 0 ) { const wordCount = splitWords(info.newTranscript, true).length; - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) { // avoid interruption if the new_transcript contains fewer words than minInterruptionWords this.cancelPreemptiveGeneration(); this.logger.info( diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index e581a8ce0..8054f4001 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -64,7 +64,10 @@ import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; import { RunResult } from './testing/run_result.js'; import type { InterruptionConfig } from './turn_config/interruption.js'; -import type { TurnHandlingConfig } from './turn_config/turn_handling.js'; +import type { + InternalTurnHandlingConfig, + TurnHandlingConfig, +} from './turn_config/turn_handling.js'; import { 
migrateLegacyOptions } from './turn_config/utils.js'; export interface SessionOptions { @@ -103,6 +106,10 @@ export interface SessionOptions { maxEndpointingDelay?: number; } +export interface InternalSessionOptions extends SessionOptions { + turnHandling: InternalTurnHandlingConfig; +} + export const defaultSessionOptions = { maxToolSteps: 3, preemptiveGeneration: false, @@ -153,7 +160,7 @@ export class AgentSession< tts?: TTS; turnDetection?: TurnDetectionMode; - readonly options: VoiceOptions; + readonly options: InternalSessionOptions; private agent?: Agent; private activity?: AgentActivity; diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index 1c0747ce3..d31778b07 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -12,12 +12,12 @@ export interface EndpointingConfig { * the STT provider's endpointing delay. * @defaultValue 0.5 */ - minDelay?: number; + minDelay: number; /** * Maximum time in seconds the agent will wait before terminating the turn. * @defaultValue 3.0 */ - maxDelay?: number; + maxDelay: number; } export const defaultEndpointingConfig = { diff --git a/agents/src/voice/turn_config/interruption.ts b/agents/src/voice/turn_config/interruption.ts index 50616a649..8005b520a 100644 --- a/agents/src/voice/turn_config/interruption.ts +++ b/agents/src/voice/turn_config/interruption.ts @@ -9,34 +9,34 @@ export interface InterruptionConfig { * Interruption handling strategy. * @defaultValue undefined */ - mode?: 'adaptive' | 'vad' | false; + mode: 'adaptive' | 'vad' | false | undefined; /** * When `true`, buffered audio is dropped while the agent is speaking and cannot be interrupted. * @defaultValue true */ - discardAudioIfUninterruptible?: boolean; + discardAudioIfUninterruptible: boolean; /** * Minimum speech length in seconds to register as an interruption. * @defaultValue 0.5 */ - minDuration?: number; + minDuration: number; /** * Minimum number of words to consider an interruption, only used if STT is enabled. * @defaultValue 0 */ - minWords?: number; + minWords: number; /** * If set, emit an `agentFalseInterruption` event after this amount of time if the user is * silent and no user transcript is detected after the interruption. Set to `undefined` to * disable. * @defaultValue 2.0 */ - falseInterruptionTimeout?: number; + falseInterruptionTimeout: number; /** * Whether to resume the false interruption after the `falseInterruptionTimeout`. * @defaultValue true */ - resumeFalseInterruption?: boolean; + resumeFalseInterruption: boolean; } export const defaultInterruptionConfig = { diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts index 8505701de..827897421 100644 --- a/agents/src/voice/turn_config/turn_handling.ts +++ b/agents/src/voice/turn_config/turn_handling.ts @@ -26,14 +26,19 @@ export interface TurnHandlingConfig { /** * Configuration for endpointing. */ - endpointing: EndpointingConfig; + endpointing: Partial; /** * Configuration for interruption handling. 
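
The switch from optional (`?`) fields to required fields with an explicit `| undefined` member is subtle but deliberate: a required key must be present when the full internal config object is built, so `satisfies` can flag a forgotten field at compile time, while users still pass `Partial<...>` shapes. A small sketch of the distinction (the interface names here are illustrative, not from the patch):

```ts
interface OptionalMode {
  mode?: 'adaptive' | 'vad' | false;
}
interface RequiredMode {
  mode: 'adaptive' | 'vad' | false | undefined;
}

const a = {} satisfies OptionalMode; // OK: the key may be omitted entirely
// const b = {} satisfies RequiredMode; // compile error: property 'mode' is missing
const c = { mode: undefined } satisfies RequiredMode; // OK: key present, value undefined
```
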
*/ + interruption: Partial; +} + +export interface InternalTurnHandlingConfig extends TurnHandlingConfig { + endpointing: EndpointingConfig; interruption: InterruptionConfig; } -export const defaultTurnHandlingConfig: TurnHandlingConfig = { +export const defaultTurnHandlingConfig: InternalTurnHandlingConfig = { turnDetection: undefined, interruption: defaultInterruptionConfig, endpointing: defaultEndpointingConfig, diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 816038255..6696366a3 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -4,7 +4,7 @@ import { log } from '../../log.js'; import { type AgentSessionOptions, - type SessionOptions, + type InternalSessionOptions, defaultSessionOptions, } from '../agent_session.js'; import { defaultEndpointingConfig } from './endpointing.js'; @@ -13,7 +13,7 @@ import { type TurnHandlingConfig, defaultTurnHandlingConfig } from './turn_handl export function migrateLegacyOptions( legacyOptions: AgentSessionOptions, -): AgentSessionOptions & { options: SessionOptions } { +): AgentSessionOptions & { options: InternalSessionOptions } { const logger = log(); const { voiceOptions, turnDetection, options: sessionOptions, ...rest } = legacyOptions; @@ -50,7 +50,9 @@ export function migrateLegacyOptions( turnHandling: mergeWithDefaults(turnHandling), }; - const newAgentSessionOptions: AgentSessionOptions & { options: SessionOptions } = { + const newAgentSessionOptions: AgentSessionOptions & { + options: InternalSessionOptions; + } = { ...rest, options: optionsWithDefaults, voiceOptions: optionsWithDefaults, From 2ee27486407becb08cdba2ce26138421671eaad4 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 16:00:36 +0100 Subject: [PATCH 15/26] rabbit comments --- agents/src/voice/agent_session.ts | 12 ++++++------ agents/src/voice/audio_recognition.ts | 2 +- examples/src/basic_agent.ts | 19 +++++++++---------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 8054f4001..5f97dc30b 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -92,17 +92,17 @@ export interface SessionOptions { */ turnHandling: Partial; - /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.mode instead. */ + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.mode instead. */ allowInterruptions?: boolean; - /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */ + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */ discardAudioIfUninterruptible?: boolean; - /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.minDuration instead. */ + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minDuration instead. */ minInterruptionDuration?: number; - /** @deprecated Use {@link VoiceOptions.turnHandling}.interruption.minWords instead. */ + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minWords instead. */ minInterruptionWords?: number; - /** @deprecated Use {@link VoiceOptions.turnHandling}.endpointing.minDelay instead. */ + /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.minDelay instead. */ minEndpointingDelay?: number; - /** @deprecated Use {@link VoiceOptions.turnHandling}.endpointing.maxDelay instead. 
*/ + /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.maxDelay instead. */ maxEndpointingDelay?: number; } diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index b6aa67887..788fa87c9 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -283,7 +283,7 @@ export class AudioRecognition { } const eventsToEmit = - emitFromIndex && shouldFlush !== undefined ? this.transcriptBuffer.slice(emitFromIndex) : []; + emitFromIndex !== null && shouldFlush ? this.transcriptBuffer.slice(emitFromIndex) : []; this.transcriptBuffer = []; this.ignoreUserTranscriptUntil = undefined; diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 8c7c36826..99e0c7591 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -58,20 +58,19 @@ export default defineAgent({ // See more at https://docs.livekit.io/agents/build/turns vad: ctx.proc.userData.vad! as silero.VAD, - turnHandling: { - turnDetection: new livekit.turnDetector.MultilingualModel(), - interruption: { - resumeFalseInterruption: true, - falseInterruptionTimeout: 1, - mode: 'adaptive', - }, - preemptiveGeneration: true, - }, // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), - voiceOptions: { + options: { // allow the LLM to generate a response while waiting for the end of turn preemptiveGeneration: true, + turnHandling: { + turnDetection: new livekit.turnDetector.MultilingualModel(), + interruption: { + resumeFalseInterruption: true, + falseInterruptionTimeout: 1, + mode: 'adaptive', + }, + }, }, connOptions: { // Example of overriding the default connection options for the LLM/TTS/STT From 62cd4483864f45ced1a1ce7847b8ff5099a89b7b Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 16:25:18 +0100 Subject: [PATCH 16/26] remove unused stuff --- agents/src/inference/interruption/utils.ts | 23 ---------------------- agents/src/voice/audio_recognition.ts | 2 -- 2 files changed, 25 deletions(-) diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts index 71037bc03..bd89dd512 100644 --- a/agents/src/inference/interruption/utils.ts +++ b/agents/src/inference/interruption/utils.ts @@ -36,29 +36,6 @@ export class BoundedCache { return this.cache.delete(key); } - /** - * Get existing entry and update it, or create a new one using factory. - * Updates the entry with the provided partial fields. - */ - setOrUpdate( - key: K, - factory: () => T, - updates: Partial<{ [P in keyof T]: T[P] }>, - ): T { - let entry = this.cache.get(key) as T | undefined; - if (entry === undefined) { - entry = factory(); - this.set(key, entry); - } - // Apply updates to the entry - for (const [field, value] of Object.entries(updates)) { - if (value !== undefined) { - (entry as Record)[field] = value; - } - } - return entry; - } - /** * Pop the last entry that matches the predicate, or return undefined. * Only removes and returns the matching entry, preserving others. 
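
The audio_recognition guard change in the patch above is a classic truthiness fix: the old condition `emitFromIndex && shouldFlush !== undefined` dropped events when `emitFromIndex` was `0` (a valid index, but falsy) and flushed even when `shouldFlush` was explicitly `false`. A minimal sketch of both failure modes:

```ts
const buf = ['a', 'b', 'c'];
let emitFromIndex: number | null = 0; // 0 is a valid index, but falsy
let shouldFlush: boolean | undefined = true;

// Old guard: `0 && ...` short-circuits, so index 0 silently emitted nothing.
const old1 = emitFromIndex && shouldFlush !== undefined ? buf.slice(emitFromIndex) : []; // []
// New guard: only `null` means "nothing to emit".
const new1 = emitFromIndex !== null && shouldFlush ? buf.slice(emitFromIndex) : []; // ['a', 'b', 'c']

// Old guard: `false !== undefined` is true, so it flushed even when told not to.
emitFromIndex = 1;
shouldFlush = false;
const old2 = emitFromIndex && shouldFlush !== undefined ? buf.slice(emitFromIndex) : []; // ['b', 'c']
const new2 = emitFromIndex !== null && shouldFlush ? buf.slice(emitFromIndex) : []; // []
```
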
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 788fa87c9..743802374 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -894,12 +894,10 @@ export class AudioRecognition { const abortPromise = waitForAbort(signal); while (!signal.aborted) { - this.logger.warn('waiting for interruption event'); const res = await Promise.race([eventReader.read(), abortPromise]); if (!res) break; const { done, value: ev } = res; if (done) break; - this.logger.warn('got interruption event'); this.onInterruptionEvent(ev); } } catch (e) { From ec6d9bd2cf3fc3e7db7a1b21257f52143a50aa02 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 16:50:30 +0100 Subject: [PATCH 17/26] more rabbit fixes --- agents/src/voice/agent_activity.ts | 2 +- agents/src/voice/audio_recognition.ts | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 9e29e233e..885aec344 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -919,7 +919,7 @@ export class AgentActivity implements RecognitionHooks { this.logger.info( { wordCount, - minInterruptionWords: this.agentSession.options.minInterruptionWords, + minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords, }, 'skipping user input, word count below minimum interruption threshold', ); diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 743802374..bb802acf0 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -983,6 +983,7 @@ export class AudioRecognition { await this.sttTask?.cancelAndWait(); await this.vadTask?.cancelAndWait(); await this.bounceEOUTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); await this.interruptionStreamChannel.close(); } From 016e3a446b37921e080d166a8c28210451f016f3 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 16:53:36 +0100 Subject: [PATCH 18/26] better cleanup --- .../interruption/interruption_stream.ts | 1 + agents/src/stream/stream_channel.ts | 20 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agents/src/inference/interruption/interruption_stream.ts b/agents/src/inference/interruption/interruption_stream.ts index 6409f77fb..ef0a22e89 100644 --- a/agents/src/inference/interruption/interruption_stream.ts +++ b/agents/src/inference/interruption/interruption_stream.ts @@ -371,6 +371,7 @@ export class InterruptionStreamBase { async close(): Promise { if (!this.inputStream.closed) await this.inputStream.close(); + this.model.removeStream(this); } } diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 67364e201..edaeaa856 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -21,23 +21,35 @@ export function createStreamChannel(): StreamChannel return { write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, - abort: (error: E) => { + abort: async (error: E) => { + if (isClosed) return; isClosed = true; - return writer.abort(error); + try { + await writer.abort(error); + } catch (e) { + if (e instanceof Error && e.name === 'TypeError') return; + throw e; + } }, addStreamInput: (newInputStream) => { + if (isClosed) return; const reader = newInputStream.getReader(); (async () => { try { - while (true) { + while (!isClosed) { const { done, value } = await 
reader.read(); if (done) break; await writer.write(value); } + } catch (err) { + if (!isClosed) { + isClosed = true; + await writer.abort(err as E); + } } finally { reader.releaseLock(); } - })(); + })().catch(() => {}); }, close: async () => { try { From 9a4939c7daeb620d149e3d61b2ec258785cd3fd8 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 30 Jan 2026 17:04:34 +0100 Subject: [PATCH 19/26] ensure inputStartedAt is set --- agents/src/voice/audio_recognition.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index bb802acf0..feb92c6e3 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -876,6 +876,7 @@ export class AudioRecognition { if (!res) break; const { value, done } = res; if (done) break; + this.inputStartedAt = Date.now(); await stream.pushFrame(value); } } finally { From e28b1b118b1f15b073bfb2472693f54ad1af704e Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Tue, 3 Feb 2026 01:24:20 +0800 Subject: [PATCH 20/26] Fix Inference URL parity (#1011) --- agents/src/inference/interruption/defaults.ts | 30 +++++++++++++++++++ .../interruption/interruption_detector.ts | 9 ++++-- .../inference/interruption/ws_transport.ts | 11 +++---- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index a7f36e5b9..53f9e56bf 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -13,8 +13,38 @@ export const REMOTE_INFERENCE_TIMEOUT_IN_S = 1.0; export const SAMPLE_RATE = 16000; export const FRAMES_PER_SECOND = 40; export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame + +/** Default production inference URL */ export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; +/** Staging inference URL */ +export const STAGING_BASE_URL = 'https://agent-gateway-staging.livekit.cloud/v1'; + +/** + * Get the default inference URL based on the environment. + * + * Priority: + * 1. LIVEKIT_INFERENCE_URL if set + * 2. If LIVEKIT_URL contains '.staging.livekit.cloud', use staging gateway + * 3. 
Otherwise, use production gateway + */ +export function getDefaultInferenceUrl(): string { + // Priority 1: LIVEKIT_INFERENCE_URL + const inferenceUrl = process.env.LIVEKIT_INFERENCE_URL; + if (inferenceUrl) { + return inferenceUrl; + } + + // Priority 2: Check LIVEKIT_URL for staging (exact match to Python) + const livekitUrl = process.env.LIVEKIT_URL || ''; + if (livekitUrl.includes('.staging.livekit.cloud')) { + return STAGING_BASE_URL; + } + + // Priority 3: Default to production + return DEFAULT_BASE_URL; +} + export const apiConnectDefaults: ApiConnectOptions = { maxRetries: 3, retryInterval: 2_000, diff --git a/agents/src/inference/interruption/interruption_detector.ts b/agents/src/inference/interruption/interruption_detector.ts index 2722ecf39..a5a457072 100644 --- a/agents/src/inference/interruption/interruption_detector.ts +++ b/agents/src/inference/interruption/interruption_detector.ts @@ -8,6 +8,8 @@ import { DEFAULT_BASE_URL, FRAMES_PER_SECOND, SAMPLE_RATE, + STAGING_BASE_URL, + getDefaultInferenceUrl, interruptionOptionDefaults, } from './defaults.js'; import type { InterruptionDetectionError } from './errors.js'; @@ -48,13 +50,14 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ throw new Error('maxAudioDurationInS must be less than or equal to 3.0 seconds'); } - const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? DEFAULT_BASE_URL; + const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? getDefaultInferenceUrl(); let lkApiKey = apiKey ?? ''; let lkApiSecret = apiSecret ?? ''; let useProxy: boolean; - // use LiveKit credentials if using the default base URL (inference) - if (lkBaseUrl === DEFAULT_BASE_URL) { + // Use LiveKit credentials if using the inference service (production or staging) + const isInferenceUrl = lkBaseUrl === DEFAULT_BASE_URL || lkBaseUrl === STAGING_BASE_URL; + if (isInferenceUrl) { lkApiKey = apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? 
''; if (!lkApiKey) { diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index d85a488dd..edf6d81a9 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -266,17 +266,18 @@ export function createWsTransport( } const state = getState(); - const createdAt = performance.now(); + // Use truncated timestamp consistently for both cache key and header + // This ensures the server's response created_at matches our cache key + const createdAt = Math.floor(performance.now()); - // Store the audio data in cache + // Store the audio data in cache with truncated timestamp state.cache.set(createdAt, new InterruptionCacheEntry({ createdAt, speechInput: audioSlice })); // Create header: 8-byte little-endian uint64 timestamp (milliseconds as integer) const header = new ArrayBuffer(8); const view = new DataView(header); - const createdAtInt = Math.floor(createdAt); - view.setUint32(0, createdAtInt >>> 0, true); - view.setUint32(4, Math.floor(createdAtInt / 0x100000000) >>> 0, true); + view.setUint32(0, createdAt >>> 0, true); + view.setUint32(4, Math.floor(createdAt / 0x100000000) >>> 0, true); // Combine header and audio data const audioBytes = new Uint8Array( From 4310baaeda419ca21bf5ca3ce17168d0571331f0 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Mon, 2 Feb 2026 10:06:07 -0800 Subject: [PATCH 21/26] Preserve turnDetection after cloning --- agents/src/voice/turn_config/utils.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts index 6696366a3..c8b8a0d27 100644 --- a/agents/src/voice/turn_config/utils.ts +++ b/agents/src/voice/turn_config/utils.ts @@ -23,10 +23,15 @@ export function migrateLegacyOptions( ); } + // Preserve turnDetection before cloning since structuredClone converts class instances to plain objects + const originalTurnDetection = + sessionOptions?.turnHandling?.turnDetection ?? + voiceOptions?.turnHandling?.turnDetection ?? + turnDetection; + const mergedOptions = structuredClone({ ...voiceOptions, ...sessionOptions }); const turnHandling: TurnHandlingConfig = { - turnDetection: turnDetection, interruption: { discardAudioIfUninterruptible: mergedOptions?.discardAudioIfUninterruptible, minDuration: mergedOptions?.minInterruptionDuration, @@ -38,6 +43,9 @@ export function migrateLegacyOptions( }, ...mergedOptions.turnHandling, + // Restore original turnDetection after spread to preserve class instance with methods + // (structuredClone converts class instances to plain objects, losing prototype methods) + turnDetection: originalTurnDetection, } as const; if (mergedOptions?.allowInterruptions === false) { From 175e57bd3e7a56951cab9918ce398155fd1c4ed9 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Mon, 2 Feb 2026 15:49:49 -0800 Subject: [PATCH 22/26] respect LIVEKIT_REMOTE_EOT_URL environment variable Remove baseUrl and useProxy from interruptionOptionDefaults so they are resolved dynamically in the constructor. Previously, the defaults pre-populated baseUrl with the cloud inference URL, which prevented the LIVEKIT_REMOTE_EOT_URL environment variable from being used. 
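
Taken together with patch 20, the effective resolution order for the detector's base URL is worth spelling out. The sketch below restates that behavior under stated assumptions: the example URLs are made up, and the first two steps happen in the detector's constructor rather than inside `getDefaultInferenceUrl` itself:

```ts
import { getDefaultInferenceUrl, STAGING_BASE_URL } from './defaults.js';

// Resolution order:
// 1. explicit `baseUrl` constructor option
// 2. LIVEKIT_REMOTE_EOT_URL
// 3. LIVEKIT_INFERENCE_URL
// 4. staging gateway when LIVEKIT_URL contains '.staging.livekit.cloud'
// 5. the production gateway (DEFAULT_BASE_URL)
process.env.LIVEKIT_INFERENCE_URL = 'https://my-gateway.example.com/v1';
console.log(getDefaultInferenceUrl()); // 'https://my-gateway.example.com/v1'

delete process.env.LIVEKIT_INFERENCE_URL;
process.env.LIVEKIT_URL = 'wss://my-project.staging.livekit.cloud';
console.log(getDefaultInferenceUrl() === STAGING_BASE_URL); // true
```
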
---
 agents/src/inference/interruption/defaults.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts
index 53f9e56bf..6e74d7691 100644
--- a/agents/src/inference/interruption/defaults.ts
+++ b/agents/src/inference/interruption/defaults.ts
@@ -66,7 +66,9 @@ export function intervalForRetry(
   return exponentialDelay + jitter;
 }
 
-export const interruptionOptionDefaults: InterruptionOptions = {
+// baseUrl and useProxy are resolved dynamically in the constructor
+// to respect LIVEKIT_REMOTE_EOT_URL environment variable
+export const interruptionOptionDefaults: Omit<InterruptionOptions, 'baseUrl' | 'useProxy'> = {
   sampleRate: SAMPLE_RATE,
   threshold: THRESHOLD,
   minFrames: Math.ceil(MIN_INTERRUPTION_DURATION_IN_S * FRAMES_PER_SECOND),
@@ -74,9 +76,7 @@ export const interruptionOptionDefaults: InterruptionOptions = {
   audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S,
   detectionIntervalInS: DETECTION_INTERVAL_IN_S,
   inferenceTimeout: 1_000,
-  baseUrl: DEFAULT_BASE_URL,
   apiKey: process.env.LIVEKIT_API_KEY || '',
   apiSecret: process.env.LIVEKIT_API_SECRET || '',
-  useProxy: false,
   minInterruptionDurationInS: MIN_INTERRUPTION_DURATION_IN_S,
 } as const;

From 0682f25bf1b967efdbade9f2c76b085ef4dd9f0d Mon Sep 17 00:00:00 2001
From: Brian Yin
Date: Mon, 2 Feb 2026 16:16:53 -0800
Subject: [PATCH 23/26] refine timeout computation

---
 agents/src/inference/interruption/defaults.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts
index 6e74d7691..66f2fe85f 100644
--- a/agents/src/inference/interruption/defaults.ts
+++ b/agents/src/inference/interruption/defaults.ts
@@ -75,7 +75,7 @@ export const interruptionOptionDefaults: Omit<InterruptionOptions, 'baseUrl' | 'useProxy'> = {

From: Brian Yin
Date: Mon, 2 Feb 2026 17:59:43 -0800
Subject: [PATCH 24/26] save temp

---
 agents/src/llm/llm.ts                  |  16 +
 agents/src/llm/realtime.ts             |   4 +
 agents/src/metrics/base.ts             |  28 ++
 agents/src/metrics/index.ts            |   9 +
 agents/src/metrics/model_usage.test.ts | 545 +++++++++++++++++++++++++
 agents/src/metrics/model_usage.ts      | 227 ++++++++++
 agents/src/metrics/usage_collector.ts  |  16 +
 agents/src/stt/stt.ts                  |  38 ++
 agents/src/telemetry/trace_types.ts    |  17 +
 agents/src/telemetry/traces.ts         |   4 +
 agents/src/tts/tts.ts                  |  70 +++-
 agents/src/voice/agent_activity.ts     |  10 +
 agents/src/voice/agent_session.ts      |  35 ++
 agents/src/voice/audio_recognition.ts  |  16 +
 agents/src/voice/generation.ts         |  59 ++-
 agents/src/voice/report.ts             |   7 +
 16 files changed, 1098 insertions(+), 3 deletions(-)
 create mode 100644 agents/src/metrics/model_usage.test.ts
 create mode 100644 agents/src/metrics/model_usage.ts

diff --git a/agents/src/llm/llm.ts b/agents/src/llm/llm.ts
index 0ab158e6b..40055bd5c 100644
--- a/agents/src/llm/llm.ts
+++ b/agents/src/llm/llm.ts
@@ -65,6 +65,18 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter<LLMCallbacks>) {
         }
         return (usage?.completionTokens || 0) / (durationMs / 1000);
       })(),
+      metadata: {
+        modelProvider: this.#llm.provider,
+        modelName: this.#llm.model,
+      },
     };
 
     if (this.#llmRequestSpan) {

diff --git a/agents/src/llm/realtime.ts b/agents/src/llm/realtime.ts
index b1758eaf7..d02d86dab 100644
--- a/agents/src/llm/realtime.ts
+++ b/agents/src/llm/realtime.ts
@@ -68,6 +68,10 @@ export abstract class RealtimeModel {
   /** The model name/identifier used by this realtime model */
   abstract get model(): string;
 
+  get provider(): string {
+    return 'unknown';
+  }
+
   abstract session(): RealtimeSession;
 
   abstract close(): 
Promise; diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts index 7f6d6a0cc..3c533b949 100644 --- a/agents/src/metrics/base.ts +++ b/agents/src/metrics/base.ts @@ -2,6 +2,13 @@ // // SPDX-License-Identifier: Apache-2.0 +export type MetricsMetadata = { + /** The provider name (e.g., 'openai', 'anthropic'). */ + modelProvider?: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + modelName?: string; +}; + export type AgentMetrics = | STTMetrics | LLMMetrics @@ -26,6 +33,8 @@ export type LLMMetrics = { totalTokens: number; tokensPerSecond: number; speechId?: string; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type STTMetrics = { @@ -41,10 +50,16 @@ export type STTMetrics = { * The duration of the pushed audio in milliseconds. */ audioDurationMs: number; + /** Input audio tokens (for token-based billing). */ + inputTokens?: number; + /** Output text tokens (for token-based billing). */ + outputTokens?: number; /** * Whether the STT is streaming (e.g using websocket). */ streamed: boolean; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type TTSMetrics = { @@ -59,10 +74,17 @@ export type TTSMetrics = { /** Generated audio duration in milliseconds. */ audioDurationMs: number; cancelled: boolean; + /** Number of characters synthesized (for character-based billing). */ charactersCount: number; + /** Input text tokens (for token-based billing, e.g., OpenAI TTS). */ + inputTokens?: number; + /** Output audio tokens (for token-based billing, e.g., OpenAI TTS). */ + outputTokens?: number; streamed: boolean; segmentId?: string; speechId?: string; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type VADMetrics = { @@ -133,6 +155,10 @@ export type RealtimeModelMetrics = { * The duration of the response from created to done in milliseconds. */ durationMs: number; + /** + * The duration of the session connection in milliseconds (for session-based billing like xAI). + */ + sessionDurationMs?: number; /** * Time to first audio token in milliseconds. -1 if no audio token was sent. */ @@ -165,4 +191,6 @@ export type RealtimeModelMetrics = { * Details about the output tokens used in the Response. */ outputTokenDetails: RealtimeModelMetricsOutputTokenDetails; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; diff --git a/agents/src/metrics/index.ts b/agents/src/metrics/index.ts index f400a9638..c83a9fbff 100644 --- a/agents/src/metrics/index.ts +++ b/agents/src/metrics/index.ts @@ -6,10 +6,19 @@ export type { AgentMetrics, EOUMetrics, LLMMetrics, + MetricsMetadata, RealtimeModelMetrics, STTMetrics, TTSMetrics, VADMetrics, } from './base.js'; +export { + filterZeroValues, + ModelUsageCollector, + type LLMModelUsage, + type ModelUsage, + type STTModelUsage, + type TTSModelUsage, +} from './model_usage.js'; export { UsageCollector, type UsageSummary } from './usage_collector.js'; export { logMetrics } from './utils.js'; diff --git a/agents/src/metrics/model_usage.test.ts b/agents/src/metrics/model_usage.test.ts new file mode 100644 index 000000000..d2f983beb --- /dev/null +++ b/agents/src/metrics/model_usage.test.ts @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { beforeEach, describe, expect, it } from 'vitest'; +import type { LLMMetrics, RealtimeModelMetrics, STTMetrics, TTSMetrics } from './base.js'; +import { + type LLMModelUsage, + ModelUsageCollector, + type STTModelUsage, + type TTSModelUsage, + filterZeroValues, +} from './model_usage.js'; + +describe('model_usage', () => { + describe('filterZeroValues', () => { + it('should filter out zero values from LLMModelUsage', () => { + const usage: LLMModelUsage = { + type: 'llm_usage', + provider: 'openai', + model: 'gpt-4o', + inputTokens: 100, + inputCachedTokens: 0, + inputAudioTokens: 0, + inputCachedAudioTokens: 0, + inputTextTokens: 0, + inputCachedTextTokens: 0, + inputImageTokens: 0, + inputCachedImageTokens: 0, + outputTokens: 50, + outputAudioTokens: 0, + outputTextTokens: 0, + sessionDurationMs: 0, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('llm_usage'); + expect(filtered.provider).toBe('openai'); + expect(filtered.model).toBe('gpt-4o'); + expect(filtered.inputTokens).toBe(100); + expect(filtered.outputTokens).toBe(50); + // Zero values should be filtered out + expect(filtered.inputCachedTokens).toBeUndefined(); + expect(filtered.inputAudioTokens).toBeUndefined(); + expect(filtered.sessionDurationMs).toBeUndefined(); + }); + + it('should filter out zero values from TTSModelUsage', () => { + const usage: TTSModelUsage = { + type: 'tts_usage', + provider: 'elevenlabs', + model: 'eleven_turbo_v2', + inputTokens: 0, + outputTokens: 0, + charactersCount: 500, + audioDurationMs: 3000, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('tts_usage'); + expect(filtered.provider).toBe('elevenlabs'); + expect(filtered.charactersCount).toBe(500); + expect(filtered.audioDurationMs).toBe(3000); + expect(filtered.inputTokens).toBeUndefined(); + expect(filtered.outputTokens).toBeUndefined(); + }); + + it('should keep all values when none are zero', () => { + const usage: STTModelUsage = { + type: 'stt_usage', + provider: 'deepgram', + model: 'nova-2', + inputTokens: 10, + outputTokens: 20, + audioDurationMs: 5000, + }; + + const filtered = filterZeroValues(usage); + + expect(Object.keys(filtered)).toHaveLength(6); + expect(filtered).toEqual(usage); + }); + }); + + describe('ModelUsageCollector', () => { + let collector: ModelUsageCollector; + + beforeEach(() => { + collector = new ModelUsageCollector(); + }); + + describe('collect LLM metrics', () => { + it('should aggregate LLM metrics by provider and model', () => { + const metrics1: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 50, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const metrics2: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 150, + ttftMs: 60, + cancelled: false, + completionTokens: 150, + promptTokens: 300, + promptCachedTokens: 75, + totalTokens: 450, + tokensPerSecond: 12, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + 
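+      // both requests carry the same provider/model metadata, so they aggregate under one 'openai:gpt-4o' row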
expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o'); + expect(llmUsage.inputTokens).toBe(500); // 200 + 300 + expect(llmUsage.inputCachedTokens).toBe(125); // 50 + 75 + expect(llmUsage.outputTokens).toBe(250); // 100 + 150 + }); + + it('should separate metrics by different providers', () => { + const openaiMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const anthropicMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 120, + ttftMs: 55, + cancelled: false, + completionTokens: 80, + promptTokens: 150, + promptCachedTokens: 0, + totalTokens: 230, + tokensPerSecond: 8, + metadata: { + modelProvider: 'anthropic', + modelName: 'claude-3-5-sonnet', + }, + }; + + collector.collect(openaiMetrics); + collector.collect(anthropicMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(2); + + const openaiUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'openai', + ) as LLMModelUsage; + const anthropicUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'anthropic', + ) as LLMModelUsage; + + expect(openaiUsage.inputTokens).toBe(200); + expect(openaiUsage.outputTokens).toBe(100); + expect(anthropicUsage.inputTokens).toBe(150); + expect(anthropicUsage.outputTokens).toBe(80); + }); + }); + + describe('collect TTS metrics', () => { + it('should aggregate TTS metrics by provider and model', () => { + const metrics1: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + inputTokens: 10, + outputTokens: 20, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const metrics2: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 120, + durationMs: 600, + audioDurationMs: 4000, + cancelled: false, + charactersCount: 200, + inputTokens: 15, + outputTokens: 25, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const ttsUsage = usage[0] as TTSModelUsage; + expect(ttsUsage.type).toBe('tts_usage'); + expect(ttsUsage.provider).toBe('elevenlabs'); + expect(ttsUsage.model).toBe('eleven_turbo_v2'); + expect(ttsUsage.charactersCount).toBe(300); // 100 + 200 + expect(ttsUsage.audioDurationMs).toBe(7000); // 3000 + 4000 + expect(ttsUsage.inputTokens).toBe(25); // 10 + 15 + expect(ttsUsage.outputTokens).toBe(45); // 20 + 25 + }); + }); + + describe('collect STT metrics', () => { + it('should aggregate STT metrics by provider and model', () => { + const metrics1: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + inputTokens: 50, + outputTokens: 100, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + const metrics2: STTMetrics = { + type: 'stt_metrics', + label: 'test', + 
requestId: 'req2', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 3000, + inputTokens: 30, + outputTokens: 60, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const sttUsage = usage[0] as STTModelUsage; + expect(sttUsage.type).toBe('stt_usage'); + expect(sttUsage.provider).toBe('deepgram'); + expect(sttUsage.model).toBe('nova-2'); + expect(sttUsage.audioDurationMs).toBe(8000); // 5000 + 3000 + expect(sttUsage.inputTokens).toBe(80); // 50 + 30 + expect(sttUsage.outputTokens).toBe(160); // 100 + 60 + }); + }); + + describe('collect realtime model metrics', () => { + it('should aggregate realtime model metrics with detailed token breakdown', () => { + const metrics: RealtimeModelMetrics = { + type: 'realtime_model_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 1000, + ttftMs: 100, + cancelled: false, + inputTokens: 500, + outputTokens: 300, + totalTokens: 800, + tokensPerSecond: 10, + sessionDurationMs: 5000, + inputTokenDetails: { + audioTokens: 200, + textTokens: 250, + imageTokens: 50, + cachedTokens: 100, + cachedTokensDetails: { + audioTokens: 30, + textTokens: 50, + imageTokens: 20, + }, + }, + outputTokenDetails: { + textTokens: 200, + audioTokens: 100, + imageTokens: 0, + }, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o-realtime', + }, + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o-realtime'); + expect(llmUsage.inputTokens).toBe(500); + expect(llmUsage.inputCachedTokens).toBe(100); + expect(llmUsage.inputAudioTokens).toBe(200); + expect(llmUsage.inputCachedAudioTokens).toBe(30); + expect(llmUsage.inputTextTokens).toBe(250); + expect(llmUsage.inputCachedTextTokens).toBe(50); + expect(llmUsage.inputImageTokens).toBe(50); + expect(llmUsage.inputCachedImageTokens).toBe(20); + expect(llmUsage.outputTokens).toBe(300); + expect(llmUsage.outputTextTokens).toBe(200); + expect(llmUsage.outputAudioTokens).toBe(100); + expect(llmUsage.sessionDurationMs).toBe(5000); + }); + }); + + describe('mixed metrics collection', () => { + it('should collect and separate LLM, TTS, and STT metrics', () => { + const llmMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const ttsMetrics: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const sttMetrics: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req3', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(llmMetrics); + collector.collect(ttsMetrics); + 
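+      // each collect() call lands in its own typed bucket; modalities never share a usage row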
collector.collect(sttMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(3); + + const llmUsage = usage.find((u) => u.type === 'llm_usage'); + const ttsUsage = usage.find((u) => u.type === 'tts_usage'); + const sttUsage = usage.find((u) => u.type === 'stt_usage'); + + expect(llmUsage).toBeDefined(); + expect(ttsUsage).toBeDefined(); + expect(sttUsage).toBeDefined(); + }); + }); + + describe('flatten returns copies', () => { + it('should return deep copies of usage objects', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics); + + const usage1 = collector.flatten(); + const usage2 = collector.flatten(); + + // Should be equal values + expect(usage1[0]).toEqual(usage2[0]); + + // But not the same object reference + expect(usage1[0]).not.toBe(usage2[0]); + + // Modifying one shouldn't affect the other + (usage1[0] as LLMModelUsage).inputTokens = 9999; + expect((usage2[0] as LLMModelUsage).inputTokens).toBe(200); + }); + }); + + describe('handles missing metadata', () => { + it('should use empty strings when metadata is missing', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + // No metadata + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.provider).toBe(''); + expect(llmUsage.model).toBe(''); + }); + }); + + describe('ignores VAD and EOU metrics', () => { + it('should not collect VAD metrics', () => { + const vadMetrics = { + type: 'vad_metrics' as const, + label: 'test', + timestamp: Date.now(), + idleTimeMs: 100, + inferenceDurationTotalMs: 50, + inferenceCount: 10, + }; + + collector.collect(vadMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(0); + }); + + it('should not collect EOU metrics', () => { + const eouMetrics = { + type: 'eou_metrics' as const, + timestamp: Date.now(), + endOfUtteranceDelayMs: 100, + transcriptionDelayMs: 50, + onUserTurnCompletedDelayMs: 30, + lastSpeakingTimeMs: Date.now(), + }; + + collector.collect(eouMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(0); + }); + }); + }); +}); diff --git a/agents/src/metrics/model_usage.ts b/agents/src/metrics/model_usage.ts new file mode 100644 index 000000000..d90ed7123 --- /dev/null +++ b/agents/src/metrics/model_usage.ts @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { + AgentMetrics, + LLMMetrics, + RealtimeModelMetrics, + STTMetrics, + TTSMetrics, +} from './base.js'; + +export type LLMModelUsage = { + type: 'llm_usage'; + /** The provider name (e.g., 'openai', 'anthropic'). */ + provider: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + model: string; + /** Total input tokens. */ + inputTokens: number; + /** Input tokens served from cache. */ + inputCachedTokens: number; + /** Input audio tokens (for multimodal models). 
*/
+  inputAudioTokens: number;
+  /** Cached input audio tokens. */
+  inputCachedAudioTokens: number;
+  /** Input text tokens. */
+  inputTextTokens: number;
+  /** Cached input text tokens. */
+  inputCachedTextTokens: number;
+  /** Input image tokens (for multimodal models). */
+  inputImageTokens: number;
+  /** Cached input image tokens. */
+  inputCachedImageTokens: number;
+  /** Total output tokens. */
+  outputTokens: number;
+  /** Output audio tokens (for multimodal models). */
+  outputAudioTokens: number;
+  /** Output text tokens. */
+  outputTextTokens: number;
+  /** Total session connection duration in milliseconds (for session-based billing like xAI). */
+  sessionDurationMs: number;
+};
+
+export type TTSModelUsage = {
+  type: 'tts_usage';
+  /** The provider name (e.g., 'elevenlabs', 'cartesia'). */
+  provider: string;
+  /** The model name (e.g., 'eleven_turbo_v2', 'sonic'). */
+  model: string;
+  /** Input text tokens (for token-based TTS billing, e.g., OpenAI TTS). */
+  inputTokens: number;
+  /** Output audio tokens (for token-based TTS billing, e.g., OpenAI TTS). */
+  outputTokens: number;
+  /** Number of characters synthesized (for character-based TTS billing). */
+  charactersCount: number;
+  /**
+   * Duration of generated audio in milliseconds.
+   */
+  audioDurationMs: number;
+};
+
+export type STTModelUsage = {
+  type: 'stt_usage';
+  /** The provider name (e.g., 'deepgram', 'assemblyai'). */
+  provider: string;
+  /** The model name (e.g., 'nova-2', 'best'). */
+  model: string;
+  /** Input audio tokens (for token-based STT billing). */
+  inputTokens: number;
+  /** Output text tokens (for token-based STT billing). */
+  outputTokens: number;
+  /** Duration of processed audio in milliseconds. */
+  audioDurationMs: number;
+};
+
+export type ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage;
+
+export function filterZeroValues<T extends ModelUsage>(usage: T): Partial<T> {
+  const result: Partial<T> = {} as Partial<T>;
+  for (const [key, value] of Object.entries(usage)) {
+    if (value !== 0 && value !== 0.0) {
+      (result as Record<string, unknown>)[key] = value;
+    }
+  }
+  return result;
+}
+
+export class ModelUsageCollector {
+  private llmUsage: Map<string, LLMModelUsage> = new Map();
+  private ttsUsage: Map<string, TTSModelUsage> = new Map();
+  private sttUsage: Map<string, STTModelUsage> = new Map();
+
+  /** Extract provider and model from metrics metadata. */
+  private extractProviderModel(
+    metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics,
+  ): [string, string] {
+    let provider = '';
+    let model = '';
+    if (metrics.metadata) {
+      provider = metrics.metadata.modelProvider || '';
+      model = metrics.metadata.modelName || '';
+    }
+    return [provider, model];
+  }
+
+  /** Get or create an LLMModelUsage for the given provider/model combination. */
+  private getLLMUsage(provider: string, model: string): LLMModelUsage {
+    const key = `${provider}:${model}`;
+    let usage = this.llmUsage.get(key);
+    if (!usage) {
+      usage = {
+        type: 'llm_usage',
+        provider,
+        model,
+        inputTokens: 0,
+        inputCachedTokens: 0,
+        inputAudioTokens: 0,
+        inputCachedAudioTokens: 0,
+        inputTextTokens: 0,
+        inputCachedTextTokens: 0,
+        inputImageTokens: 0,
+        inputCachedImageTokens: 0,
+        outputTokens: 0,
+        outputAudioTokens: 0,
+        outputTextTokens: 0,
+        sessionDurationMs: 0,
+      };
+      this.llmUsage.set(key, usage);
+    }
+    return usage;
+  }
+
+  /** Get or create a TTSModelUsage for the given provider/model combination. 
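+   * Rows are keyed by `provider:model`, like getLLMUsage above.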
*/ + private getTTSUsage(provider: string, model: string): TTSModelUsage { + const key = `${provider}:${model}`; + let usage = this.ttsUsage.get(key); + if (!usage) { + usage = { + type: 'tts_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + charactersCount: 0, + audioDurationMs: 0, + }; + this.ttsUsage.set(key, usage); + } + return usage; + } + + /** Get or create an STTModelUsage for the given provider/model combination. */ + private getSTTUsage(provider: string, model: string): STTModelUsage { + const key = `${provider}:${model}`; + let usage = this.sttUsage.get(key); + if (!usage) { + usage = { + type: 'stt_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + audioDurationMs: 0, + }; + this.sttUsage.set(key, usage); + } + return usage; + } + + /** Collect metrics and aggregate usage by model/provider. */ + collect(metrics: AgentMetrics): void { + if (metrics.type === 'llm_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.promptTokens; + usage.inputCachedTokens += metrics.promptCachedTokens; + usage.outputTokens += metrics.completionTokens; + } else if (metrics.type === 'realtime_model_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.inputTokens; + usage.inputCachedTokens += metrics.inputTokenDetails.cachedTokens; + + usage.inputTextTokens += metrics.inputTokenDetails.textTokens; + usage.inputCachedTextTokens += metrics.inputTokenDetails.cachedTokensDetails?.textTokens ?? 0; + usage.inputImageTokens += metrics.inputTokenDetails.imageTokens; + usage.inputCachedImageTokens += + metrics.inputTokenDetails.cachedTokensDetails?.imageTokens ?? 0; + usage.inputAudioTokens += metrics.inputTokenDetails.audioTokens; + usage.inputCachedAudioTokens += + metrics.inputTokenDetails.cachedTokensDetails?.audioTokens ?? 0; + + usage.outputTextTokens += metrics.outputTokenDetails.textTokens; + usage.outputAudioTokens += metrics.outputTokenDetails.audioTokens; + usage.outputTokens += metrics.outputTokens; + usage.sessionDurationMs += metrics.sessionDurationMs ?? 0; + } else if (metrics.type === 'tts_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const ttsUsage = this.getTTSUsage(provider, model); + ttsUsage.inputTokens += metrics.inputTokens ?? 0; + ttsUsage.outputTokens += metrics.outputTokens ?? 0; + ttsUsage.charactersCount += metrics.charactersCount; + ttsUsage.audioDurationMs += metrics.audioDurationMs; + } else if (metrics.type === 'stt_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const sttUsage = this.getSTTUsage(provider, model); + sttUsage.inputTokens += metrics.inputTokens ?? 0; + sttUsage.outputTokens += metrics.outputTokens ?? 0; + sttUsage.audioDurationMs += metrics.audioDurationMs; + } + // VAD and EOU metrics are not aggregated for usage tracking. 
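+    // Illustrative wiring (not part of this diff; assumes the session re-emits
+    // metrics through its `metrics_collected` event):
+    //   const collector = new ModelUsageCollector();
+    //   session.on('metrics_collected', (ev) => collector.collect(ev.metrics));
+    //   console.log(collector.flatten()); // one aggregated row per provider:model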
+ } + + flatten(): ModelUsage[] { + const result: ModelUsage[] = []; + for (const u of this.llmUsage.values()) { + result.push({ ...u }); + } + for (const u of this.ttsUsage.values()) { + result.push({ ...u }); + } + for (const u of this.sttUsage.values()) { + result.push({ ...u }); + } + return result; + } +} diff --git a/agents/src/metrics/usage_collector.ts b/agents/src/metrics/usage_collector.ts index c7f0e6c3d..74edc2d14 100644 --- a/agents/src/metrics/usage_collector.ts +++ b/agents/src/metrics/usage_collector.ts @@ -1,8 +1,18 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import { log } from '../log.js'; import type { AgentMetrics } from './base.js'; +// Ref: python livekit-agents/livekit/agents/metrics/usage_collector.py - lines 10-14 (diff) +// NOTE: Python uses warnings.warn() for deprecation at runtime. +// TypeScript uses JSDoc @deprecated which shows in IDE. +// We also add optional console.warn() in constructor for runtime parity. +/** + * @deprecated Use LLMModelUsage, TTSModelUsage, or STTModelUsage from './model_usage.js' instead. + * These new types provide per-model/provider usage aggregation for more detailed tracking. + * Ref: python livekit-agents/livekit/agents/metrics/usage_collector.py - lines 10-14 (diff) + */ export interface UsageSummary { llmPromptTokens: number; llmPromptCachedTokens: number; @@ -11,10 +21,16 @@ export interface UsageSummary { sttAudioDurationMs: number; } +/** + * @deprecated Use ModelUsageCollector from './model_usage.js' instead. + * ModelUsageCollector provides per-model/provider usage aggregation for more detailed tracking. + */ export class UsageCollector { private summary: UsageSummary; + private logger = log(); constructor() { + this.logger.warn('UsageCollector is deprecated. Use ModelUsageCollector instead.'); this.summary = { llmPromptTokens: 0, llmPromptCachedTokens: 0, diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts index 48c689ba2..523689d5e 100644 --- a/agents/src/stt/stt.ts +++ b/agents/src/stt/stt.ts @@ -59,6 +59,10 @@ export interface SpeechData { export interface RecognitionUsage { audioDuration: number; + /** Input audio tokens (for token-based STT billing). */ + inputTokens?: number; + /** Output text tokens (for token-based STT billing). */ + outputTokens?: number; } /** SpeechEvent is a packet of speech-to-text data. */ @@ -121,6 +125,30 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter { const startTime = process.hrtime.bigint(); @@ -134,6 +162,10 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter durationMs: 0, label: this.#stt.label, audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000), + inputTokens: event.recognitionUsage!.inputTokens ?? 0, + outputTokens: event.recognitionUsage!.outputTokens ?? 
0, streamed: true, + metadata: { + modelProvider: this.#stt.provider, + modelName: this.#stt.model, + }, }; this.#stt.emit('metrics_collected', metrics); } diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index 7220ec03a..878b883b5 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -30,6 +30,11 @@ export const ATTR_FUNCTION_TOOLS = 'lk.function_tools'; export const ATTR_RESPONSE_TEXT = 'lk.response.text'; export const ATTR_RESPONSE_FUNCTION_CALLS = 'lk.response.function_calls'; +// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 5-6 (diff) +// New latency attributes for response timing +/** Time to first token in seconds. */ +export const ATTR_RESPONSE_TTFT = 'lk.response.ttft'; // Ref: line 5 (ATTR_RESPONSE_TTFT) + // function tool export const ATTR_FUNCTION_TOOL_NAME = 'lk.function_tool.name'; export const ATTR_FUNCTION_TOOL_ARGS = 'lk.function_tool.arguments'; @@ -41,6 +46,10 @@ export const ATTR_TTS_INPUT_TEXT = 'lk.input_text'; export const ATTR_TTS_STREAMING = 'lk.tts.streaming'; export const ATTR_TTS_LABEL = 'lk.tts.label'; +// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 10-11 (diff) +/** Time to first byte in seconds. */ +export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb'; // Ref: line 10 (ATTR_RESPONSE_TTFB) + // eou detection export const ATTR_EOU_PROBABILITY = 'lk.eou.probability'; export const ATTR_EOU_UNLIKELY_THRESHOLD = 'lk.eou.unlikely_threshold'; @@ -63,10 +72,18 @@ export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; export const ATTR_REALTIME_MODEL_METRICS = 'lk.realtime_model_metrics'; +// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 16-17 (diff) +// latency span attributes +/** End-to-end latency in seconds. */ +export const ATTR_E2E_LATENCY = 'lk.e2e_latency'; // Ref: line 17 (ATTR_E2E_LATENCY) + // OpenTelemetry GenAI attributes // OpenTelemetry specification: https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/ export const ATTR_GEN_AI_OPERATION_NAME = 'gen_ai.operation.name'; export const ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'; +// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 22-23 (diff) +/** The provider name (e.g., 'openai', 'anthropic'). 
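+ * Matches the OpenTelemetry GenAI semantic-conventions registry linked above.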
*/ +export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name'; // Ref: line 23 export const ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'; export const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'; diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index 28ef4c746..6f39ba427 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -24,6 +24,7 @@ import { AccessToken } from 'livekit-server-sdk'; import fs from 'node:fs/promises'; import type { ChatContent, ChatItem } from '../llm/index.js'; import { enableOtelLogging } from '../log.js'; +import { filterZeroValues } from '../metrics/model_usage.js'; import type { SessionReport } from '../voice/report.js'; import { type SimpleLogRecord, SimpleOTLPHttpLogExporter } from './otel_http_exporter.js'; import { flushPinoLogs, initPinoCloudExporter } from './pino_otel_transport.js'; @@ -445,6 +446,8 @@ export async function uploadSessionReport(options: { 'logger.name': 'chat_history', }; + const usage = report.modelUsage?.map(filterZeroValues) || null; + logRecords.push({ body: 'session report', timestampMs: report.startedAt || report.timestamp || 0, @@ -453,6 +456,7 @@ export async function uploadSessionReport(options: { 'session.options': report.options || {}, 'session.report_timestamp': report.timestamp, agent_name: agentName, + usage, }, }); diff --git a/agents/src/tts/tts.ts b/agents/src/tts/tts.ts index 8ee46515a..2595451da 100644 --- a/agents/src/tts/tts.ts +++ b/agents/src/tts/tts.ts @@ -87,6 +87,30 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter; #ttsRequestSpan?: Span; + #inputTokens = 0; + #outputTokens = 0; constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) { this.#tts = tts; @@ -275,6 +301,18 @@ export abstract class SynthesizeStream } } + /** + * Set token usage for token-based TTS billing (e.g., OpenAI TTS). + * Plugins should call this method to report token usage. 
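+   * Counters are reset after each metrics emission, so report usage once per synthesized segment.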
+ */ + protected setTokenUsage({ + inputTokens = 0, + outputTokens = 0, + }: { inputTokens?: number; outputTokens?: number } = {}): void { + this.#inputTokens = inputTokens; + this.#outputTokens = outputTokens; + } + protected async monitorMetrics() { const startTime = process.hrtime.bigint(); let audioDurationMs = 0; @@ -296,12 +334,22 @@ export abstract class SynthesizeStream audioDurationMs: roundedAudioDurationMs, cancelled: this.abortController.signal.aborted, label: this.#tts.label, - streamed: false, + inputTokens: this.#inputTokens, + outputTokens: this.#outputTokens, + streamed: true, + metadata: { + modelProvider: this.#tts.provider, + modelName: this.#tts.model, + }, }; if (this.#ttsRequestSpan) { this.#ttsRequestSpan.setAttribute(traceTypes.ATTR_TTS_METRICS, JSON.stringify(metrics)); } this.#tts.emit('metrics_collected', metrics); + + // Reset token usage after emitting metrics for the next segment + this.#inputTokens = 0; + this.#outputTokens = 0; } }; @@ -425,6 +473,8 @@ export abstract class ChunkedStream implements AsyncIterableIterator; private logger = log(); @@ -128,6 +132,8 @@ export class AudioRecognition { this.maxEndpointingDelay = opts.maxEndpointingDelay; this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; + this.sttModel = opts.sttModel; + this.sttProvider = opts.sttProvider; this.deferredInputStream = new DeferredReadableStream(); const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee(); @@ -804,6 +810,16 @@ export class AudioRecognition { context: this.rootSpanContext, startTime, }); + + if (this.sttModel) { + this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel); + } + if (this.sttProvider) { + this.userTurnSpan.setAttribute( + traceTypes.ATTR_GEN_AI_PROVIDER_NAME, + this.sttProvider, + ); + } } // Capture sample rate from the first VAD event if not already set diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 06867c43d..0eab02545 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -36,6 +36,8 @@ export class _LLMGenerationData { generatedText: string = ''; generatedToolCalls: FunctionCall[]; id: string; + // Time to first token in seconds (for TTFT span attribute) + ttft?: number; constructor( public readonly textStream: ReadableStream, @@ -380,12 +382,16 @@ export function updateInstructions(options: { } } +// Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 3-7 (diff) +// Added model and provider parameters to generation functions export function performLLMInference( node: LLMNode, chatCtx: ChatContext, toolCtx: ToolContext, modelSettings: ModelSettings, controller: AbortController, + model?: string, // Ref: line 5 (model: str | None = None) + provider?: string, // Ref: line 6 (provider: str | None = None) ): [Task, _LLMGenerationData] { const textStream = new IdentityTransform(); const toolCallStream = new IdentityTransform(); @@ -401,8 +407,22 @@ export function performLLMInference( ); span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx))); + // Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 36-48 (diff) + // Set model/provider attributes on the span + if (model) { + // Ref: lines 44-45 + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + // Ref: lines 46-47 + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let llmStreamReader: ReadableStreamDefaultReader | null = null; let llmStream: 
ReadableStream | null = null; + // Track start time for TTFT calculation + const startTime = performance.now() / 1000; // Convert to seconds + let firstTokenReceived = false; try { llmStream = await node(chatCtx, toolCtx, modelSettings); @@ -425,6 +445,12 @@ export function performLLMInference( const { done, value: chunk } = result; if (done) break; + // Track time to first token + if (!firstTokenReceived) { + firstTokenReceived = true; + data.ttft = performance.now() / 1000 - startTime; + } + if (typeof chunk === 'string') { data.generatedText += chunk; await textWriter.write(chunk); @@ -463,6 +489,9 @@ export function performLLMInference( } span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, data.generatedText); + if (data.ttft !== undefined) { + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFT, data.ttft); + } } catch (error) { if (error instanceof DOMException && error.name === 'AbortError') { // Abort signal was triggered, handle gracefully @@ -492,19 +521,37 @@ export function performLLMInference( ]; } +// Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 77-82 (diff) +// Added model and provider parameters for TTS generation export function performTTSInference( node: TTSNode, text: ReadableStream, modelSettings: ModelSettings, controller: AbortController, + model?: string, // Ref: line 79 (model: str | None = None) + provider?: string, // Ref: line 80 (provider: str | None = None) ): [Task, ReadableStream] { const audioStream = new IdentityTransform(); const outputWriter = audioStream.writable.getWriter(); const audioOutputStream = audioStream.readable; - const _performTTSInferenceImpl = async (signal: AbortSignal) => { + const _performTTSInferenceImpl = async (signal: AbortSignal, span: Span) => { + // Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 77-82 (diff) + // Set model/provider attributes on the span + if (model) { + // Ref: lines 79-80 + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + // Ref: lines 81-82 + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let ttsStreamReader: ReadableStreamDefaultReader | null = null; let ttsStream: ReadableStream | null = null; + // Track start time for TTFB calculation + const startTime = performance.now() / 1000; // Convert to seconds + let firstByteReceived = false; try { ttsStream = await node(text, modelSettings); @@ -522,6 +569,14 @@ export function performTTSInference( if (done) { break; } + + // Track time to first byte and set span attribute + if (!firstByteReceived) { + firstByteReceived = true; + const ttfb = performance.now() / 1000 - startTime; + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFB, ttfb); + } + await outputWriter.write(chunk); } } catch (error) { @@ -541,7 +596,7 @@ export function performTTSInference( const currentContext = otelContext.active(); const inferenceTask = async (signal: AbortSignal) => - tracer.startActiveSpan(async () => _performTTSInferenceImpl(signal), { + tracer.startActiveSpan(async (span) => _performTTSInferenceImpl(signal, span), { name: 'tts_node', context: currentContext, }); diff --git a/agents/src/voice/report.ts b/agents/src/voice/report.ts index 49701a696..b18c1e795 100644 --- a/agents/src/voice/report.ts +++ b/agents/src/voice/report.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext } from '../llm/chat_context.js'; +import { type ModelUsage, filterZeroValues } from '../metrics/model_usage.js'; import type { VoiceOptions } from './agent_session.js'; 
import type { AgentEvent } from './events.js';
@@ -23,6 +24,8 @@ export interface SessionReport {
   audioRecordingStartedAt?: number;
   /** Duration of the session in milliseconds */
   duration?: number;
+  /** Usage summaries for the session, one per model/provider combination */
+  modelUsage?: ModelUsage[];
 }
 
 export interface SessionReportOptions {
@@ -41,6 +44,8 @@
   audioRecordingPath?: string;
   /** Timestamp when the audio recording started (milliseconds) */
   audioRecordingStartedAt?: number;
+  /** Usage summaries for the session, one per model/provider combination */
+  modelUsage?: ModelUsage[];
 }
 
 export function createSessionReport(opts: SessionReportOptions): SessionReport {
@@ -61,6 +66,7 @@ export function createSessionReport(opts: SessionReportOptions): SessionReport {
     audioRecordingStartedAt,
     duration:
       audioRecordingStartedAt !== undefined ? timestamp - audioRecordingStartedAt : undefined,
+    modelUsage: opts.modelUsage,
   };
 }
 
@@ -96,5 +102,6 @@ export function sessionReportToJSON(report: SessionReport): Record<string, unknown> {

From: Brian Yin
Date: Mon, 2 Feb 2026 18:04:04 -0800
Subject: [PATCH 25/26] fix comments

---
 agents/src/metrics/usage_collector.ts |  9 ++-------
 agents/src/telemetry/trace_types.ts   | 13 ++++---------
 agents/src/voice/agent_session.ts     | 18 +-----------------
 agents/src/voice/generation.ts        | 27 ++++-----------------------
 4 files changed, 11 insertions(+), 56 deletions(-)

diff --git a/agents/src/metrics/usage_collector.ts b/agents/src/metrics/usage_collector.ts
index 74edc2d14..c815c8394 100644
--- a/agents/src/metrics/usage_collector.ts
+++ b/agents/src/metrics/usage_collector.ts
@@ -4,14 +4,9 @@
 import { log } from '../log.js';
 import type { AgentMetrics } from './base.js';
 
-// Ref: python livekit-agents/livekit/agents/metrics/usage_collector.py - lines 10-14 (diff)
-// NOTE: Python uses warnings.warn() for deprecation at runtime.
-// TypeScript uses JSDoc @deprecated which shows in IDE.
-// We also add optional console.warn() in constructor for runtime parity.
 /**
- * @deprecated Use LLMModelUsage, TTSModelUsage, or STTModelUsage from './model_usage.js' instead.
+ * @deprecated Use LLMModelUsage, TTSModelUsage, or STTModelUsage instead.
  * These new types provide per-model/provider usage aggregation for more detailed tracking.
- * Ref: python livekit-agents/livekit/agents/metrics/usage_collector.py - lines 10-14 (diff)
  */
 export interface UsageSummary {
   llmPromptTokens: number;
@@ -22,7 +17,7 @@ export interface UsageSummary {
 }
 
 /**
- * @deprecated Use ModelUsageCollector from './model_usage.js' instead.
+ * @deprecated Use ModelUsageCollector instead.
  * ModelUsageCollector provides per-model/provider usage aggregation for more detailed tracking.
  */
 export class UsageCollector {

diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts
index 878b883b5..3a0afbd0a 100644
--- a/agents/src/telemetry/trace_types.ts
+++ b/agents/src/telemetry/trace_types.ts
@@ -30,10 +30,9 @@ export const ATTR_FUNCTION_TOOLS = 'lk.function_tools';
 export const ATTR_RESPONSE_TEXT = 'lk.response.text';
 export const ATTR_RESPONSE_FUNCTION_CALLS = 'lk.response.function_calls';
 
-// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 5-6 (diff)
 // New latency attributes for response timing
 /** Time to first token in seconds. 
*/ -export const ATTR_RESPONSE_TTFT = 'lk.response.ttft'; // Ref: line 5 (ATTR_RESPONSE_TTFT) +export const ATTR_RESPONSE_TTFT = 'lk.response.ttft'; // function tool export const ATTR_FUNCTION_TOOL_NAME = 'lk.function_tool.name'; @@ -46,9 +45,8 @@ export const ATTR_TTS_INPUT_TEXT = 'lk.input_text'; export const ATTR_TTS_STREAMING = 'lk.tts.streaming'; export const ATTR_TTS_LABEL = 'lk.tts.label'; -// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 10-11 (diff) /** Time to first byte in seconds. */ -export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb'; // Ref: line 10 (ATTR_RESPONSE_TTFB) +export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb'; // eou detection export const ATTR_EOU_PROBABILITY = 'lk.eou.probability'; @@ -72,18 +70,15 @@ export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; export const ATTR_REALTIME_MODEL_METRICS = 'lk.realtime_model_metrics'; -// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 16-17 (diff) -// latency span attributes /** End-to-end latency in seconds. */ -export const ATTR_E2E_LATENCY = 'lk.e2e_latency'; // Ref: line 17 (ATTR_E2E_LATENCY) +export const ATTR_E2E_LATENCY = 'lk.e2e_latency'; // OpenTelemetry GenAI attributes // OpenTelemetry specification: https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/ export const ATTR_GEN_AI_OPERATION_NAME = 'gen_ai.operation.name'; export const ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'; -// Ref: python livekit-agents/livekit/agents/telemetry/trace_types.py - lines 22-23 (diff) /** The provider name (e.g., 'openai', 'anthropic'). */ -export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name'; // Ref: line 23 +export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name'; export const ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'; export const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'; diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index d20861b0e..f81b40d60 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -17,7 +17,6 @@ import { } from '../inference/index.js'; import type { InterruptionEvent } from '../inference/interruption/types.js'; import { type JobContext, getJobContext } from '../job.js'; -// Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 5-6 (diff) import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js'; @@ -72,15 +71,9 @@ import type { } from './turn_config/turn_handling.js'; import { migrateLegacyOptions } from './turn_config/utils.js'; -// Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 17-19 (diff) -// NOTE: Python uses @dataclass. TypeScript uses interface. -/** - * Usage summary for an AgentSession, aggregated per model/provider combination. - * Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 17-19 (diff) - */ export interface AgentSessionUsage { /** List of usage summaries, one per model/provider combination. 
*/ - modelUsage: ModelUsage[]; // Ref: line 18 (model_usage: list[ModelUsage]) + modelUsage: ModelUsage[]; } export interface SessionOptions { @@ -206,8 +199,6 @@ export class AgentSession< private _interruptionDetection?: InterruptionConfig['mode']; - // Ref: python livekit-agents/livekit/agents/voice/agent_session.py - line 34 (diff) - // Collects and aggregates usage metrics per model/provider combination private _usageCollector: ModelUsageCollector = new ModelUsageCollector(); /** @internal */ @@ -290,9 +281,6 @@ export class AgentSession< ): boolean { const eventData = args[0] as AgentEvent; this._recordedEvents.push(eventData); - // Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 39-40 (diff) - // if isinstance(arg, MetricsCollectedEvent): - // self._usage_collector.collect(arg.metrics) if (event === AgentSessionEventTypes.MetricsCollected) { this._usageCollector.collect((eventData as MetricsCollectedEvent).metrics); } @@ -328,10 +316,8 @@ export class AgentSession< return this._interruptionDetection; } - // Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 45-48 (diff) /** * Returns usage summaries for this session, one per model/provider combination. - * Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 45-48 (diff) */ get usage(): AgentSessionUsage { return { modelUsage: this._usageCollector.flatten() }; @@ -959,8 +945,6 @@ export class AgentSession< this.rootSpanContext = undefined; this.llmErrorCounts = 0; this.ttsErrorCounts = 0; - // Ref: python livekit-agents/livekit/agents/voice/agent_session.py - lines 53-54 (diff) - // Reset collector on session reset this._usageCollector = new ModelUsageCollector(); this.logger.info({ reason, error }, 'AgentSession closed'); diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 0eab02545..fa50af22f 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -36,7 +36,6 @@ export class _LLMGenerationData { generatedText: string = ''; generatedToolCalls: FunctionCall[]; id: string; - // Time to first token in seconds (for TTFT span attribute) ttft?: number; constructor( @@ -382,16 +381,14 @@ export function updateInstructions(options: { } } -// Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 3-7 (diff) -// Added model and provider parameters to generation functions export function performLLMInference( node: LLMNode, chatCtx: ChatContext, toolCtx: ToolContext, modelSettings: ModelSettings, controller: AbortController, - model?: string, // Ref: line 5 (model: str | None = None) - provider?: string, // Ref: line 6 (provider: str | None = None) + model?: string, + provider?: string, ): [Task, _LLMGenerationData] { const textStream = new IdentityTransform(); const toolCallStream = new IdentityTransform(); @@ -407,20 +404,15 @@ export function performLLMInference( ); span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx))); - // Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 36-48 (diff) - // Set model/provider attributes on the span if (model) { - // Ref: lines 44-45 span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); } if (provider) { - // Ref: lines 46-47 span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); } let llmStreamReader: ReadableStreamDefaultReader | null = null; let llmStream: ReadableStream | null = null; - // Track start time for TTFT calculation const startTime = performance.now() / 1000; // Convert to 
seconds let firstTokenReceived = false; @@ -445,7 +437,6 @@ export function performLLMInference( const { done, value: chunk } = result; if (done) break; - // Track time to first token if (!firstTokenReceived) { firstTokenReceived = true; data.ttft = performance.now() / 1000 - startTime; @@ -521,35 +512,28 @@ export function performLLMInference( ]; } -// Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 77-82 (diff) -// Added model and provider parameters for TTS generation export function performTTSInference( node: TTSNode, text: ReadableStream, modelSettings: ModelSettings, controller: AbortController, - model?: string, // Ref: line 79 (model: str | None = None) - provider?: string, // Ref: line 80 (provider: str | None = None) + model?: string, + provider?: string, ): [Task, ReadableStream] { const audioStream = new IdentityTransform(); const outputWriter = audioStream.writable.getWriter(); const audioOutputStream = audioStream.readable; const _performTTSInferenceImpl = async (signal: AbortSignal, span: Span) => { - // Ref: python livekit-agents/livekit/agents/voice/generation.py - lines 77-82 (diff) - // Set model/provider attributes on the span if (model) { - // Ref: lines 79-80 span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); } if (provider) { - // Ref: lines 81-82 span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); } let ttsStreamReader: ReadableStreamDefaultReader | null = null; let ttsStream: ReadableStream | null = null; - // Track start time for TTFB calculation const startTime = performance.now() / 1000; // Convert to seconds let firstByteReceived = false; @@ -570,7 +554,6 @@ export function performTTSInference( break; } - // Track time to first byte and set span attribute if (!firstByteReceived) { firstByteReceived = true; const ttfb = performance.now() / 1000 - startTime; @@ -663,7 +646,6 @@ export function performTextForwarding( export interface _AudioOut { audio: Array; - /** Future that will be set with the timestamp of the first frame's capture */ firstFrameFut: Future; } @@ -751,7 +733,6 @@ export function performAudioForwarding( ]; } -// function_tool span is already implemented in tracableToolExecution below (line ~796) export function performToolExecutions({ session, speechHandle, From 3ce96e18559db886710b08b1db8aea17672ac3e5 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Tue, 3 Feb 2026 00:41:39 -0800 Subject: [PATCH 26/26] Update realtime_api.ts --- plugins/google/src/beta/realtime/realtime_api.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/plugins/google/src/beta/realtime/realtime_api.ts b/plugins/google/src/beta/realtime/realtime_api.ts index 83d8a5aa1..4b547cd3d 100644 --- a/plugins/google/src/beta/realtime/realtime_api.ts +++ b/plugins/google/src/beta/realtime/realtime_api.ts @@ -774,6 +774,8 @@ export class RealtimeSession extends llm.RealtimeSession { onmessage: (message: types.LiveServerMessage) => { this.onReceiveMessage(session, message); }, + // onerror is called for network-level errors (connection refused, DNS failure, TLS errors). + // Application-level errors (e.g., invalid model name) come through onclose with error codes. 
onerror: (error: ErrorEvent) => { this.#logger.error('Gemini Live session error:', error); if (!this.sessionShouldClose.isSet) { @@ -781,7 +783,15 @@ export class RealtimeSession extends llm.RealtimeSession { } }, onclose: (event: CloseEvent) => { - this.#logger.debug('Gemini Live session closed:', event.code, event.reason); + // Surface WebSocket close errors to the user instead of silently swallowing them + // Close code 1000 = normal closure, anything else is an error + if (event.code !== 1000) { + const errorMsg = event.reason || `WebSocket closed with code ${event.code}`; + this.#logger.error(`Gemini Live session error: ${errorMsg}`); + this.emitError(new Error(errorMsg), false); + } else { + this.#logger.debug('Gemini Live session closed:', event.code, event.reason); + } this.markCurrentGenerationDone(); }, },
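+      // Illustrative effect (example values, not from this change): a rejection
+      // like CloseEvent { code: 1011, reason: 'Requested model not found' } now
+      // surfaces through emitError instead of disappearing into a debug log.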