diff --git a/.changeset/config.json b/.changeset/config.json index af66336b2..29b38eb85 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -8,13 +8,7 @@ ], "commit": false, "ignore": ["livekit-agents-examples"], - "fixed": [ - [ - "@livekit/agents", - "@livekit/agents-plugin-*", - "@livekit/agents-plugins-test" - ] - ], + "fixed": [["@livekit/agents", "@livekit/agents-plugin-*", "@livekit/agents-plugins-test"]], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5a577688..b4472c81b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,11 +46,11 @@ jobs: - name: Test agents if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name == 'push' run: pnpm test agents - - name: Test examples - if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') && secrets.OPENAI_API_KEY != '' - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: pnpm test:examples + # - name: Test examples + # if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') + # env: + # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # run: pnpm test:examples # TODO (AJS-83) Re-enable once plugins are refactored with abort controllers # - name: Test all plugins # if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name != 'pull_request' diff --git a/agents/package.json b/agents/package.json index 001e79200..ebf5e0b72 100644 --- a/agents/package.json +++ b/agents/package.json @@ -69,6 +69,7 @@ "heap-js": "^2.6.0", "json-schema": "^0.4.0", "livekit-server-sdk": "^2.14.1", + "ofetch": "^1.5.1", "openai": "^6.8.1", "pidusage": "^4.0.1", "pino": "^8.19.0", diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts new file mode 100644 index 000000000..66f2fe85f --- /dev/null +++ b/agents/src/inference/interruption/defaults.ts @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { ApiConnectOptions } from './interruption_stream.js'; +import type { InterruptionOptions } from './types.js'; + +export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames +export const THRESHOLD = 0.65; +export const MAX_AUDIO_DURATION_IN_S = 3.0; +export const AUDIO_PREFIX_DURATION_IN_S = 0.5; +export const DETECTION_INTERVAL_IN_S = 0.1; +export const REMOTE_INFERENCE_TIMEOUT_IN_S = 1.0; +export const SAMPLE_RATE = 16000; +export const FRAMES_PER_SECOND = 40; +export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame + +/** Default production inference URL */ +export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; + +/** Staging inference URL */ +export const STAGING_BASE_URL = 'https://agent-gateway-staging.livekit.cloud/v1'; + +/** + * Get the default inference URL based on the environment. + * + * Priority: + * 1. LIVEKIT_INFERENCE_URL if set + * 2. If LIVEKIT_URL contains '.staging.livekit.cloud', use staging gateway + * 3. 
Otherwise, use production gateway
+ */
+export function getDefaultInferenceUrl(): string {
+  // Priority 1: LIVEKIT_INFERENCE_URL
+  const inferenceUrl = process.env.LIVEKIT_INFERENCE_URL;
+  if (inferenceUrl) {
+    return inferenceUrl;
+  }
+
+  // Priority 2: Check LIVEKIT_URL for staging (exact match to Python)
+  const livekitUrl = process.env.LIVEKIT_URL || '';
+  if (livekitUrl.includes('.staging.livekit.cloud')) {
+    return STAGING_BASE_URL;
+  }
+
+  // Priority 3: Default to production
+  return DEFAULT_BASE_URL;
+}
+
+export const apiConnectDefaults: ApiConnectOptions = {
+  maxRetries: 3,
+  retryInterval: 2_000,
+  timeout: 10_000,
+} as const;
+
+/**
+ * Calculate the retry interval using exponential backoff with jitter.
+ * Matches the Python implementation's _interval_for_retry behavior.
+ */
+export function intervalForRetry(
+  attempt: number,
+  baseInterval: number = apiConnectDefaults.retryInterval,
+): number {
+  // Exponential backoff: baseInterval * 2^attempt with some jitter
+  const exponentialDelay = baseInterval * Math.pow(2, attempt);
+  // Add jitter (0-25% of the delay)
+  const jitter = exponentialDelay * Math.random() * 0.25;
+  return exponentialDelay + jitter;
+}
+
+// baseUrl and useProxy are resolved dynamically in the constructor
+// to respect LIVEKIT_REMOTE_EOT_URL environment variable
+export const interruptionOptionDefaults: Omit<InterruptionOptions, 'baseUrl' | 'useProxy'> = {
+  sampleRate: SAMPLE_RATE,
+  threshold: THRESHOLD,
+  minFrames: Math.ceil(MIN_INTERRUPTION_DURATION_IN_S * FRAMES_PER_SECOND),
+  maxAudioDurationInS: MAX_AUDIO_DURATION_IN_S,
+  audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S,
+  detectionIntervalInS: DETECTION_INTERVAL_IN_S,
+  inferenceTimeout: REMOTE_INFERENCE_TIMEOUT_IN_S * 1_000,
+  apiKey: process.env.LIVEKIT_API_KEY || '',
+  apiSecret: process.env.LIVEKIT_API_SECRET || '',
+  minInterruptionDurationInS: MIN_INTERRUPTION_DURATION_IN_S,
+} as const;
diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts
new file mode 100644
index 000000000..30842fc0f
--- /dev/null
+++ b/agents/src/inference/interruption/errors.ts
@@ -0,0 +1,25 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Error thrown during interruption detection.
+ */
+export class InterruptionDetectionError extends Error {
+  readonly type = 'InterruptionDetectionError';
+
+  readonly timestamp: number;
+  readonly label: string;
+  readonly recoverable: boolean;
+
+  constructor(message: string, timestamp: number, label: string, recoverable: boolean) {
+    super(message);
+    this.name = 'InterruptionDetectionError';
+    this.timestamp = timestamp;
+    this.label = label;
+    this.recoverable = recoverable;
+  }
+
+  toString(): string {
+    return `${this.name}: ${this.message} (label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`;
+  }
+}
diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts
new file mode 100644
index 000000000..43a7f4e05
--- /dev/null
+++ b/agents/src/inference/interruption/http_transport.ts
@@ -0,0 +1,183 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
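A rough worked example of the backoff above (an illustrative sketch, not part of the patch): with the default retryInterval of 2000 ms, intervalForRetry doubles the base delay per attempt and adds up to 25% jitter.

import { intervalForRetry } from './defaults.js';

// attempt 0: 2000 * 2^0 = 2000 ms, plus 0-25% jitter => ~2000-2500 ms
// attempt 1: 2000 * 2^1 = 4000 ms, plus 0-25% jitter => ~4000-5000 ms
// attempt 2: 2000 * 2^2 = 8000 ms, plus 0-25% jitter => ~8000-10000 ms
for (let attempt = 0; attempt < 3; attempt++) {
  console.log(`retry ${attempt}: ~${Math.round(intervalForRetry(attempt))} ms`);
}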
+// +// SPDX-License-Identifier: Apache-2.0 +import { ofetch } from 'ofetch'; +import { TransformStream } from 'stream/web'; +import { z } from 'zod'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { intervalForRetry } from './defaults.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; + +export interface PostOptions { + baseUrl: string; + token: string; + signal?: AbortSignal; + timeout?: number; + maxRetries?: number; +} + +export interface PredictOptions { + threshold: number; + minFrames: number; +} + +export const predictEndpointResponseSchema = z.object({ + created_at: z.number(), + is_bargein: z.boolean(), + probabilities: z.array(z.number()), +}); + +export type PredictEndpointResponse = z.infer; + +export interface PredictResponse { + createdAt: number; + isBargein: boolean; + probabilities: number[]; + predictionDurationInS: number; +} + +export async function predictHTTP( + data: Int16Array, + predictOptions: PredictOptions, + options: PostOptions, +): Promise { + const createdAt = performance.now(); + const url = new URL(`/bargein`, options.baseUrl); + url.searchParams.append('threshold', predictOptions.threshold.toString()); + url.searchParams.append('min_frames', predictOptions.minFrames.toFixed()); + url.searchParams.append('created_at', createdAt.toFixed()); + + let retryCount = 0; + const response = await ofetch(url.toString(), { + retry: options.maxRetries ?? 3, + retryDelay: () => { + const delay = intervalForRetry(retryCount); + retryCount++; + return delay; + }, + headers: { + 'Content-Type': 'application/octet-stream', + Authorization: `Bearer ${options.token}`, + }, + signal: options.signal, + timeout: options.timeout, + method: 'POST', + body: data, + }); + const { created_at, is_bargein, probabilities } = predictEndpointResponseSchema.parse(response); + + return { + createdAt: created_at, + isBargein: is_bargein, + probabilities, + predictionDurationInS: (performance.now() - createdAt) / 1000, + }; +} + +export interface HttpTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface HttpTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +/** + * Creates an HTTP transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * Each audio slice triggers an HTTP POST request. + * + * @param options - Transport options object. This is read on each request, so mutations + * to threshold/minFrames will be picked up dynamically. 
+ */ +export function createHttpTransport( + options: HttpTransportOptions, + getState: () => HttpTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): TransformStream { + const logger = log(); + + return new TransformStream( + { + async transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + const resp = await predictHTTP( + chunk, + { threshold: options.threshold, minFrames: options.minFrames }, + { + baseUrl: options.baseUrl, + timeout: options.timeout, + maxRetries: options.maxRetries, + token: await createAccessToken(options.apiKey, options.apiSecret), + }, + ); + + const { createdAt, isBargein, probabilities, predictionDurationInS } = resp; + const entry = new InterruptionCacheEntry({ + createdAt, + probabilities, + isInterruption: isBargein, + speechInput: chunk, + totalDurationInS: (performance.now() - createdAt) / 1000, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + predictionDurationInS, + }); + state.cache.set(createdAt, entry); + + if (state.overlapSpeechStarted && entry.isInterruption) { + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + isInterruption: entry.isInterruption, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }; + logger.debug( + { + detectionDelayInS: entry.detectionDelayInS, + totalDurationInS: entry.totalDurationInS, + }, + 'interruption detected', + ); + setState({ overlapSpeechStarted: false }); + controller.enqueue(event); + } + } catch (err) { + logger.error({ err }, 'Failed to send audio data over HTTP'); + } + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); +} diff --git a/agents/src/inference/interruption/interruption_cache_entry.ts b/agents/src/inference/interruption/interruption_cache_entry.ts new file mode 100644 index 000000000..600e25da6 --- /dev/null +++ b/agents/src/inference/interruption/interruption_cache_entry.ts @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { estimateProbability } from './utils.js'; + +/** + * Typed cache entry for interruption inference results. + * Mutable to support setOrUpdate pattern from Python's _BoundedCache. + */ +export class InterruptionCacheEntry { + createdAt: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + speechInput?: Int16Array; + probabilities?: number[]; + isInterruption?: boolean; + + constructor(params: { + createdAt: number; + speechInput?: Int16Array; + totalDurationInS?: number; + predictionDurationInS?: number; + detectionDelayInS?: number; + probabilities?: number[]; + isInterruption?: boolean; + }) { + this.createdAt = params.createdAt; + this.totalDurationInS = params.totalDurationInS ?? 0; + this.predictionDurationInS = params.predictionDurationInS ?? 0; + this.detectionDelayInS = params.detectionDelayInS ?? 
0;
+    this.speechInput = params.speechInput;
+    this.probabilities = params.probabilities;
+    this.isInterruption = params.isInterruption;
+  }
+
+  /**
+   * The conservative estimated probability of the interruption event.
+   */
+  get probability(): number {
+    return this.probabilities ? estimateProbability(this.probabilities) : 0;
+  }
+
+  static default(): InterruptionCacheEntry {
+    return new InterruptionCacheEntry({ createdAt: 0 });
+  }
+}
diff --git a/agents/src/inference/interruption/interruption_detector.ts b/agents/src/inference/interruption/interruption_detector.ts
new file mode 100644
index 000000000..a5a457072
--- /dev/null
+++ b/agents/src/inference/interruption/interruption_detector.ts
@@ -0,0 +1,192 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import type { TypedEventEmitter } from '@livekit/typed-emitter';
+import EventEmitter from 'events';
+import { log } from '../../log.js';
+import {
+  DEFAULT_BASE_URL,
+  FRAMES_PER_SECOND,
+  SAMPLE_RATE,
+  STAGING_BASE_URL,
+  getDefaultInferenceUrl,
+  interruptionOptionDefaults,
+} from './defaults.js';
+import type { InterruptionDetectionError } from './errors.js';
+import { InterruptionStreamBase } from './interruption_stream.js';
+import type { InterruptionEvent, InterruptionOptions } from './types.js';
+
+type InterruptionCallbacks = {
+  user_interruption_detected: (event: InterruptionEvent) => void;
+  user_non_interruption_detected: (event: InterruptionEvent) => void;
+  error: (error: InterruptionDetectionError) => void;
+};
+
+export type AdaptiveInterruptionDetectorOptions = Omit<Partial<InterruptionOptions>, 'useProxy'>;
+
+export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter<InterruptionCallbacks>) {
+  options: InterruptionOptions;
+  private readonly _label: string;
+  private logger = log();
+  // Use Set instead of WeakSet to allow iteration for propagating option updates
+  private streams: Set<InterruptionStreamBase> = new Set();
+
+  constructor(options: AdaptiveInterruptionDetectorOptions = {}) {
+    super();
+
+    const {
+      maxAudioDurationInS,
+      baseUrl,
+      apiKey,
+      apiSecret,
+      audioPrefixDurationInS,
+      threshold,
+      detectionIntervalInS,
+      inferenceTimeout,
+      minInterruptionDurationInS,
+    } = { ...interruptionOptionDefaults, ...options };
+
+    if (maxAudioDurationInS > 3.0) {
+      throw new Error('maxAudioDurationInS must be less than or equal to 3.0 seconds');
+    }
+
+    const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? getDefaultInferenceUrl();
+    let lkApiKey = apiKey ?? '';
+    let lkApiSecret = apiSecret ?? '';
+    let useProxy: boolean;
+
+    // Use LiveKit credentials if using the inference service (production or staging)
+    const isInferenceUrl = lkBaseUrl === DEFAULT_BASE_URL || lkBaseUrl === STAGING_BASE_URL;
+    if (isInferenceUrl) {
+      lkApiKey =
+        apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? '';
+      if (!lkApiKey) {
+        throw new Error(
+          'apiKey is required, either as argument or set LIVEKIT_API_KEY environmental variable',
+        );
+      }
+
+      lkApiSecret =
+        apiSecret ??
+        process.env.LIVEKIT_INFERENCE_API_SECRET ??
+        process.env.LIVEKIT_API_SECRET ??
+ ''; + if (!lkApiSecret) { + throw new Error( + 'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable', + ); + } + useProxy = true; + } else { + useProxy = false; + } + + this.options = { + sampleRate: SAMPLE_RATE, + threshold, + minFrames: Math.ceil(minInterruptionDurationInS * FRAMES_PER_SECOND), + maxAudioDurationInS, + audioPrefixDurationInS, + detectionIntervalInS, + inferenceTimeout, + baseUrl: lkBaseUrl, + apiKey: lkApiKey, + apiSecret: lkApiSecret, + useProxy, + minInterruptionDurationInS, + }; + + this._label = `${this.constructor.name}`; + + this.logger.debug( + { + baseUrl: this.options.baseUrl, + detectionIntervalInS: this.options.detectionIntervalInS, + audioPrefixDurationInS: this.options.audioPrefixDurationInS, + maxAudioDurationInS: this.options.maxAudioDurationInS, + minFrames: this.options.minFrames, + threshold: this.options.threshold, + inferenceTimeout: this.options.inferenceTimeout, + useProxy: this.options.useProxy, + }, + 'adaptive interruption detector initialized', + ); + } + + /** + * The model identifier for this detector. + */ + get model(): string { + return 'adaptive interruption'; + } + + /** + * The provider identifier for this detector. + */ + get provider(): string { + return 'livekit'; + } + + /** + * The label for this detector instance. + */ + get label(): string { + return this._label; + } + + /** + * The sample rate used for audio processing. + */ + get sampleRate(): number { + return this.options.sampleRate; + } + + /** + * Emit an error event from the detector. + */ + emitError(error: InterruptionDetectionError): void { + this.emit('error', error); + } + + /** + * Creates a new InterruptionStreamBase for internal use. + * The stream can receive audio frames and sentinels via pushFrame(). + * Use this when you need direct access to the stream for pushing frames. + */ + createStream(): InterruptionStreamBase { + const streamBase = new InterruptionStreamBase(this, {}); + this.streams.add(streamBase); + return streamBase; + } + + /** + * Remove a stream from tracking (called when stream is closed). + */ + removeStream(stream: InterruptionStreamBase): void { + this.streams.delete(stream); + } + + /** + * Update options for the detector and propagate to all active streams. + * For WebSocket streams, this triggers a reconnection with new settings. + */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + } + + // Propagate option updates to all active streams (matching Python behavior) + const updatePromises: Promise[] = []; + for (const stream of this.streams) { + updatePromises.push(stream.updateOptions(options)); + } + await Promise.all(updatePromises); + } +} diff --git a/agents/src/inference/interruption/interruption_stream.ts b/agents/src/inference/interruption/interruption_stream.ts new file mode 100644 index 000000000..ef0a22e89 --- /dev/null +++ b/agents/src/inference/interruption/interruption_stream.ts @@ -0,0 +1,424 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
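A minimal usage sketch for the detector (not part of the patch; it assumes LIVEKIT_API_KEY / LIVEKIT_API_SECRET are set, since the default baseUrl targets the LiveKit inference gateway):

import { AdaptiveInterruptionDetector } from './interruption_detector.js';

const detector = new AdaptiveInterruptionDetector({ threshold: 0.7 });

// Events are emitted through the detector by the streams it creates.
detector.on('user_interruption_detected', (ev) => {
  console.log('interruption', ev.probability, ev.detectionDelayInS);
});
detector.on('user_non_interruption_detected', (ev) => {
  console.log('overlap ended without interruption', ev.probability);
});

const stream = detector.createStream(); // push AudioFrames and sentinels into this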
+// +// SPDX-License-Identifier: Apache-2.0 +import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; +import type { Span } from '@opentelemetry/api'; +import { type ReadableStream, TransformStream } from 'stream/web'; +import { log } from '../../log.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { traceTypes } from '../../telemetry/index.js'; +import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; +import type { InterruptionDetectionError } from './errors.js'; +import { createHttpTransport } from './http_transport.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import type { AdaptiveInterruptionDetector } from './interruption_detector.js'; +import { + type AgentSpeechEnded, + type AgentSpeechStarted, + type ApiConnectOptions, + type Flush, + type InterruptionEvent, + InterruptionEventType, + type InterruptionOptions, + type InterruptionSentinel, + type OverlapSpeechEnded, + type OverlapSpeechStarted, +} from './types.js'; +import { BoundedCache } from './utils.js'; +import { createWsTransport } from './ws_transport.js'; + +// Re-export sentinel types for backwards compatibility +export type { + AgentSpeechEnded, + AgentSpeechStarted, + ApiConnectOptions, + Flush, + InterruptionSentinel, + OverlapSpeechEnded, + OverlapSpeechStarted, +}; + +export class InterruptionStreamSentinel { + static agentSpeechStarted(): AgentSpeechStarted { + return { type: 'agent-speech-started' }; + } + + static agentSpeechEnded(): AgentSpeechEnded { + return { type: 'agent-speech-ended' }; + } + + static overlapSpeechStarted( + speechDurationInS: number, + userSpeakingSpan?: Span, + ): OverlapSpeechStarted { + return { type: 'overlap-speech-started', speechDurationInS, userSpeakingSpan }; + } + + static overlapSpeechEnded(): OverlapSpeechEnded { + return { type: 'overlap-speech-ended' }; + } + + static flush(): Flush { + return { type: 'flush' }; + } +} + +function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { + span.setAttribute( + traceTypes.ATTR_IS_INTERRUPTION, + (entry.isInterruption ?? 
false).toString().toLowerCase(), + ); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelayInS); +} + +export class InterruptionStreamBase { + private inputStream: StreamChannel; + + private eventStream: ReadableStream; + + private resampler?: AudioResampler; + + private userSpeakingSpan: Span | undefined; + + private overlapSpeechStartedAt: number | undefined; + + private options: InterruptionOptions; + + private apiOptions: ApiConnectOptions; + + private model: AdaptiveInterruptionDetector; + + private logger = log(); + + // Store reconnect function for WebSocket transport + private wsReconnect?: () => Promise; + + // Mutable transport options that can be updated via updateOptions() + private transportOptions: { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries: number; + }; + + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { + this.inputStream = createStreamChannel< + InterruptionSentinel | AudioFrame, + InterruptionDetectionError + >(); + + this.model = model; + this.options = { ...model.options }; + this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + + // Initialize mutable transport options + this.transportOptions = { + baseUrl: this.options.baseUrl, + apiKey: this.options.apiKey, + apiSecret: this.options.apiSecret, + sampleRate: this.options.sampleRate, + threshold: this.options.threshold, + minFrames: this.options.minFrames, + timeout: this.options.inferenceTimeout, + maxRetries: this.apiOptions.maxRetries, + }; + + this.eventStream = this.setupTransform(); + } + + /** + * Update stream options. For WebSocket transport, this triggers a reconnection. 
+ */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + this.transportOptions.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + this.transportOptions.minFrames = this.options.minFrames; + } + // Trigger WebSocket reconnection if using proxy (WebSocket transport) + if (this.options.useProxy && this.wsReconnect) { + await this.wsReconnect(); + } + } + + private setupTransform(): ReadableStream { + let agentSpeechStarted = false; + let startIdx = 0; + let accumulatedSamples = 0; + let overlapSpeechStarted = false; + // Use BoundedCache with max_len=10 to prevent unbounded memory growth + const cache = new BoundedCache(10); + const inferenceS16Data = new Int16Array( + Math.ceil(this.options.maxAudioDurationInS * this.options.sampleRate), + ).fill(0); + + // State accessors for transport + const getState = () => ({ + overlapSpeechStarted, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + cache, + }); + const setState = (partial: { overlapSpeechStarted?: boolean }) => { + if (partial.overlapSpeechStarted !== undefined) { + overlapSpeechStarted = partial.overlapSpeechStarted; + } + }; + const handleSpanUpdate = (entry: InterruptionCacheEntry) => { + if (this.userSpeakingSpan) { + updateUserSpeakingSpan(this.userSpeakingSpan, entry); + this.userSpeakingSpan = undefined; + } + }; + + // First transform: process input frames/sentinels and output audio slices or events + const audioTransformer = new TransformStream< + InterruptionSentinel | AudioFrame, + Int16Array | InterruptionEvent + >( + { + transform: (chunk, controller) => { + if (chunk instanceof AudioFrame) { + if (!agentSpeechStarted) { + return; + } + if (this.options.sampleRate !== chunk.sampleRate) { + controller.error('the sample rate of the input frames must be consistent'); + return; + } + const result = writeToInferenceS16Data( + chunk, + startIdx, + inferenceS16Data, + this.options.maxAudioDurationInS, + ); + startIdx = result.startIdx; + accumulatedSamples += result.samplesWritten; + + // Send data for inference when enough samples accumulated during overlap + if ( + accumulatedSamples >= + Math.floor(this.options.detectionIntervalInS * this.options.sampleRate) && + overlapSpeechStarted + ) { + // Send a copy of the audio data up to startIdx for inference + const audioSlice = inferenceS16Data.slice(0, startIdx); + accumulatedSamples = 0; + controller.enqueue(audioSlice); + } + } else if (chunk.type === 'agent-speech-started') { + this.logger.debug('agent speech started'); + agentSpeechStarted = true; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'agent-speech-ended') { + this.logger.debug('agent speech ended'); + agentSpeechStarted = false; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { + this.userSpeakingSpan = chunk.userSpeakingSpan; + this.logger.debug('overlap speech started, starting interruption inference'); + overlapSpeechStarted = true; + accumulatedSamples = 0; + // Include both speech duration and audio prefix duration for context + const shiftSize = Math.min( + 
startIdx, + Math.round(chunk.speechDurationInS * this.options.sampleRate) + + Math.round(this.options.audioPrefixDurationInS * this.options.sampleRate), + ); + // Shift the buffer: copy the last `shiftSize` samples before startIdx + // to the beginning of the buffer. This preserves recent audio context + // (the user's speech that occurred just before overlap was detected). + inferenceS16Data.copyWithin(0, startIdx - shiftSize, startIdx); + startIdx = shiftSize; + cache.clear(); + } else if (chunk.type === 'overlap-speech-ended') { + this.logger.debug('overlap speech ended'); + if (overlapSpeechStarted) { + this.userSpeakingSpan = undefined; + // Use pop with predicate to get only completed requests (matching Python behavior) + // This ensures we don't return incomplete/in-flight requests as the "final" result + let latestEntry = cache.pop( + (entry) => entry.totalDurationInS !== undefined && entry.totalDurationInS > 0, + ); + if (!latestEntry) { + this.logger.debug('no request made for overlap speech'); + latestEntry = InterruptionCacheEntry.default(); + } + const event: InterruptionEvent = { + type: InterruptionEventType.OVERLAP_SPEECH_ENDED, + timestamp: Date.now(), + isInterruption: false, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + speechInput: latestEntry.speechInput, + probabilities: latestEntry.probabilities, + totalDurationInS: latestEntry.totalDurationInS, + detectionDelayInS: latestEntry.detectionDelayInS, + predictionDurationInS: latestEntry.predictionDurationInS, + probability: latestEntry.probability, + }; + controller.enqueue(event); + overlapSpeechStarted = false; + } + } else if (chunk.type === 'flush') { + // no-op + } + }, + }, + { highWaterMark: 32 }, + { highWaterMark: 32 }, + ); + + // Second transform: transport layer (HTTP or WebSocket based on useProxy) + const transportOptions = this.transportOptions; + + let transport: TransformStream; + if (this.options.useProxy) { + const wsResult = createWsTransport(transportOptions, getState, setState, handleSpanUpdate); + transport = wsResult.transport; + this.wsReconnect = wsResult.reconnect; + } else { + transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + } + + const eventEmitter = new TransformStream({ + transform: (chunk, controller) => { + if (chunk.type === InterruptionEventType.INTERRUPTION) { + this.model.emit('user_interruption_detected', chunk); + } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { + this.model.emit('user_non_interruption_detected', chunk); + } + controller.enqueue(chunk); + }, + }); + + // Pipeline: input -> audioTransformer -> transport -> eventStream + return this.inputStream + .stream() + .pipeThrough(audioTransformer) + .pipeThrough(transport) + .pipeThrough(eventEmitter); + } + + private ensureInputNotEnded() { + if (this.inputStream.closed) { + throw new Error('input stream is closed'); + } + } + + private ensureStreamsNotEnded() { + this.ensureInputNotEnded(); + } + + private getResamplerFor(inputSampleRate: number): AudioResampler { + if (!this.resampler) { + this.resampler = new AudioResampler(inputSampleRate, this.options.sampleRate); + } + return this.resampler; + } + + stream(): ReadableStream { + return this.eventStream; + } + + async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { + this.ensureStreamsNotEnded(); + if (!(frame instanceof AudioFrame)) { + if (frame.type === 'overlap-speech-started') { + this.overlapSpeechStartedAt = Date.now() - frame.speechDurationInS * 1000; + } + return 
this.inputStream.write(frame); + } else if (this.options.sampleRate !== frame.sampleRate) { + const resampler = this.getResamplerFor(frame.sampleRate); + if (resampler.inputRate !== frame.sampleRate) { + throw new Error('the sample rate of the input frames must be consistent'); + } + for (const resampledFrame of resampler.push(frame)) { + await this.inputStream.write(resampledFrame); + } + } else { + await this.inputStream.write(frame); + } + } + + async flush(): Promise { + this.ensureStreamsNotEnded(); + await this.inputStream.write(InterruptionStreamSentinel.flush()); + } + + async endInput(): Promise { + await this.flush(); + await this.inputStream.close(); + } + + async close(): Promise { + if (!this.inputStream.closed) await this.inputStream.close(); + this.model.removeStream(this); + } +} + +/** + * Write the audio frame to the output data array and return the new start index + * and the number of samples written. + */ +function writeToInferenceS16Data( + frame: AudioFrame, + startIdx: number, + outData: Int16Array, + maxAudioDuration: number, +): { startIdx: number; samplesWritten: number } { + const maxWindowSize = Math.floor(maxAudioDuration * frame.sampleRate); + + if (frame.samplesPerChannel > outData.length) { + throw new Error('frame samples are greater than the max window size'); + } + + // Shift the data to the left if the window would overflow + const shift = startIdx + frame.samplesPerChannel - maxWindowSize; + if (shift > 0) { + outData.copyWithin(0, shift, startIdx); + startIdx -= shift; + } + + // Get the frame data as Int16Array + const frameData = new Int16Array( + frame.data.buffer, + frame.data.byteOffset, + frame.samplesPerChannel * frame.channels, + ); + + if (frame.channels > 1) { + // Mix down multiple channels to mono by averaging + for (let i = 0; i < frame.samplesPerChannel; i++) { + let sum = 0; + for (let ch = 0; ch < frame.channels; ch++) { + sum += frameData[i * frame.channels + ch] ?? 0; + } + outData[startIdx + i] = Math.floor(sum / frame.channels); + } + } else { + // Single channel - copy directly + outData.set(frameData, startIdx); + } + + startIdx += frame.samplesPerChannel; + return { startIdx, samplesWritten: frame.samplesPerChannel }; +} diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts new file mode 100644 index 000000000..85c771646 --- /dev/null +++ b/agents/src/inference/interruption/types.ts @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { Span } from '@opentelemetry/api'; + +/** + * Event types for interruption detection. + */ +export enum InterruptionEventType { + INTERRUPTION = 'interruption', + OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', +} + +/** + * Event emitted when an interruption is detected or overlap speech ends. + */ +export interface InterruptionEvent { + type: InterruptionEventType; + timestamp: number; + isInterruption: boolean; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + overlapSpeechStartedAt?: number; + speechInput?: Int16Array; + probabilities?: number[]; + probability: number; +} + +/** + * Configuration options for interruption detection. 
+ */ +export interface InterruptionOptions { + sampleRate: number; + threshold: number; + minFrames: number; + maxAudioDurationInS: number; + audioPrefixDurationInS: number; + detectionIntervalInS: number; + inferenceTimeout: number; + minInterruptionDurationInS: number; + baseUrl: string; + apiKey: string; + apiSecret: string; + useProxy: boolean; +} + +/** + * API connection options for transport layers. + */ +export interface ApiConnectOptions { + maxRetries: number; + retryInterval: number; + timeout: number; +} + +// Sentinel types for stream control signals + +export interface AgentSpeechStarted { + type: 'agent-speech-started'; +} + +export interface AgentSpeechEnded { + type: 'agent-speech-ended'; +} + +export interface OverlapSpeechStarted { + type: 'overlap-speech-started'; + speechDurationInS: number; + userSpeakingSpan?: Span; +} + +export interface OverlapSpeechEnded { + type: 'overlap-speech-ended'; +} + +export interface Flush { + type: 'flush'; +} + +/** + * Union type for all stream control signals. + */ +export type InterruptionSentinel = + | AgentSpeechStarted + | AgentSpeechEnded + | OverlapSpeechStarted + | OverlapSpeechEnded + | Flush; diff --git a/agents/src/inference/interruption/utils.test.ts b/agents/src/inference/interruption/utils.test.ts new file mode 100644 index 000000000..762bc5ea3 --- /dev/null +++ b/agents/src/inference/interruption/utils.test.ts @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { slidingWindowMinMax } from './utils.js'; + +describe('slidingWindowMinMax', () => { + it('returns -Infinity when array is shorter than window size', () => { + expect(slidingWindowMinMax([0.5, 0.6], 3)).toBe(-Infinity); + expect(slidingWindowMinMax([], 1)).toBe(-Infinity); + }); + + it('returns the max value when window size is 1', () => { + // With window size 1, min of each window is the element itself, + // so max of mins is just the max of the array + expect(slidingWindowMinMax([0.1, 0.5, 0.3, 0.8, 0.2], 1)).toBe(0.8); + }); + + it('finds the best sustained probability across windows', () => { + // Windows of size 3: [0.2, 0.8, 0.7], [0.8, 0.7, 0.3], [0.7, 0.3, 0.9] + // Mins: 0.2, 0.3, 0.3 + // Max of mins: 0.3 + expect(slidingWindowMinMax([0.2, 0.8, 0.7, 0.3, 0.9], 3)).toBe(0.3); + }); + + it('returns the single element when array length equals window size', () => { + // Only one window covering the entire array, return min of that window + expect(slidingWindowMinMax([0.5, 0.9, 0.7], 3)).toBe(0.5); + expect(slidingWindowMinMax([0.8], 1)).toBe(0.8); + }); +}); diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts new file mode 100644 index 000000000..bd89dd512 --- /dev/null +++ b/agents/src/inference/interruption/utils.ts @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; + +/** + * A bounded cache that automatically evicts the oldest entries when the cache exceeds max size. + * Uses FIFO eviction strategy. 
+ */
+export class BoundedCache<K, V> {
+  private cache: Map<K, V> = new Map();
+  private readonly maxLen: number;
+
+  constructor(maxLen: number = 10) {
+    this.maxLen = maxLen;
+  }
+
+  set(key: K, value: V): void {
+    this.cache.set(key, value);
+    if (this.cache.size > this.maxLen) {
+      // Remove the oldest entry (first inserted)
+      const firstKey = this.cache.keys().next().value as K;
+      this.cache.delete(firstKey);
+    }
+  }
+
+  get(key: K): V | undefined {
+    return this.cache.get(key);
+  }
+
+  has(key: K): boolean {
+    return this.cache.has(key);
+  }
+
+  delete(key: K): boolean {
+    return this.cache.delete(key);
+  }
+
+  /**
+   * Pop the last entry that matches the predicate, or return undefined.
+   * Only removes and returns the matching entry, preserving others.
+   */
+  pop(predicate?: (value: V) => boolean): V | undefined {
+    if (predicate === undefined) {
+      // Pop the last (most recent) entry
+      const keys = Array.from(this.cache.keys());
+      if (keys.length === 0) return undefined;
+      const lastKey = keys[keys.length - 1]!;
+      const value = this.cache.get(lastKey);
+      this.cache.delete(lastKey);
+      return value;
+    }
+
+    // Find the last entry matching the predicate (iterating in reverse)
+    const keys = Array.from(this.cache.keys());
+    for (let i = keys.length - 1; i >= 0; i--) {
+      const key = keys[i]!;
+      const value = this.cache.get(key)!;
+      if (predicate(value)) {
+        this.cache.delete(key);
+        return value;
+      }
+    }
+    return undefined;
+  }
+
+  clear(): void {
+    this.cache.clear();
+  }
+
+  get size(): number {
+    return this.cache.size;
+  }
+
+  values(): IterableIterator<V> {
+    return this.cache.values();
+  }
+
+  keys(): IterableIterator<K> {
+    return this.cache.keys();
+  }
+
+  entries(): IterableIterator<[K, V]> {
+    return this.cache.entries();
+  }
+}
+
+/**
+ * Estimate probability using sliding window min-max algorithm.
+ * Returns a conservative estimate based on the minimum window size.
+ */
+export function estimateProbability(
+  probabilities: number[],
+  windowSizeInS: number = MIN_INTERRUPTION_DURATION_IN_S,
+): number {
+  const minWindow = Math.ceil(windowSizeInS / FRAME_DURATION_IN_S);
+  if (probabilities.length < minWindow) {
+    return 0;
+  }
+
+  return slidingWindowMinMax(probabilities, minWindow);
+}
+
+export function slidingWindowMinMax(probabilities: number[], minWindow: number): number {
+  if (probabilities.length < minWindow) {
+    return -Infinity;
+  }
+
+  let maxOfMins = -Infinity;
+
+  for (let i = 0; i <= probabilities.length - minWindow; i++) {
+    const windowMin = Math.min(...probabilities.slice(i, i + minWindow));
+    maxOfMins = Math.max(maxOfMins, windowMin);
+  }
+
+  return maxOfMins;
+}
diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts
new file mode 100644
index 000000000..edf6d81a9
--- /dev/null
+++ b/agents/src/inference/interruption/ws_transport.ts
@@ -0,0 +1,354 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
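To make the conservative scoring concrete (a sketch, not part of the patch; the 3-frame case matches utils.test.ts, the 2-frame case is worked out here from the defaults):

import { estimateProbability, slidingWindowMinMax } from './utils.js';

// Default window: MIN_INTERRUPTION_DURATION_IN_S / FRAME_DURATION_IN_S = 0.05 / 0.025 = 2 frames.
// Window minima of [0.2, 0.8, 0.7, 0.3, 0.9] are [0.2, 0.7, 0.3, 0.3]; the max of those is 0.7.
estimateProbability([0.2, 0.8, 0.7, 0.3, 0.9]); // => 0.7 (from the sustained [0.8, 0.7] window)

// With a 3-frame window the same trace scores lower, as asserted in utils.test.ts:
slidingWindowMinMax([0.2, 0.8, 0.7, 0.3, 0.9], 3); // => 0.3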
+// +// SPDX-License-Identifier: Apache-2.0 +import { TransformStream } from 'stream/web'; +import WebSocket from 'ws'; +import { z } from 'zod'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { intervalForRetry } from './defaults.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; + +// WebSocket message types +const MSG_SESSION_CREATE = 'session.create'; +const MSG_SESSION_CLOSE = 'session.close'; +const MSG_SESSION_CREATED = 'session.created'; +const MSG_SESSION_CLOSED = 'session.closed'; +const MSG_INTERRUPTION_DETECTED = 'bargein_detected'; +const MSG_INFERENCE_DONE = 'inference_done'; +const MSG_ERROR = 'error'; + +export interface WsTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface WsTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +const wsMessageSchema = z.union([ + z.object({ + type: z.literal(MSG_SESSION_CREATED).or(z.literal(MSG_SESSION_CLOSED)), + }), + z.object({ + type: z.literal(MSG_INTERRUPTION_DETECTED).or(z.literal(MSG_INFERENCE_DONE)), + created_at: z.number().optional(), + probabilities: z.array(z.number()).optional(), + prediction_duration: z.number().optional(), + is_bargein: z.boolean().optional(), + }), + z.object({ + type: z.literal('error'), + message: z.string(), + }), +]); + +type WsMessage = z.infer; + +/** + * Creates a WebSocket connection and waits for it to open. + */ +async function connectWebSocket(options: WsTransportOptions): Promise { + const baseUrl = options.baseUrl.replace(/^http/, 'ws'); + const token = await createAccessToken(options.apiKey, options.apiSecret); + const url = `${baseUrl}/bargein`; + + const ws = new WebSocket(url, { + headers: { Authorization: `Bearer ${token}` }, + }); + + await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + ws.terminate(); + reject(new Error('WebSocket connection timeout')); + }, options.timeout); + ws.once('open', () => { + clearTimeout(timeout); + resolve(); + }); + ws.once('error', (err: Error) => { + clearTimeout(timeout); + ws.terminate(); + reject(err); + }); + }); + + return ws; +} + +export interface WsTransportResult { + transport: TransformStream; + reconnect: () => Promise; +} + +/** + * Creates a WebSocket transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * It maintains a persistent WebSocket connection with automatic retry on failure. + * Returns both the transport and a reconnect function for option updates. 
+ */ +export function createWsTransport( + options: WsTransportOptions, + getState: () => WsTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): WsTransportResult { + const logger = log(); + let ws: WebSocket | null = null; + let outputController: TransformStreamDefaultController | null = null; + + function setupMessageHandler(socket: WebSocket): void { + socket.on('message', (data: WebSocket.Data) => { + try { + const message = wsMessageSchema.parse(JSON.parse(data.toString())); + handleMessage(message); + } catch { + logger.warn({ data: data.toString() }, 'Failed to parse WebSocket message'); + } + }); + + socket.on('error', (err: Error) => { + logger.error({ err }, 'WebSocket error'); + }); + + socket.on('close', (code: number, reason: Buffer) => { + logger.debug({ code, reason: reason.toString() }, 'WebSocket closed'); + }); + } + + async function ensureConnection(): Promise { + if (ws && ws.readyState === WebSocket.OPEN) return; + + const maxRetries = options.maxRetries ?? 3; + let lastError: Error | null = null; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + ws = await connectWebSocket(options); + setupMessageHandler(ws); + + // Send session.create message + const sessionCreateMsg = JSON.stringify({ + type: MSG_SESSION_CREATE, + settings: { + sample_rate: options.sampleRate, + num_channels: 1, + threshold: options.threshold, + min_frames: options.minFrames, + encoding: 's16le', + }, + }); + ws.send(sessionCreateMsg); + return; + } catch (err) { + lastError = err instanceof Error ? err : new Error(String(err)); + if (attempt < maxRetries) { + const delay = intervalForRetry(attempt); + logger.debug( + { attempt, delay, err: lastError.message }, + 'WebSocket connection failed, retrying', + ); + await new Promise((resolve) => setTimeout(resolve, delay)); + } + } + } + + throw lastError ?? new Error('Failed to connect to WebSocket after retries'); + } + + function handleMessage(message: WsMessage): void { + const state = getState(); + + switch (message.type) { + case MSG_SESSION_CREATED: + logger.debug('WebSocket session created'); + break; + + case MSG_INTERRUPTION_DETECTED: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStarted && state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDurationInS: (performance.now() - createdAt) / 1000, + probabilities: message.probabilities, + isInterruption: true, + predictionDurationInS: message.prediction_duration ?? 
0, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + }); + state.cache.set(createdAt, entry); + + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + + logger.debug( + { + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }, + 'interruption detected', + ); + + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + isInterruption: true, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + }; + + outputController?.enqueue(event); + setState({ overlapSpeechStarted: false }); + } + break; + } + + case MSG_INFERENCE_DONE: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDurationInS: (performance.now() - createdAt) / 1000, + predictionDurationInS: message.prediction_duration ?? 0, + probabilities: message.probabilities, + isInterruption: message.is_bargein ?? false, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + }); + state.cache.set(createdAt, entry); + + logger.debug( + { + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + }, + 'interruption inference done', + ); + } + break; + } + + case MSG_SESSION_CLOSED: + logger.debug('WebSocket session closed'); + break; + + case MSG_ERROR: + outputController?.error(new Error(`LiveKit Interruption error: ${message.message}`)); + break; + } + } + + function sendAudioData(audioSlice: Int16Array): void { + if (!ws || ws.readyState !== WebSocket.OPEN) { + throw new Error('WebSocket not connected'); + } + + const state = getState(); + // Use truncated timestamp consistently for both cache key and header + // This ensures the server's response created_at matches our cache key + const createdAt = Math.floor(performance.now()); + + // Store the audio data in cache with truncated timestamp + state.cache.set(createdAt, new InterruptionCacheEntry({ createdAt, speechInput: audioSlice })); + + // Create header: 8-byte little-endian uint64 timestamp (milliseconds as integer) + const header = new ArrayBuffer(8); + const view = new DataView(header); + view.setUint32(0, createdAt >>> 0, true); + view.setUint32(4, Math.floor(createdAt / 0x100000000) >>> 0, true); + + // Combine header and audio data + const audioBytes = new Uint8Array( + audioSlice.buffer, + audioSlice.byteOffset, + audioSlice.byteLength, + ); + const combined = new Uint8Array(8 + audioBytes.length); + combined.set(new Uint8Array(header), 0); + combined.set(audioBytes, 8); + + try { + ws.send(combined); + } catch (e: unknown) { + logger.error(e, `failed to send audio via websocket`); + } + } + + function close(): void { + if (ws?.readyState === WebSocket.OPEN) { + const closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); + try { + ws.send(closeMsg); + } catch (e: unknown) { + logger.error(e, 'failed to send close message'); + } + } + ws?.close(1000); // signal normal websocket closure + ws = null; + } + + /** + * Reconnect the WebSocket with updated options. 
+ * This is called when options are updated via updateOptions(). + */ + async function reconnect(): Promise { + close(); + // Connection will be re-established on next ensureConnection call + } + + const transport = new TransformStream( + { + async start(controller) { + outputController = controller; + await ensureConnection(); + }, + + transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + sendAudioData(chunk); + } catch (err) { + logger.error({ err }, 'Failed to send audio data over WebSocket'); + } + }, + + flush() { + close(); + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); + + return { transport, reconnect }; +} diff --git a/agents/src/llm/llm.ts b/agents/src/llm/llm.ts index 0ab158e6b..40055bd5c 100644 --- a/agents/src/llm/llm.ts +++ b/agents/src/llm/llm.ts @@ -65,6 +65,18 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter { } return (usage?.completionTokens || 0) / (durationMs / 1000); })(), + metadata: { + modelProvider: this.#llm.provider, + modelName: this.#llm.model, + }, }; if (this.#llmRequestSpan) { diff --git a/agents/src/llm/realtime.ts b/agents/src/llm/realtime.ts index b1758eaf7..d02d86dab 100644 --- a/agents/src/llm/realtime.ts +++ b/agents/src/llm/realtime.ts @@ -68,6 +68,10 @@ export abstract class RealtimeModel { /** The model name/identifier used by this realtime model */ abstract get model(): string; + get provider(): string { + return 'unknown'; + } + abstract session(): RealtimeSession; abstract close(): Promise; diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts index 7f6d6a0cc..3c533b949 100644 --- a/agents/src/metrics/base.ts +++ b/agents/src/metrics/base.ts @@ -2,6 +2,13 @@ // // SPDX-License-Identifier: Apache-2.0 +export type MetricsMetadata = { + /** The provider name (e.g., 'openai', 'anthropic'). */ + modelProvider?: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + modelName?: string; +}; + export type AgentMetrics = | STTMetrics | LLMMetrics @@ -26,6 +33,8 @@ export type LLMMetrics = { totalTokens: number; tokensPerSecond: number; speechId?: string; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type STTMetrics = { @@ -41,10 +50,16 @@ export type STTMetrics = { * The duration of the pushed audio in milliseconds. */ audioDurationMs: number; + /** Input audio tokens (for token-based billing). */ + inputTokens?: number; + /** Output text tokens (for token-based billing). */ + outputTokens?: number; /** * Whether the STT is streaming (e.g using websocket). */ streamed: boolean; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type TTSMetrics = { @@ -59,10 +74,17 @@ export type TTSMetrics = { /** Generated audio duration in milliseconds. */ audioDurationMs: number; cancelled: boolean; + /** Number of characters synthesized (for character-based billing). */ charactersCount: number; + /** Input text tokens (for token-based billing, e.g., OpenAI TTS). */ + inputTokens?: number; + /** Output audio tokens (for token-based billing, e.g., OpenAI TTS). */ + outputTokens?: number; streamed: boolean; segmentId?: string; speechId?: string; + /** Metadata for model provider and name tracking. 
*/ + metadata?: MetricsMetadata; }; export type VADMetrics = { @@ -133,6 +155,10 @@ export type RealtimeModelMetrics = { * The duration of the response from created to done in milliseconds. */ durationMs: number; + /** + * The duration of the session connection in milliseconds (for session-based billing like xAI). + */ + sessionDurationMs?: number; /** * Time to first audio token in milliseconds. -1 if no audio token was sent. */ @@ -165,4 +191,6 @@ export type RealtimeModelMetrics = { * Details about the output tokens used in the Response. */ outputTokenDetails: RealtimeModelMetricsOutputTokenDetails; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; diff --git a/agents/src/metrics/index.ts b/agents/src/metrics/index.ts index f400a9638..c83a9fbff 100644 --- a/agents/src/metrics/index.ts +++ b/agents/src/metrics/index.ts @@ -6,10 +6,19 @@ export type { AgentMetrics, EOUMetrics, LLMMetrics, + MetricsMetadata, RealtimeModelMetrics, STTMetrics, TTSMetrics, VADMetrics, } from './base.js'; +export { + filterZeroValues, + ModelUsageCollector, + type LLMModelUsage, + type ModelUsage, + type STTModelUsage, + type TTSModelUsage, +} from './model_usage.js'; export { UsageCollector, type UsageSummary } from './usage_collector.js'; export { logMetrics } from './utils.js'; diff --git a/agents/src/metrics/model_usage.test.ts b/agents/src/metrics/model_usage.test.ts new file mode 100644 index 000000000..d2f983beb --- /dev/null +++ b/agents/src/metrics/model_usage.test.ts @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { beforeEach, describe, expect, it } from 'vitest'; +import type { LLMMetrics, RealtimeModelMetrics, STTMetrics, TTSMetrics } from './base.js'; +import { + type LLMModelUsage, + ModelUsageCollector, + type STTModelUsage, + type TTSModelUsage, + filterZeroValues, +} from './model_usage.js'; + +describe('model_usage', () => { + describe('filterZeroValues', () => { + it('should filter out zero values from LLMModelUsage', () => { + const usage: LLMModelUsage = { + type: 'llm_usage', + provider: 'openai', + model: 'gpt-4o', + inputTokens: 100, + inputCachedTokens: 0, + inputAudioTokens: 0, + inputCachedAudioTokens: 0, + inputTextTokens: 0, + inputCachedTextTokens: 0, + inputImageTokens: 0, + inputCachedImageTokens: 0, + outputTokens: 50, + outputAudioTokens: 0, + outputTextTokens: 0, + sessionDurationMs: 0, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('llm_usage'); + expect(filtered.provider).toBe('openai'); + expect(filtered.model).toBe('gpt-4o'); + expect(filtered.inputTokens).toBe(100); + expect(filtered.outputTokens).toBe(50); + // Zero values should be filtered out + expect(filtered.inputCachedTokens).toBeUndefined(); + expect(filtered.inputAudioTokens).toBeUndefined(); + expect(filtered.sessionDurationMs).toBeUndefined(); + }); + + it('should filter out zero values from TTSModelUsage', () => { + const usage: TTSModelUsage = { + type: 'tts_usage', + provider: 'elevenlabs', + model: 'eleven_turbo_v2', + inputTokens: 0, + outputTokens: 0, + charactersCount: 500, + audioDurationMs: 3000, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('tts_usage'); + expect(filtered.provider).toBe('elevenlabs'); + expect(filtered.charactersCount).toBe(500); + expect(filtered.audioDurationMs).toBe(3000); + expect(filtered.inputTokens).toBeUndefined(); + expect(filtered.outputTokens).toBeUndefined(); + }); + + it('should keep all values 
when none are zero', () => { + const usage: STTModelUsage = { + type: 'stt_usage', + provider: 'deepgram', + model: 'nova-2', + inputTokens: 10, + outputTokens: 20, + audioDurationMs: 5000, + }; + + const filtered = filterZeroValues(usage); + + expect(Object.keys(filtered)).toHaveLength(6); + expect(filtered).toEqual(usage); + }); + }); + + describe('ModelUsageCollector', () => { + let collector: ModelUsageCollector; + + beforeEach(() => { + collector = new ModelUsageCollector(); + }); + + describe('collect LLM metrics', () => { + it('should aggregate LLM metrics by provider and model', () => { + const metrics1: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 50, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const metrics2: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 150, + ttftMs: 60, + cancelled: false, + completionTokens: 150, + promptTokens: 300, + promptCachedTokens: 75, + totalTokens: 450, + tokensPerSecond: 12, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o'); + expect(llmUsage.inputTokens).toBe(500); // 200 + 300 + expect(llmUsage.inputCachedTokens).toBe(125); // 50 + 75 + expect(llmUsage.outputTokens).toBe(250); // 100 + 150 + }); + + it('should separate metrics by different providers', () => { + const openaiMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const anthropicMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 120, + ttftMs: 55, + cancelled: false, + completionTokens: 80, + promptTokens: 150, + promptCachedTokens: 0, + totalTokens: 230, + tokensPerSecond: 8, + metadata: { + modelProvider: 'anthropic', + modelName: 'claude-3-5-sonnet', + }, + }; + + collector.collect(openaiMetrics); + collector.collect(anthropicMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(2); + + const openaiUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'openai', + ) as LLMModelUsage; + const anthropicUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'anthropic', + ) as LLMModelUsage; + + expect(openaiUsage.inputTokens).toBe(200); + expect(openaiUsage.outputTokens).toBe(100); + expect(anthropicUsage.inputTokens).toBe(150); + expect(anthropicUsage.outputTokens).toBe(80); + }); + }); + + describe('collect TTS metrics', () => { + it('should aggregate TTS metrics by provider and model', () => { + const metrics1: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + 
inputTokens: 10, + outputTokens: 20, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const metrics2: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 120, + durationMs: 600, + audioDurationMs: 4000, + cancelled: false, + charactersCount: 200, + inputTokens: 15, + outputTokens: 25, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const ttsUsage = usage[0] as TTSModelUsage; + expect(ttsUsage.type).toBe('tts_usage'); + expect(ttsUsage.provider).toBe('elevenlabs'); + expect(ttsUsage.model).toBe('eleven_turbo_v2'); + expect(ttsUsage.charactersCount).toBe(300); // 100 + 200 + expect(ttsUsage.audioDurationMs).toBe(7000); // 3000 + 4000 + expect(ttsUsage.inputTokens).toBe(25); // 10 + 15 + expect(ttsUsage.outputTokens).toBe(45); // 20 + 25 + }); + }); + + describe('collect STT metrics', () => { + it('should aggregate STT metrics by provider and model', () => { + const metrics1: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + inputTokens: 50, + outputTokens: 100, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + const metrics2: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 3000, + inputTokens: 30, + outputTokens: 60, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const sttUsage = usage[0] as STTModelUsage; + expect(sttUsage.type).toBe('stt_usage'); + expect(sttUsage.provider).toBe('deepgram'); + expect(sttUsage.model).toBe('nova-2'); + expect(sttUsage.audioDurationMs).toBe(8000); // 5000 + 3000 + expect(sttUsage.inputTokens).toBe(80); // 50 + 30 + expect(sttUsage.outputTokens).toBe(160); // 100 + 60 + }); + }); + + describe('collect realtime model metrics', () => { + it('should aggregate realtime model metrics with detailed token breakdown', () => { + const metrics: RealtimeModelMetrics = { + type: 'realtime_model_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 1000, + ttftMs: 100, + cancelled: false, + inputTokens: 500, + outputTokens: 300, + totalTokens: 800, + tokensPerSecond: 10, + sessionDurationMs: 5000, + inputTokenDetails: { + audioTokens: 200, + textTokens: 250, + imageTokens: 50, + cachedTokens: 100, + cachedTokensDetails: { + audioTokens: 30, + textTokens: 50, + imageTokens: 20, + }, + }, + outputTokenDetails: { + textTokens: 200, + audioTokens: 100, + imageTokens: 0, + }, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o-realtime', + }, + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o-realtime'); + expect(llmUsage.inputTokens).toBe(500); + expect(llmUsage.inputCachedTokens).toBe(100); + expect(llmUsage.inputAudioTokens).toBe(200); + 
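// cached per-modality counts are read from inputTokenDetails.cachedTokensDetails; overall totals come from inputTokenDetails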
expect(llmUsage.inputCachedAudioTokens).toBe(30); + expect(llmUsage.inputTextTokens).toBe(250); + expect(llmUsage.inputCachedTextTokens).toBe(50); + expect(llmUsage.inputImageTokens).toBe(50); + expect(llmUsage.inputCachedImageTokens).toBe(20); + expect(llmUsage.outputTokens).toBe(300); + expect(llmUsage.outputTextTokens).toBe(200); + expect(llmUsage.outputAudioTokens).toBe(100); + expect(llmUsage.sessionDurationMs).toBe(5000); + }); + }); + + describe('mixed metrics collection', () => { + it('should collect and separate LLM, TTS, and STT metrics', () => { + const llmMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const ttsMetrics: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const sttMetrics: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req3', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(llmMetrics); + collector.collect(ttsMetrics); + collector.collect(sttMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(3); + + const llmUsage = usage.find((u) => u.type === 'llm_usage'); + const ttsUsage = usage.find((u) => u.type === 'tts_usage'); + const sttUsage = usage.find((u) => u.type === 'stt_usage'); + + expect(llmUsage).toBeDefined(); + expect(ttsUsage).toBeDefined(); + expect(sttUsage).toBeDefined(); + }); + }); + + describe('flatten returns copies', () => { + it('should return deep copies of usage objects', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics); + + const usage1 = collector.flatten(); + const usage2 = collector.flatten(); + + // Should be equal values + expect(usage1[0]).toEqual(usage2[0]); + + // But not the same object reference + expect(usage1[0]).not.toBe(usage2[0]); + + // Modifying one shouldn't affect the other + (usage1[0] as LLMModelUsage).inputTokens = 9999; + expect((usage2[0] as LLMModelUsage).inputTokens).toBe(200); + }); + }); + + describe('handles missing metadata', () => { + it('should use empty strings when metadata is missing', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + // No metadata + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.provider).toBe(''); + expect(llmUsage.model).toBe(''); + }); + }); + + 
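// Illustrative sketch (editor's addition, not part of this diff): typical consumption of the
// collector outside of tests. Collect metrics events as they arrive, then flatten() into one
// usage entry per provider:model pair and drop zero-valued fields with filterZeroValues()
// before reporting. The `collected` array is a hypothetical event source; only symbols already
// imported by this test file are used.
const summarizeModelUsage = (collected: (LLMMetrics | TTSMetrics | STTMetrics)[]) => {
  const usageCollector = new ModelUsageCollector();
  for (const m of collected) usageCollector.collect(m); // VAD and EOU metrics are ignored
  return usageCollector.flatten().map(filterZeroValues); // flatten() returns copies, safe to post-process
};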
describe('ignores VAD and EOU metrics', () => { + it('should not collect VAD metrics', () => { + const vadMetrics = { + type: 'vad_metrics' as const, + label: 'test', + timestamp: Date.now(), + idleTimeMs: 100, + inferenceDurationTotalMs: 50, + inferenceCount: 10, + }; + + collector.collect(vadMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(0); + }); + + it('should not collect EOU metrics', () => { + const eouMetrics = { + type: 'eou_metrics' as const, + timestamp: Date.now(), + endOfUtteranceDelayMs: 100, + transcriptionDelayMs: 50, + onUserTurnCompletedDelayMs: 30, + lastSpeakingTimeMs: Date.now(), + }; + + collector.collect(eouMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(0); + }); + }); + }); +}); diff --git a/agents/src/metrics/model_usage.ts b/agents/src/metrics/model_usage.ts new file mode 100644 index 000000000..d90ed7123 --- /dev/null +++ b/agents/src/metrics/model_usage.ts @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { + AgentMetrics, + LLMMetrics, + RealtimeModelMetrics, + STTMetrics, + TTSMetrics, +} from './base.js'; + +export type LLMModelUsage = { + type: 'llm_usage'; + /** The provider name (e.g., 'openai', 'anthropic'). */ + provider: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + model: string; + /** Total input tokens. */ + inputTokens: number; + /** Input tokens served from cache. */ + inputCachedTokens: number; + /** Input audio tokens (for multimodal models). */ + inputAudioTokens: number; + /** Cached input audio tokens. */ + inputCachedAudioTokens: number; + /** Input text tokens. */ + inputTextTokens: number; + /** Cached input text tokens. */ + inputCachedTextTokens: number; + /** Input image tokens (for multimodal models). */ + inputImageTokens: number; + /** Cached input image tokens. */ + inputCachedImageTokens: number; + /** Total output tokens. */ + outputTokens: number; + /** Output audio tokens (for multimodal models). */ + outputAudioTokens: number; + /** Output text tokens. */ + outputTextTokens: number; + /** Total session connection duration in milliseconds (for session-based billing like xAI). */ + sessionDurationMs: number; +}; + +export type TTSModelUsage = { + type: 'tts_usage'; + /** The provider name (e.g., 'elevenlabs', 'cartesia'). */ + provider: string; + /** The model name (e.g., 'eleven_turbo_v2', 'sonic'). */ + model: string; + /** Input text tokens (for token-based TTS billing, e.g., OpenAI TTS). */ + inputTokens: number; + /** Output audio tokens (for token-based TTS billing, e.g., OpenAI TTS). */ + outputTokens: number; + /** Number of characters synthesized (for character-based TTS billing). */ + charactersCount: number; + /** + * Duration of generated audio in milliseconds. + */ + audioDurationMs: number; +}; + +export type STTModelUsage = { + type: 'stt_usage'; + /** The provider name (e.g., 'deepgram', 'assemblyai'). */ + provider: string; + /** The model name (e.g., 'nova-2', 'best'). */ + model: string; + /** Input audio tokens (for token-based STT billing). */ + inputTokens: number; + /** Output text tokens (for token-based STT billing). */ + outputTokens: number; + /** Duration of processed audio in milliseconds. 
*/ + audioDurationMs: number; +}; + +export type ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage; + +export function filterZeroValues(usage: T): Partial { + const result: Partial = {} as Partial; + for (const [key, value] of Object.entries(usage)) { + if (value !== 0 && value !== 0.0) { + (result as Record)[key] = value; + } + } + return result; +} + +export class ModelUsageCollector { + private llmUsage: Map = new Map(); + private ttsUsage: Map = new Map(); + private sttUsage: Map = new Map(); + + /** Extract provider and model from metrics metadata. */ + private extractProviderModel( + metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics, + ): [string, string] { + let provider = ''; + let model = ''; + if (metrics.metadata) { + provider = metrics.metadata.modelProvider || ''; + model = metrics.metadata.modelName || ''; + } + return [provider, model]; + } + + /** Get or create an LLMModelUsage for the given provider/model combination. */ + private getLLMUsage(provider: string, model: string): LLMModelUsage { + const key = `${provider}:${model}`; + let usage = this.llmUsage.get(key); + if (!usage) { + usage = { + type: 'llm_usage', + provider, + model, + inputTokens: 0, + inputCachedTokens: 0, + inputAudioTokens: 0, + inputCachedAudioTokens: 0, + inputTextTokens: 0, + inputCachedTextTokens: 0, + inputImageTokens: 0, + inputCachedImageTokens: 0, + outputTokens: 0, + outputAudioTokens: 0, + outputTextTokens: 0, + sessionDurationMs: 0, + }; + this.llmUsage.set(key, usage); + } + return usage; + } + + /** Get or create a TTSModelUsage for the given provider/model combination. */ + private getTTSUsage(provider: string, model: string): TTSModelUsage { + const key = `${provider}:${model}`; + let usage = this.ttsUsage.get(key); + if (!usage) { + usage = { + type: 'tts_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + charactersCount: 0, + audioDurationMs: 0, + }; + this.ttsUsage.set(key, usage); + } + return usage; + } + + /** Get or create an STTModelUsage for the given provider/model combination. */ + private getSTTUsage(provider: string, model: string): STTModelUsage { + const key = `${provider}:${model}`; + let usage = this.sttUsage.get(key); + if (!usage) { + usage = { + type: 'stt_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + audioDurationMs: 0, + }; + this.sttUsage.set(key, usage); + } + return usage; + } + + /** Collect metrics and aggregate usage by model/provider. */ + collect(metrics: AgentMetrics): void { + if (metrics.type === 'llm_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.promptTokens; + usage.inputCachedTokens += metrics.promptCachedTokens; + usage.outputTokens += metrics.completionTokens; + } else if (metrics.type === 'realtime_model_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.inputTokens; + usage.inputCachedTokens += metrics.inputTokenDetails.cachedTokens; + + usage.inputTextTokens += metrics.inputTokenDetails.textTokens; + usage.inputCachedTextTokens += metrics.inputTokenDetails.cachedTokensDetails?.textTokens ?? 0; + usage.inputImageTokens += metrics.inputTokenDetails.imageTokens; + usage.inputCachedImageTokens += + metrics.inputTokenDetails.cachedTokensDetails?.imageTokens ?? 
0; + usage.inputAudioTokens += metrics.inputTokenDetails.audioTokens; + usage.inputCachedAudioTokens += + metrics.inputTokenDetails.cachedTokensDetails?.audioTokens ?? 0; + + usage.outputTextTokens += metrics.outputTokenDetails.textTokens; + usage.outputAudioTokens += metrics.outputTokenDetails.audioTokens; + usage.outputTokens += metrics.outputTokens; + usage.sessionDurationMs += metrics.sessionDurationMs ?? 0; + } else if (metrics.type === 'tts_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const ttsUsage = this.getTTSUsage(provider, model); + ttsUsage.inputTokens += metrics.inputTokens ?? 0; + ttsUsage.outputTokens += metrics.outputTokens ?? 0; + ttsUsage.charactersCount += metrics.charactersCount; + ttsUsage.audioDurationMs += metrics.audioDurationMs; + } else if (metrics.type === 'stt_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const sttUsage = this.getSTTUsage(provider, model); + sttUsage.inputTokens += metrics.inputTokens ?? 0; + sttUsage.outputTokens += metrics.outputTokens ?? 0; + sttUsage.audioDurationMs += metrics.audioDurationMs; + } + // VAD and EOU metrics are not aggregated for usage tracking. + } + + flatten(): ModelUsage[] { + const result: ModelUsage[] = []; + for (const u of this.llmUsage.values()) { + result.push({ ...u }); + } + for (const u of this.ttsUsage.values()) { + result.push({ ...u }); + } + for (const u of this.sttUsage.values()) { + result.push({ ...u }); + } + return result; + } +} diff --git a/agents/src/metrics/usage_collector.ts b/agents/src/metrics/usage_collector.ts index c7f0e6c3d..c815c8394 100644 --- a/agents/src/metrics/usage_collector.ts +++ b/agents/src/metrics/usage_collector.ts @@ -1,8 +1,13 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import { log } from '../log.js'; import type { AgentMetrics } from './base.js'; +/** + * @deprecated Use LLMModelUsage, TTSModelUsage, or STTModelUsage instead. + * These new types provide per-model/provider usage aggregation for more detailed tracking. + */ export interface UsageSummary { llmPromptTokens: number; llmPromptCachedTokens: number; @@ -11,10 +16,16 @@ export interface UsageSummary { sttAudioDurationMs: number; } +/** + * @deprecated Use ModelUsageCollector instead. + * ModelUsageCollector provides per-model/provider usage aggregation for more detailed tracking. + */ export class UsageCollector { private summary: UsageSummary; + private logger = log(); constructor() { + this.logger.warn('UsageCollector is deprecated. 
Use ModelUsageCollector instead.'); this.summary = { llmPromptTokens: 0, llmPromptCachedTokens: 0, diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 1fb68bab2..edaeaa856 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -4,14 +4,16 @@ import type { ReadableStream } from 'node:stream/web'; import { IdentityTransform } from './identity_transform.js'; -export interface StreamChannel { +export interface StreamChannel { write(chunk: T): Promise; close(): Promise; stream(): ReadableStream; + abort(error: E): Promise; readonly closed: boolean; + addStreamInput(stream: ReadableStream): void; } -export function createStreamChannel(): StreamChannel { +export function createStreamChannel(): StreamChannel { const transform = new IdentityTransform(); const writer = transform.writable.getWriter(); let isClosed = false; @@ -19,6 +21,36 @@ export function createStreamChannel(): StreamChannel { return { write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, + abort: async (error: E) => { + if (isClosed) return; + isClosed = true; + try { + await writer.abort(error); + } catch (e) { + if (e instanceof Error && e.name === 'TypeError') return; + throw e; + } + }, + addStreamInput: (newInputStream) => { + if (isClosed) return; + const reader = newInputStream.getReader(); + (async () => { + try { + while (!isClosed) { + const { done, value } = await reader.read(); + if (done) break; + await writer.write(value); + } + } catch (err) { + if (!isClosed) { + isClosed = true; + await writer.abort(err as E); + } + } finally { + reader.releaseLock(); + } + })().catch(() => {}); + }, close: async () => { try { const result = await writer.close(); diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts index 48c689ba2..523689d5e 100644 --- a/agents/src/stt/stt.ts +++ b/agents/src/stt/stt.ts @@ -59,6 +59,10 @@ export interface SpeechData { export interface RecognitionUsage { audioDuration: number; + /** Input audio tokens (for token-based STT billing). */ + inputTokens?: number; + /** Output text tokens (for token-based STT billing). */ + outputTokens?: number; } /** SpeechEvent is a packet of speech-to-text data. */ @@ -121,6 +125,30 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter { const startTime = process.hrtime.bigint(); @@ -134,6 +162,10 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter durationMs: 0, label: this.#stt.label, audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000), + inputTokens: event.recognitionUsage!.inputTokens ?? 0, + outputTokens: event.recognitionUsage!.outputTokens ?? 0, streamed: true, + metadata: { + modelProvider: this.#stt.provider, + modelName: this.#stt.model, + }, }; this.#stt.emit('metrics_collected', metrics); } diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index db76f7bc1..3a0afbd0a 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -30,6 +30,10 @@ export const ATTR_FUNCTION_TOOLS = 'lk.function_tools'; export const ATTR_RESPONSE_TEXT = 'lk.response.text'; export const ATTR_RESPONSE_FUNCTION_CALLS = 'lk.response.function_calls'; +// New latency attributes for response timing +/** Time to first token in seconds. 
*/ +export const ATTR_RESPONSE_TTFT = 'lk.response.ttft'; + // function tool export const ATTR_FUNCTION_TOOL_NAME = 'lk.function_tool.name'; export const ATTR_FUNCTION_TOOL_ARGS = 'lk.function_tool.arguments'; @@ -41,6 +45,9 @@ export const ATTR_TTS_INPUT_TEXT = 'lk.input_text'; export const ATTR_TTS_STREAMING = 'lk.tts.streaming'; export const ATTR_TTS_LABEL = 'lk.tts.label'; +/** Time to first byte in seconds. */ +export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb'; + // eou detection export const ATTR_EOU_PROBABILITY = 'lk.eou.probability'; export const ATTR_EOU_UNLIKELY_THRESHOLD = 'lk.eou.unlikely_threshold'; @@ -51,15 +58,27 @@ export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence'; export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay'; export const ATTR_END_OF_TURN_DELAY = 'lk.end_of_turn_delay'; +// Adaptive Interruption attributes +export const ATTR_IS_INTERRUPTION = 'lk.is_interruption'; +export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability'; +export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration'; +export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration'; +export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay'; + // metrics export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; export const ATTR_REALTIME_MODEL_METRICS = 'lk.realtime_model_metrics'; +/** End-to-end latency in seconds. */ +export const ATTR_E2E_LATENCY = 'lk.e2e_latency'; + // OpenTelemetry GenAI attributes // OpenTelemetry specification: https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/ export const ATTR_GEN_AI_OPERATION_NAME = 'gen_ai.operation.name'; export const ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'; +/** The provider name (e.g., 'openai', 'anthropic'). 
*/ +export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name'; export const ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'; export const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'; diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index 28ef4c746..6f39ba427 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -24,6 +24,7 @@ import { AccessToken } from 'livekit-server-sdk'; import fs from 'node:fs/promises'; import type { ChatContent, ChatItem } from '../llm/index.js'; import { enableOtelLogging } from '../log.js'; +import { filterZeroValues } from '../metrics/model_usage.js'; import type { SessionReport } from '../voice/report.js'; import { type SimpleLogRecord, SimpleOTLPHttpLogExporter } from './otel_http_exporter.js'; import { flushPinoLogs, initPinoCloudExporter } from './pino_otel_transport.js'; @@ -445,6 +446,8 @@ export async function uploadSessionReport(options: { 'logger.name': 'chat_history', }; + const usage = report.modelUsage?.map(filterZeroValues) || null; + logRecords.push({ body: 'session report', timestampMs: report.startedAt || report.timestamp || 0, @@ -453,6 +456,7 @@ export async function uploadSessionReport(options: { 'session.options': report.options || {}, 'session.report_timestamp': report.timestamp, agent_name: agentName, + usage, }, }); diff --git a/agents/src/tts/tts.ts b/agents/src/tts/tts.ts index 8ee46515a..2595451da 100644 --- a/agents/src/tts/tts.ts +++ b/agents/src/tts/tts.ts @@ -87,6 +87,30 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter; #ttsRequestSpan?: Span; + #inputTokens = 0; + #outputTokens = 0; constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) { this.#tts = tts; @@ -275,6 +301,18 @@ export abstract class SynthesizeStream } } + /** + * Set token usage for token-based TTS billing (e.g., OpenAI TTS). + * Plugins should call this method to report token usage. 
+ */ + protected setTokenUsage({ + inputTokens = 0, + outputTokens = 0, + }: { inputTokens?: number; outputTokens?: number } = {}): void { + this.#inputTokens = inputTokens; + this.#outputTokens = outputTokens; + } + protected async monitorMetrics() { const startTime = process.hrtime.bigint(); let audioDurationMs = 0; @@ -296,12 +334,22 @@ export abstract class SynthesizeStream audioDurationMs: roundedAudioDurationMs, cancelled: this.abortController.signal.aborted, label: this.#tts.label, - streamed: false, + inputTokens: this.#inputTokens, + outputTokens: this.#outputTokens, + streamed: true, + metadata: { + modelProvider: this.#tts.provider, + modelName: this.#tts.model, + }, }; if (this.#ttsRequestSpan) { this.#ttsRequestSpan.setAttribute(traceTypes.ATTR_TTS_METRICS, JSON.stringify(metrics)); } this.#tts.emit('metrics_collected', metrics); + + // Reset token usage after emitting metrics for the next segment + this.#inputTokens = 0; + this.#outputTokens = 0; } }; @@ -425,6 +473,8 @@ export abstract class ChunkedStream implements AsyncIterableIterator(); export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse'); @@ -63,6 +65,7 @@ export interface AgentOptions { instructions: string; chatCtx?: ChatContext; tools?: ToolContext; + /** @deprecated use turnHandling instead */ turnDetection?: TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; @@ -70,6 +73,7 @@ export interface AgentOptions { tts?: TTS | TTSModelString; allowInterruptions?: boolean; minConsecutiveSpeechDelay?: number; + turnHandling?: TurnHandlingConfig; } export class Agent { @@ -79,6 +83,9 @@ export class Agent { private _vad?: VAD; private _llm?: LLM | RealtimeModel; private _tts?: TTS; + private turnHandling?: TurnHandlingConfig; + private _interruptionDetection: InterruptionConfig['mode']; + private _allowInterruptions?: boolean; /** @internal */ _agentActivity?: AgentActivity; @@ -92,17 +99,8 @@ export class Agent { /** @internal */ _tools?: ToolContext; - constructor({ - id, - instructions, - chatCtx, - tools, - turnDetection, - stt, - vad, - llm, - tts, - }: AgentOptions) { + constructor(options: AgentOptions) { + const { id, instructions, chatCtx, tools, stt, vad, llm, tts, turnHandling } = options; if (id) { this._id = id; } else { @@ -126,7 +124,9 @@ export class Agent { }) : ChatContext.empty(); - this.turnDetection = turnDetection; + this.turnHandling = turnHandling; // TODO migrate legacy options to new turn handling config when turnConfig is unset + + this.turnDetection = this.turnHandling?.turnDetection; this._vad = vad; if (typeof stt === 'string') { @@ -147,6 +147,11 @@ export class Agent { this._tts = tts; } + this._interruptionDetection = this.turnHandling?.interruption.mode; + if (this.turnHandling?.interruption.mode !== undefined) { + this._allowInterruptions = !!this.turnHandling.interruption.mode; + } + this._agentActivity = undefined; } @@ -186,6 +191,14 @@ export class Agent { return this.getActivityOrThrow().agentSession as AgentSession; } + get interruptionDetection(): InterruptionConfig['mode'] { + return this._interruptionDetection; + } + + get allowInterruptions(): boolean | undefined { + return this._allowInterruptions; + } + async onEnter(): Promise {} async onExit(): Promise {} diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 2cc66449a..54efa2977 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -8,6 +8,8 @@ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api' import { Heap } 
from 'heap-js'; import { AsyncLocalStorage } from 'node:async_hooks'; import { ReadableStream } from 'node:stream/web'; +import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; +import type { InterruptionEvent } from '../inference/interruption/types.js'; import { type ChatContext, ChatMessage } from '../llm/chat_context.js'; import { type ChatItem, @@ -50,7 +52,6 @@ import { type EndOfTurnInfo, type PreemptiveGenerationInfo, type RecognitionHooks, - type _TurnDetector, } from './audio_recognition.js'; import { AgentSessionEventTypes, @@ -86,13 +87,14 @@ interface PreemptiveGeneration { createdAt: number; } +// TODO add false interruption handling and barge-in handling for https://github.com/livekit/agents/pull/3109/changes export class AgentActivity implements RecognitionHooks { private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000; private started = false; private audioRecognition?: AudioRecognition; private realtimeSession?: RealtimeSession; private realtimeSpans?: Map; // Maps response_id to OTEL span for metrics recording - private turnDetectionMode?: Exclude; + private turnDetectionMode?: TurnDetectionMode; private logger = log(); private _draining = false; private _currentSpeech?: SpeechHandle; @@ -104,6 +106,10 @@ export class AgentActivity implements RecognitionHooks { // default to null as None, which maps to the default provider tool choice value private toolChoice: ToolChoice | null = null; private _preemptiveGeneration?: PreemptiveGeneration; + private interruptionDetector?: AdaptiveInterruptionDetector; + private isInterruptionDetectionEnabled: boolean; + private isInterruptionByAudioActivityEnabled: boolean; + private isDefaultInterruptionByAudioActivityEnabled: boolean; agent: Agent; agentSession: AgentSession; @@ -204,6 +210,16 @@ export class AgentActivity implements RecognitionHooks { 'for more responsive interruption handling.', ); } + + this.interruptionDetector = this.resolveInterruptionDetector(); + this.isInterruptionDetectionEnabled = !!this.interruptionDetector; + + // this allows taking over audio interruption temporarily until interruption is detected + // by default it is true unless turnDetection is manual or realtime_llm + this.isInterruptionByAudioActivityEnabled = + this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm'; + + this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled; } async start(): Promise { @@ -295,9 +311,12 @@ export class AgentActivity implements RecognitionHooks { vad: this.vad, turnDetector: typeof this.turnDetection === 'string' ?
undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, - minEndpointingDelay: this.agentSession.options.minEndpointingDelay, - maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, + interruptionDetection: this.interruptionDetector, + minEndpointingDelay: this.agentSession.options.turnHandling.endpointing.minDelay, + maxEndpointingDelay: this.agentSession.options.turnHandling.endpointing.maxDelay, rootSpanContext: this.agentSession.rootSpanContext, + sttModel: this.stt?.model, + sttProvider: this.stt?.provider, }); this.audioRecognition.start(); this.started = true; @@ -356,7 +375,7 @@ export class AgentActivity implements RecognitionHooks { get allowInterruptions(): boolean { // TODO(AJS-51): Allow options to be defined in Agent class - return this.agentSession.options.allowInterruptions; + return this.agentSession.options.turnHandling.interruption?.mode !== false; } get turnDetection(): TurnDetectionMode | undefined { @@ -385,7 +404,13 @@ export class AgentActivity implements RecognitionHooks { } } - updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void { + updateOptions({ + toolChoice, + turnDetection, + }: { + toolChoice?: ToolChoice | null; + turnDetection?: TurnDetectionMode; + }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; } @@ -393,6 +418,10 @@ export class AgentActivity implements RecognitionHooks { if (this.realtimeSession) { this.realtimeSession.updateOptions({ toolChoice: this.toolChoice }); } + + this.turnDetectionMode = turnDetection; // TODO fix types + this.isDefaultInterruptionByAudioActivityEnabled = + this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm'; } attachAudioInput(audioStream: ReadableStream): void { @@ -549,6 +578,9 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfOverlapSpeech(0, this.agentSession._userSpeakingSpan); + } } // this.interrupt() is going to raise when allow_interruptions is False, @@ -567,6 +599,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfOverlapSpeech(this.agentSession._userSpeakingSpan); + } this.agentSession._updateUserState('listening'); } @@ -644,6 +679,12 @@ export class AgentActivity implements RecognitionHooks { speechStartTime = speechStartTime - ev.speechDuration; } this.agentSession._updateUserState('speaking', speechStartTime); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfOverlapSpeech( + ev.speechDuration, + this.agentSession._userSpeakingSpan, + ); + } } onEndOfSpeech(ev: VADEvent): void { @@ -651,6 +692,9 @@ export class AgentActivity implements RecognitionHooks { if (ev) { speechEndTime = speechEndTime - ev.silenceDuration; } + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfOverlapSpeech(this.agentSession._userSpeakingSpan); + } this.agentSession._updateUserState('listening', speechEndTime); } @@ -660,12 +704,16 @@ export class AgentActivity implements RecognitionHooks { return; } - if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) { + if (ev.speechDuration >= this.agentSession.options.turnHandling.interruption?.minDuration) { 
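// user speech has lasted at least turnHandling.interruption.minDuration, so attempt an audio-activity interruption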
this.interruptByAudioActivity(); } } private interruptByAudioActivity(): void { + if (!this.isInterruptionByAudioActivityEnabled) { + return; + } + if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) { // skip speech handle interruption if server side turn detection is enabled return; @@ -675,7 +723,11 @@ export class AgentActivity implements RecognitionHooks { // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0 // - Apply check to all STT results: empty string, undefined, or any length // - This ensures consistent behavior across all interruption scenarios - if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) { + if ( + this.stt && + this.agentSession.options.turnHandling.interruption?.minWords > 0 && + this.audioRecognition + ) { const text = this.audioRecognition.currentTranscript; // TODO(shubhra): better word splitting for multi-language @@ -685,7 +737,7 @@ export class AgentActivity implements RecognitionHooks { // Only allow interruption if word count meets or exceeds minInterruptionWords // This applies to all cases: empty strings, partial speech, and full speech - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) { return; } } @@ -706,6 +758,14 @@ export class AgentActivity implements RecognitionHooks { } } + onInterruption(ev: InterruptionEvent) { + this.restoreInterruptionByAudioActivity(); + this.interruptByAudioActivity(); + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(ev.overlapSpeechStartedAt || ev.timestamp); + } + } + onInterimTranscript(ev: SpeechEvent): void { if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) { // skip stt transcription if userTranscription is enabled on the realtime model @@ -852,16 +912,16 @@ export class AgentActivity implements RecognitionHooks { this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && - this.agentSession.options.minInterruptionWords > 0 + this.agentSession.options.turnHandling.interruption?.minWords > 0 ) { const wordCount = splitWords(info.newTranscript, true).length; - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.options.turnHandling.interruption?.minWords) { // avoid interruption if the new_transcript contains fewer words than minInterruptionWords this.cancelPreemptiveGeneration(); this.logger.info( { wordCount, - minInterruptionWords: this.agentSession.options.minInterruptionWords, + minInterruptionWords: this.agentSession.options.turnHandling.interruption.minWords, }, 'skipping user input, word count below minimum interruption threshold', ); @@ -1246,6 +1306,10 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(); + this.isInterruptionByAudioActivityEnabled = false; + } }; if (!audioOutput) { @@ -1263,6 +1327,8 @@ export class AgentActivity implements RecognitionHooks { audioSource, modelSettings, replyAbortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); @@ -1315,6 +1381,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + 
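// agent playout ended: notify audio recognition so held user transcripts can be released, then restore the default audio-activity interruption behavior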
if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + this.restoreInterruptionByAudioActivity(); } } @@ -1385,6 +1455,8 @@ export class AgentActivity implements RecognitionHooks { toolCtx, modelSettings, replyAbortController, + this.llm?.model, + this.llm?.provider, ); tasks.push(llmTask); @@ -1401,6 +1473,8 @@ export class AgentActivity implements RecognitionHooks { ttsTextInput, modelSettings, replyAbortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); } else { @@ -1445,6 +1519,10 @@ export class AgentActivity implements RecognitionHooks { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(); + this.isInterruptionByAudioActivityEnabled = false; + } }; let audioOut: _AudioOut | null = null; @@ -1568,6 +1646,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + this.restoreInterruptionByAudioActivity(); + } } this.logger.info( @@ -1602,6 +1684,12 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + this.restoreInterruptionByAudioActivity(); + } + } } // mark the playout done before waiting for the tool execution @@ -1889,6 +1977,8 @@ export class AgentActivity implements RecognitionHooks { ttsTextInput, modelSettings, abortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); realtimeAudioResult = ttsStream; @@ -2375,6 +2465,55 @@ export class AgentActivity implements RecognitionHooks { unlock(); } } + + private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined { + const interruptionDetection = + this.agent.interruptionDetection ?? 
this.agentSession.interruptionDetection; + if ( + !( + this.stt && + this.stt.capabilities.alignedTranscript && + this.stt.capabilities.streaming && + this.vad && + this.turnDetection !== 'manual' && + this.turnDetection !== 'realtime_llm' && + !(this.llm instanceof RealtimeModel) + ) + ) { + if ( + typeof interruptionDetection === 'string' && + ['adaptive', 'vad'].includes(interruptionDetection) + ) { + this.logger.warn( + "interruption_detection is provided, but it's not compatible with the current configuration and will be disabled", + ); + return undefined; + } + } + + if ( + (interruptionDetection !== undefined && interruptionDetection === false) || + interruptionDetection === 'vad' + ) { + return undefined; + } + + const detector = new AdaptiveInterruptionDetector(); + + // TODO cleanup these listeners + detector.on('user_interruption_detected', (ev) => + this.agentSession.emit(AgentSessionEventTypes.UserInterruptionDetected, ev), + ); + detector.on('user_non_interruption_detected', (ev) => + this.agentSession.emit(AgentSessionEventTypes.UserNonInterruptionDetected, ev), + ); + + return detector; + } + + private restoreInterruptionByAudioActivity(): void { + this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled; + } } function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined { diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 25c26391e..f81b40d60 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -15,12 +15,14 @@ import { type STTModelString, type TTSModelString, } from '../inference/index.js'; +import type { InterruptionEvent } from '../inference/interruption/types.js'; import { type JobContext, getJobContext } from '../job.js'; import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js'; import type { LLMError } from '../llm/llm.js'; import { log } from '../log.js'; +import { type ModelUsage, ModelUsageCollector } from '../metrics/model_usage.js'; import type { STT } from '../stt/index.js'; import type { STTError } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; @@ -62,30 +64,67 @@ import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; import { RunResult } from './testing/run_result.js'; +import type { InterruptionConfig } from './turn_config/interruption.js'; +import type { + InternalTurnHandlingConfig, + TurnHandlingConfig, +} from './turn_config/turn_handling.js'; +import { migrateLegacyOptions } from './turn_config/utils.js'; + +export interface AgentSessionUsage { + /** List of usage summaries, one per model/provider combination. */ + modelUsage: ModelUsage[]; +} -export interface VoiceOptions { - allowInterruptions: boolean; - discardAudioIfUninterruptible: boolean; - minInterruptionDuration: number; - minInterruptionWords: number; - minEndpointingDelay: number; - maxEndpointingDelay: number; +export interface SessionOptions { maxToolSteps: number; + /** + * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected. 
+ * When `true`, the agent sends inference calls as soon as a user transcript is received rather + * than waiting for a definitive turn boundary. This can reduce response latency by overlapping + * model inference with user audio, but may incur extra compute if the user interrupts or + * revises mid-utterance. + * @defaultValue false + */ preemptiveGeneration: boolean; - userAwayTimeout?: number | null; + /** + * If set, set the user state as "away" after this amount of time after user and agent are + * silent. Set to `undefined` to disable. + * @defaultValue 15.0 + */ + userAwayTimeout: number | null; + /** + * Configuration for turn handling. + */ + turnHandling: Partial; + + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.mode instead. */ + allowInterruptions?: boolean; + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */ + discardAudioIfUninterruptible?: boolean; + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minDuration instead. */ + minInterruptionDuration?: number; + /** @deprecated Use {@link SessionOptions.turnHandling}.interruption.minWords instead. */ + minInterruptionWords?: number; + /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.minDelay instead. */ + minEndpointingDelay?: number; + /** @deprecated Use {@link SessionOptions.turnHandling}.endpointing.maxDelay instead. */ + maxEndpointingDelay?: number; } -const defaultVoiceOptions: VoiceOptions = { - allowInterruptions: true, - discardAudioIfUninterruptible: true, - minInterruptionDuration: 500, - minInterruptionWords: 0, - minEndpointingDelay: 500, - maxEndpointingDelay: 6000, +export interface InternalSessionOptions extends SessionOptions { + turnHandling: InternalTurnHandlingConfig; +} + +export const defaultSessionOptions = { maxToolSteps: 3, preemptiveGeneration: false, userAwayTimeout: 15.0, -} as const; + turnHandling: {}, +} as const satisfies SessionOptions; + +/** @deprecated {@link VoiceOptions} has been renamed to {@link SessionOptions} */ +export type VoiceOptions = SessionOptions; export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; @@ -99,17 +138,23 @@ export type AgentSessionCallbacks = { [AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void; [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void; [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void; + [AgentSessionEventTypes.UserInterruptionDetected]: (ev: InterruptionEvent) => void; + [AgentSessionEventTypes.UserNonInterruptionDetected]: (ev: InterruptionEvent) => void; }; export type AgentSessionOptions = { - turnDetection?: TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; userData?: UserData; - voiceOptions?: Partial; + options?: Partial; connOptions?: SessionConnectOptions; + + /** @deprecated use {@link AgentSessionOptions.options}.turnHandling.turnDetection instead */ + turnDetection?: TurnDetectionMode; + /** @deprecated use {@link AgentSessionOptions.options} instead */ + voiceOptions?: Partial; }; export class AgentSession< @@ -121,7 +166,7 @@ export class AgentSession< tts?: TTS; turnDetection?: TurnDetectionMode; - readonly options: VoiceOptions; + readonly options: InternalSessionOptions; private agent?: Agent; private activity?: AgentActivity; @@ -150,9 +195,12 @@ export class AgentSession< private ttsErrorCounts = 0; private sessionSpan?: Span; - private userSpeakingSpan?: Span; private 
agentSpeakingSpan?: Span; + private _interruptionDetection?: InterruptionConfig['mode']; + + private _usageCollector: ModelUsageCollector = new ModelUsageCollector(); + /** @internal */ _recorderIO?: RecorderIO; @@ -171,20 +219,15 @@ export class AgentSession< /** @internal - Current run state for testing */ _globalRunState?: RunResult; - constructor(opts: AgentSessionOptions) { + /** @internal */ + _userSpeakingSpan?: Span; + + constructor(options: AgentSessionOptions) { super(); - const { - vad, - stt, - llm, - tts, - turnDetection, - userData, - voiceOptions = defaultVoiceOptions, - connOptions, - } = opts; + const opts = migrateLegacyOptions(options); + const { vad, stt, llm, tts, userData, connOptions, options: sessionOptions } = opts; // Merge user-provided connOptions with defaults this._connOptions = { sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions }, @@ -215,7 +258,8 @@ export class AgentSession< this.tts = tts; } - this.turnDetection = turnDetection; + this.turnDetection = sessionOptions?.turnHandling?.turnDetection; + this._interruptionDetection = sessionOptions?.turnHandling?.interruption?.mode; this._userData = userData; // configurable IO @@ -224,7 +268,8 @@ export class AgentSession< // This is the "global" chat context, it holds the entire conversation history this._chatCtx = ChatContext.empty(); - this.options = { ...defaultVoiceOptions, ...voiceOptions }; + + this.options = opts.options; this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this); this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed); @@ -236,6 +281,9 @@ export class AgentSession< ): boolean { const eventData = args[0] as AgentEvent; this._recordedEvents.push(eventData); + if (event === AgentSessionEventTypes.MetricsCollected) { + this._usageCollector.collect((eventData as MetricsCollectedEvent).metrics); + } return super.emit(event, ...args); } @@ -264,6 +312,17 @@ export class AgentSession< return this._connOptions; } + get interruptionDetection() { + return this._interruptionDetection; + } + + /** + * Returns usage summaries for this session, one per model/provider combination. 
+ */ + get usage(): AgentSessionUsage { + return { modelUsage: this._usageCollector.flatten() }; + } + set userData(value: UserData) { this._userData = value; } @@ -725,8 +784,8 @@ export class AgentSession< return; } - if (state === 'speaking' && this.userSpeakingSpan === undefined) { - this.userSpeakingSpan = tracer.startSpan({ + if (state === 'speaking' && this._userSpeakingSpan === undefined) { + this._userSpeakingSpan = tracer.startSpan({ name: 'user_speaking', context: this.rootSpanContext, startTime: lastSpeakingTime, @@ -734,9 +793,9 @@ export class AgentSession< // TODO(brian): PR4 - Set participant attributes if roomIO.linkedParticipant is available // (Ref: Python agent_session.py line 1192-1195) - } else if (this.userSpeakingSpan !== undefined) { - this.userSpeakingSpan.end(lastSpeakingTime); - this.userSpeakingSpan = undefined; + } else if (this._userSpeakingSpan !== undefined) { + this._userSpeakingSpan.end(lastSpeakingTime); + this._userSpeakingSpan = undefined; } const oldState = this.userState; @@ -867,9 +926,9 @@ export class AgentSession< this.sessionSpan = undefined; } - if (this.userSpeakingSpan) { - this.userSpeakingSpan.end(); - this.userSpeakingSpan = undefined; + if (this._userSpeakingSpan) { + this._userSpeakingSpan.end(); + this._userSpeakingSpan = undefined; } if (this.agentSpeakingSpan) { @@ -886,6 +945,7 @@ export class AgentSession< this.rootSpanContext = undefined; this.llmErrorCounts = 0; this.ttsErrorCounts = 0; + this._usageCollector = new ModelUsageCollector(); this.logger.info({ reason, error }, 'AgentSession closed'); } diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 25d430684..a4a3ea370 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -5,14 +5,22 @@ import { AudioFrame } from '@livekit/rtc-node'; import type { Context, Span } from '@opentelemetry/api'; import type { WritableStreamDefaultWriter } from 'node:stream/web'; import { ReadableStream } from 'node:stream/web'; +import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; +import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js'; +import { + type InterruptionEvent, + InterruptionEventType, + type InterruptionSentinel, +} from '../inference/interruption/types.js'; import { type ChatContext } from '../llm/chat_context.js'; import { log } from '../log.js'; import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js'; import { IdentityTransform } from '../stream/identity_transform.js'; import { mergeReadableStreams } from '../stream/merge_readable_streams.js'; +import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; -import { Task, delay } from '../utils.js'; +import { Task, delay, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; import type { STTNode } from './io.js'; @@ -32,6 +40,7 @@ export interface PreemptiveGenerationInfo { } export interface RecognitionHooks { + onInterruption: (ev: InterruptionEvent) => void; onStartOfSpeech: (ev: VADEvent) => void; onVADInferenceDone: (ev: VADEvent) => void; onEndOfSpeech: (ev: VADEvent) => void; @@ -54,22 +63,28 @@ export interface AudioRecognitionOptions { stt?: STTNode; vad?: 
VAD; turnDetector?: _TurnDetector; - turnDetectionMode?: Exclude; + turnDetectionMode?: TurnDetectionMode; + interruptionDetection?: AdaptiveInterruptionDetector; minEndpointingDelay: number; maxEndpointingDelay: number; rootSpanContext?: Context; + sttModel?: string; + sttProvider?: string; } +// TODO add ability to update stt/vad/interruption-detection export class AudioRecognition { private hooks: RecognitionHooks; private stt?: STTNode; private vad?: VAD; private turnDetector?: _TurnDetector; - private turnDetectionMode?: Exclude; + private turnDetectionMode?: TurnDetectionMode; private minEndpointingDelay: number; private maxEndpointingDelay: number; private lastLanguage?: string; private rootSpanContext?: Context; + private sttModel?: string; + private sttProvider?: string; private deferredInputStream: DeferredReadableStream; private logger = log(); @@ -96,6 +111,16 @@ export class AudioRecognition { private commitUserTurnTask?: Task; private vadTask?: Task; private sttTask?: Task; + private interruptionTask?: Task; + + // interruption detection + private interruptionDetection?: AdaptiveInterruptionDetector; + private inputStartedAt?: number; + private ignoreUserTranscriptUntil?: number; + private transcriptBuffer: SpeechEvent[]; + private isInterruptionEnabled: boolean; + private isAgentSpeaking: boolean; + private interruptionStreamChannel: StreamChannel; constructor(opts: AudioRecognitionOptions) { this.hooks = opts.recognitionHooks; @@ -107,12 +132,22 @@ export class AudioRecognition { this.maxEndpointingDelay = opts.maxEndpointingDelay; this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; + this.sttModel = opts.sttModel; + this.sttProvider = opts.sttProvider; this.deferredInputStream = new DeferredReadableStream(); - const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee(); + const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee(); + const [inputStream, sttInputStream] = teedInput.tee(); this.vadInputStream = vadInputStream; this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable); this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter(); + + this.interruptionDetection = opts.interruptionDetection; + this.transcriptBuffer = []; + this.isInterruptionEnabled = !!(opts.interruptionDetection || opts.vad); + this.isAgentSpeaking = false; + this.interruptionStreamChannel = createStreamChannel(); + this.interruptionStreamChannel.addStreamInput(inputStream); } /** @@ -135,6 +170,184 @@ export class AudioRecognition { this.sttTask.result.catch((err) => { this.logger.error(`Error running STT task: ${err}`); }); + + this.interruptionTask = Task.from(({ signal }) => + this.createInterruptionTask(this.interruptionDetection, signal), + ); + this.interruptionTask.result.catch((err) => { + this.logger.error(`Error running interruption task: ${err}`); + }); + } + + async stop() { + await this.sttTask?.cancelAndWait(); + await this.vadTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); + } + + async onStartOfAgentSpeech() { + this.isAgentSpeaking = true; + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); + } + + async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (!this.isInterruptionEnabled) { + this.isAgentSpeaking = false; + return; + } + + const inputOpen = await this.trySendInterruptionSentinel( + InterruptionStreamSentinel.agentSpeechEnded(), + ); + if (!inputOpen) { + this.isAgentSpeaking = false; + 
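// the interruption channel rejected the sentinel (closed or write failed); skip the transcript-release bookkeeping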
return; + } + + if (this.isAgentSpeaking) { + if (this.ignoreUserTranscriptUntil === undefined) { + this.onEndOfOverlapSpeech(); + } + this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil + ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil) + : ignoreUserTranscriptUntil; + + // flush held transcripts if possible + await this.flushHeldTranscripts(); + } + this.isAgentSpeaking = false; + } + + /** Start interruption inference when agent is speaking and overlap speech starts. */ + async onStartOfOverlapSpeech(speechDurationInS: number, userSpeakingSpan?: Span) { + if (this.isAgentSpeaking) { + this.trySendInterruptionSentinel( + InterruptionStreamSentinel.overlapSpeechStarted(speechDurationInS, userSpeakingSpan), + ); + } + } + + async onEndOfOverlapSpeech(userSpeakingSpan?: Span) { + if (!this.isInterruptionEnabled) { + return; + } + if (userSpeakingSpan && userSpeakingSpan.isRecording()) { + userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false'); + } + + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded()); + } + + /** + * Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp. + * If the event has no timestamps, we assume it is the same as the next valid event. + */ + private async flushHeldTranscripts() { + if ( + !this.isInterruptionEnabled || + this.ignoreUserTranscriptUntil === undefined || + this.transcriptBuffer.length === 0 + ) { + return; + } + + if (!this.inputStartedAt) { + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + return; + } + + let emitFromIndex: number | null = null; + let shouldFlush = false; + + for (let i = 0; i < this.transcriptBuffer.length; i++) { + const ev = this.transcriptBuffer[i]; + if (!ev || !ev.alternatives || ev.alternatives.length === 0) { + emitFromIndex = Math.min(emitFromIndex ?? i, i); + continue; + } + const firstAlternative = ev.alternatives[0]; + if ( + firstAlternative.startTime === firstAlternative.endTime && + firstAlternative.startTime === 0 + ) { + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + return; + } + + if ( + firstAlternative.endTime > 0 && + firstAlternative.endTime + this.inputStartedAt < this.ignoreUserTranscriptUntil + ) { + emitFromIndex = null; + } else { + emitFromIndex = Math.min(emitFromIndex ?? i, i); + shouldFlush = true; + break; + } + } + + const eventsToEmit = + emitFromIndex !== null && shouldFlush ? 
this.transcriptBuffer.slice(emitFromIndex) : []; + + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + + for (const event of eventsToEmit) { + this.logger.trace( + { + event: event.type, + }, + 're-emitting held user transcript', + ); + this.onSTTEvent(event); + } + } + + private shouldHoldSttEvent(ev: SpeechEvent): boolean { + if (!this.isInterruptionEnabled) { + return false; + } + if (this.isAgentSpeaking) { + return true; + } + + if (this.ignoreUserTranscriptUntil === undefined) { + return false; + } + // sentinel events are always held until we have something concrete to release them + if (!ev.alternatives || ev.alternatives.length === 0) { + return true; + } + + const alternative = ev.alternatives[0]; + + if ( + this.inputStartedAt && + alternative.startTime !== alternative.endTime && + alternative.endTime > 0 && + alternative.endTime + this.inputStartedAt < this.ignoreUserTranscriptUntil + ) { + return true; + } + return false; + } + + private async trySendInterruptionSentinel( + frame: AudioFrame | InterruptionSentinel, + ): Promise { + if (this.isInterruptionEnabled && !this.interruptionStreamChannel.closed) { + try { + await this.interruptionStreamChannel.write(frame); + return true; + } catch (e: unknown) { + this.logger.warn( + `could not forward interruption sentinel: ${e instanceof Error ? e.message : String(e)}`, + ); + } + } + return false; } private async onSTTEvent(ev: SpeechEvent) { @@ -159,6 +372,25 @@ export class AudioRecognition { return; } + // handle interruption detection + // - hold the event until the ignore_user_transcript_until expires + // - release only relevant events + // - allow RECOGNITION_USAGE to pass through immediately + + if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) { + if (this.shouldHoldSttEvent(ev)) { + this.logger.trace( + { event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil }, + 'holding STT event until ignore_user_transcript_until expires', + ); + this.transcriptBuffer.push(ev); + return; + } else { + await this.flushHeldTranscripts(); + // no return here to allow the new event to be processed normally + } + } + switch (ev.type) { case SpeechEventType.FINAL_TRANSCRIPT: this.hooks.onFinalTranscript(ev); @@ -329,6 +561,12 @@ export class AudioRecognition { } } + private onInterruptionEvent(ev: InterruptionEvent) { + if (ev.type === InterruptionEventType.INTERRUPTION) { + this.hooks.onInterruption(ev); + } + } + private runEOUDetection(chatCtx: ChatContext) { this.logger.debug( { @@ -572,6 +810,16 @@ export class AudioRecognition { context: this.rootSpanContext, startTime, }); + + if (this.sttModel) { + this.userTurnSpan.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, this.sttModel); + } + if (this.sttProvider) { + this.userTurnSpan.setAttribute( + traceTypes.ATTR_GEN_AI_PROVIDER_NAME, + this.sttProvider, + ); + } } // Capture sample rate from the first VAD event if not already set @@ -616,6 +864,70 @@ export class AudioRecognition { } } + private async createInterruptionTask( + interruptionDetection: AdaptiveInterruptionDetector | undefined, + signal: AbortSignal, + ) { + if (!interruptionDetection) return; + + const stream = interruptionDetection.createStream(); + const inputReader = this.interruptionStreamChannel.stream().getReader(); + + const cleanup = async () => { + try { + signal.removeEventListener('abort', abortHandler); + eventReader.releaseLock(); + await stream.close(); + } catch (e) { + this.logger.debug('createInterruptionTask: error during 
abort handler:', e); + } + }; + + // Forward input frames/sentinels to the interruption stream + const forwardTask = (async () => { + try { + const abortPromise = waitForAbort(signal); + while (!signal.aborted) { + const res = await Promise.race([inputReader.read(), abortPromise]); + if (!res) break; + const { value, done } = res; + if (done) break; + this.inputStartedAt = Date.now(); + await stream.pushFrame(value); + } + } finally { + inputReader.releaseLock(); + } + })(); + + // Read output events from the interruption stream + const eventReader = stream.stream().getReader(); + const abortHandler = async () => { + await cleanup(); + }; + signal.addEventListener('abort', abortHandler); + + try { + const abortPromise = waitForAbort(signal); + + while (!signal.aborted) { + const res = await Promise.race([eventReader.read(), abortPromise]); + if (!res) break; + const { done, value: ev } = res; + if (done) break; + this.onInterruptionEvent(ev); + } + } catch (e) { + if (!signal.aborted) { + this.logger.error(e, 'Error in interruption task'); + } + } finally { + await cleanup(); + await forwardTask; + this.logger.debug('Interruption task closed'); + } + } + setInputAudioStream(audioStream: ReadableStream) { this.deferredInputStream.setSource(audioStream); } @@ -688,6 +1000,8 @@ export class AudioRecognition { await this.sttTask?.cancelAndWait(); await this.vadTask?.cancelAndWait(); await this.bounceEOUTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); + await this.interruptionStreamChannel.close(); } private _endUserTurnSpan({ @@ -714,6 +1028,12 @@ export class AudioRecognition { } private get vadBaseTurnDetection() { - return ['vad', undefined].includes(this.turnDetectionMode); + if (typeof this.turnDetectionMode === 'object') { + return false; + } + + if (this.turnDetectionMode === undefined || this.turnDetectionMode === 'vad') { + return true; + } } } diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts index 7d8ff325f..b184ff85a 100644 --- a/agents/src/voice/events.ts +++ b/agents/src/voice/events.ts @@ -25,6 +25,8 @@ export enum AgentSessionEventTypes { FunctionToolsExecuted = 'function_tools_executed', MetricsCollected = 'metrics_collected', SpeechCreated = 'speech_created', + UserInterruptionDetected = 'user_interruption_detected', + UserNonInterruptionDetected = 'user_non_interruption_detected', Error = 'error', Close = 'close', } diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 06867c43d..fa50af22f 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -36,6 +36,7 @@ export class _LLMGenerationData { generatedText: string = ''; generatedToolCalls: FunctionCall[]; id: string; + ttft?: number; constructor( public readonly textStream: ReadableStream, @@ -386,6 +387,8 @@ export function performLLMInference( toolCtx: ToolContext, modelSettings: ModelSettings, controller: AbortController, + model?: string, + provider?: string, ): [Task, _LLMGenerationData] { const textStream = new IdentityTransform(); const toolCallStream = new IdentityTransform(); @@ -401,8 +404,17 @@ export function performLLMInference( ); span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx))); + if (model) { + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let llmStreamReader: ReadableStreamDefaultReader | null = null; let llmStream: ReadableStream | null = null; + const 
startTime = performance.now() / 1000; // Convert to seconds + let firstTokenReceived = false; try { llmStream = await node(chatCtx, toolCtx, modelSettings); @@ -425,6 +437,11 @@ export function performLLMInference( const { done, value: chunk } = result; if (done) break; + if (!firstTokenReceived) { + firstTokenReceived = true; + data.ttft = performance.now() / 1000 - startTime; + } + if (typeof chunk === 'string') { data.generatedText += chunk; await textWriter.write(chunk); @@ -463,6 +480,9 @@ export function performLLMInference( } span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, data.generatedText); + if (data.ttft !== undefined) { + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFT, data.ttft); + } } catch (error) { if (error instanceof DOMException && error.name === 'AbortError') { // Abort signal was triggered, handle gracefully @@ -497,14 +517,25 @@ export function performTTSInference( text: ReadableStream, modelSettings: ModelSettings, controller: AbortController, + model?: string, + provider?: string, ): [Task, ReadableStream] { const audioStream = new IdentityTransform(); const outputWriter = audioStream.writable.getWriter(); const audioOutputStream = audioStream.readable; - const _performTTSInferenceImpl = async (signal: AbortSignal) => { + const _performTTSInferenceImpl = async (signal: AbortSignal, span: Span) => { + if (model) { + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let ttsStreamReader: ReadableStreamDefaultReader | null = null; let ttsStream: ReadableStream | null = null; + const startTime = performance.now() / 1000; // Convert to seconds + let firstByteReceived = false; try { ttsStream = await node(text, modelSettings); @@ -522,6 +553,13 @@ export function performTTSInference( if (done) { break; } + + if (!firstByteReceived) { + firstByteReceived = true; + const ttfb = performance.now() / 1000 - startTime; + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFB, ttfb); + } + await outputWriter.write(chunk); } } catch (error) { @@ -541,7 +579,7 @@ export function performTTSInference( const currentContext = otelContext.active(); const inferenceTask = async (signal: AbortSignal) => - tracer.startActiveSpan(async () => _performTTSInferenceImpl(signal), { + tracer.startActiveSpan(async (span) => _performTTSInferenceImpl(signal, span), { name: 'tts_node', context: currentContext, }); @@ -608,7 +646,6 @@ export function performTextForwarding( export interface _AudioOut { audio: Array; - /** Future that will be set with the timestamp of the first frame's capture */ firstFrameFut: Future; } @@ -696,7 +733,6 @@ export function performAudioForwarding( ]; } -// function_tool span is already implemented in tracableToolExecution below (line ~796) export function performToolExecutions({ session, speechHandle, diff --git a/agents/src/voice/report.ts b/agents/src/voice/report.ts index 49701a696..b18c1e795 100644 --- a/agents/src/voice/report.ts +++ b/agents/src/voice/report.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext } from '../llm/chat_context.js'; +import { type ModelUsage, filterZeroValues } from '../metrics/model_usage.js'; import type { VoiceOptions } from './agent_session.js'; import type { AgentEvent } from './events.js'; @@ -23,6 +24,8 @@ export interface SessionReport { audioRecordingStartedAt?: number; /** Duration of the session in milliseconds */ duration?: number; + /** Usage summaries for the session, one per 
model/provider combination */ + modelUsage?: ModelUsage[]; } export interface SessionReportOptions { @@ -41,6 +44,8 @@ export interface SessionReportOptions { audioRecordingPath?: string; /** Timestamp when the audio recording started (milliseconds) */ audioRecordingStartedAt?: number; + /** Usage summaries for the session, one per model/provider combination */ + modelUsage?: ModelUsage[]; } export function createSessionReport(opts: SessionReportOptions): SessionReport { @@ -61,6 +66,7 @@ export function createSessionReport(opts: SessionReportOptions): SessionReport { audioRecordingStartedAt, duration: audioRecordingStartedAt !== undefined ? timestamp - audioRecordingStartedAt : undefined, + modelUsage: opts.modelUsage, }; } @@ -96,5 +102,6 @@ export function sessionReportToJSON(report: SessionReport): Record; + /** + * Configuration for interruption handling. + */ + interruption: Partial; +} + +export interface InternalTurnHandlingConfig extends TurnHandlingConfig { + endpointing: EndpointingConfig; + interruption: InterruptionConfig; +} + +export const defaultTurnHandlingConfig: InternalTurnHandlingConfig = { + turnDetection: undefined, + interruption: defaultInterruptionConfig, + endpointing: defaultEndpointingConfig, +}; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts new file mode 100644 index 000000000..1b0d1381c --- /dev/null +++ b/agents/src/voice/turn_config/utils.test.ts @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { beforeAll, describe, expect, it } from 'vitest'; +import { initializeLogger } from '../../log.js'; +import { defaultEndpointingConfig } from './endpointing.js'; +import { defaultInterruptionConfig } from './interruption.js'; +import { defaultTurnHandlingConfig } from './turn_handling.js'; +import { migrateLegacyOptions } from './utils.js'; + +beforeAll(() => { + initializeLogger({ pretty: true, level: 'info' }); +}); + +describe('migrateLegacyOptions', () => { + it('should return all defaults when no options are provided', () => { + const result = migrateLegacyOptions({}); + + expect(result.options.turnHandling).toEqual({ + turnDetection: defaultTurnHandlingConfig.turnDetection, + endpointing: defaultEndpointingConfig, + interruption: defaultInterruptionConfig, + }); + expect(result.options.maxToolSteps).toBe(3); + expect(result.options.preemptiveGeneration).toBe(false); + expect(result.options.userAwayTimeout).toBe(15.0); + }); + + it('should migrate legacy flat fields into nested turnHandling config', () => { + const result = migrateLegacyOptions({ + voiceOptions: { + minInterruptionDuration: 1.0, + minInterruptionWords: 3, + discardAudioIfUninterruptible: false, + minEndpointingDelay: 0.8, + maxEndpointingDelay: 5.0, + }, + }); + + expect(result.options.turnHandling.interruption!.minDuration).toBe(1.0); + expect(result.options.turnHandling.interruption!.minWords).toBe(3); + expect(result.options.turnHandling.interruption!.discardAudioIfUninterruptible).toBe(false); + expect(result.options.turnHandling.endpointing!.minDelay).toBe(0.8); + expect(result.options.turnHandling.endpointing!.maxDelay).toBe(5.0); + }); + + it('should set interruption.mode to false when allowInterruptions is false', () => { + const result = migrateLegacyOptions({ + options: { + allowInterruptions: false, + }, + }); + + expect(result.options.turnHandling.interruption!.mode).toBe(false); + }); + + it('should give options precedence over voiceOptions when both are 
provided', () => { + const result = migrateLegacyOptions({ + voiceOptions: { + minInterruptionDuration: 1.0, + maxEndpointingDelay: 5.0, + maxToolSteps: 10, + }, + options: { + minInterruptionDuration: 2.0, + maxEndpointingDelay: 8.0, + maxToolSteps: 5, + }, + }); + + expect(result.options.turnHandling.interruption!.minDuration).toBe(2.0); + expect(result.options.turnHandling.endpointing!.maxDelay).toBe(8.0); + expect(result.options.maxToolSteps).toBe(5); + }); + + it('should let explicit turnHandling override legacy flat fields', () => { + const result = migrateLegacyOptions({ + options: { + minInterruptionDuration: 1.0, + minEndpointingDelay: 0.8, + turnHandling: { + interruption: { minDuration: 3.0 }, + endpointing: { minDelay: 2.0 }, + }, + }, + }); + + expect(result.options.turnHandling.interruption!.minDuration).toBe(3.0); + expect(result.options.turnHandling.endpointing!.minDelay).toBe(2.0); + }); + + it('should preserve top-level turnDetection in the result', () => { + const result = migrateLegacyOptions({ + turnDetection: 'vad', + }); + + expect(result.turnDetection).toBe('vad'); + expect(result.options.turnHandling.turnDetection).toBe('vad'); + }); +}); diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts new file mode 100644 index 000000000..c8b8a0d27 --- /dev/null +++ b/agents/src/voice/turn_config/utils.ts @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { log } from '../../log.js'; +import { + type AgentSessionOptions, + type InternalSessionOptions, + defaultSessionOptions, +} from '../agent_session.js'; +import { defaultEndpointingConfig } from './endpointing.js'; +import { defaultInterruptionConfig } from './interruption.js'; +import { type TurnHandlingConfig, defaultTurnHandlingConfig } from './turn_handling.js'; + +export function migrateLegacyOptions( + legacyOptions: AgentSessionOptions, +): AgentSessionOptions & { options: InternalSessionOptions } { + const logger = log(); + const { voiceOptions, turnDetection, options: sessionOptions, ...rest } = legacyOptions; + + if (voiceOptions !== undefined && sessionOptions !== undefined) { + logger.warn( + 'Both voiceOptions and options have been supplied as part of the AgentSessionOptions, voiceOptions will be merged with options taking precedence', + ); + } + + // Preserve turnDetection before cloning since structuredClone converts class instances to plain objects + const originalTurnDetection = + sessionOptions?.turnHandling?.turnDetection ?? + voiceOptions?.turnHandling?.turnDetection ?? 
+ turnDetection; + + const mergedOptions = structuredClone({ ...voiceOptions, ...sessionOptions }); + + const turnHandling: TurnHandlingConfig = { + interruption: { + discardAudioIfUninterruptible: mergedOptions?.discardAudioIfUninterruptible, + minDuration: mergedOptions?.minInterruptionDuration, + minWords: mergedOptions?.minInterruptionWords, + }, + endpointing: { + minDelay: mergedOptions?.minEndpointingDelay, + maxDelay: mergedOptions?.maxEndpointingDelay, + }, + + ...mergedOptions.turnHandling, + // Restore original turnDetection after spread to preserve class instance with methods + // (structuredClone converts class instances to plain objects, losing prototype methods) + turnDetection: originalTurnDetection, + } as const; + + if (mergedOptions?.allowInterruptions === false) { + turnHandling.interruption.mode = false; + } + + const optionsWithDefaults = { + ...defaultSessionOptions, + ...mergedOptions, + turnHandling: mergeWithDefaults(turnHandling), + }; + + const newAgentSessionOptions: AgentSessionOptions & { + options: InternalSessionOptions; + } = { + ...rest, + options: optionsWithDefaults, + voiceOptions: optionsWithDefaults, + turnDetection: turnHandling.turnDetection, + }; + + return newAgentSessionOptions; +} + +/** Remove keys whose value is `undefined` so they don't shadow defaults when spread. */ +export function stripUndefined(obj: T): Partial { + return Object.fromEntries(Object.entries(obj).filter(([, v]) => v !== undefined)) as Partial; +} + +export function mergeWithDefaults(config: TurnHandlingConfig) { + return { + turnDetection: config.turnDetection ?? defaultTurnHandlingConfig.turnDetection, + endpointing: { ...defaultEndpointingConfig, ...stripUndefined(config.endpointing) }, + interruption: { ...defaultInterruptionConfig, ...stripUndefined(config.interruption) }, + } as const; +} diff --git a/examples/package.json b/examples/package.json index c9db0d91e..858259219 100644 --- a/examples/package.json +++ b/examples/package.json @@ -40,7 +40,6 @@ "@livekit/agents-plugin-silero": "workspace:*", "@livekit/agents-plugin-xai": "workspace:*", "@livekit/noise-cancellation-node": "^0.1.9", - "@livekit/plugins-ai-coustics": "0.1.7", "@livekit/rtc-node": "catalog:", "@opentelemetry/api": "^1.9.0", "@opentelemetry/api-logs": "^0.54.0", diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 91e549bff..99e0c7591 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -8,13 +8,13 @@ import { cli, defineAgent, llm, + log, metrics, voice, } from '@livekit/agents'; import * as livekit from '@livekit/agents-plugin-livekit'; import * as silero from '@livekit/agents-plugin-silero'; -// import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; -import * as aic from '@livekit/plugins-ai-coustics'; +import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -39,6 +39,8 @@ export default defineAgent({ }, }); + const logger = log(); + const session = new voice.AgentSession({ // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand // See all available models at https://docs.livekit.io/agents/models/stt/ @@ -55,12 +57,20 @@ export default defineAgent({ // VAD and turn detection are used to determine when the user is speaking and when the agent should respond // See more at https://docs.livekit.io/agents/build/turns vad: ctx.proc.userData.vad! 
as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), + // to use realtime model, replace the stt, llm, tts and vad with the following // llm: new openai.realtime.RealtimeModel(), - voiceOptions: { + options: { // allow the LLM to generate a response while waiting for the end of turn preemptiveGeneration: true, + turnHandling: { + turnDetection: new livekit.turnDetector.MultilingualModel(), + interruption: { + resumeFalseInterruption: true, + falseInterruptionTimeout: 1, + mode: 'adaptive', + }, + }, }, connOptions: { // Example of overriding the default connection options for the LLM/TTS/STT @@ -79,13 +89,19 @@ export default defineAgent({ usageCollector.collect(ev.metrics); }); + session.on(voice.AgentSessionEventTypes.UserInterruptionDetected, (ev) => { + logger.warn({ type: ev.type }, 'interruption detected'); + }); + + session.on(voice.AgentSessionEventTypes.UserNonInterruptionDetected, (ev) => { + logger.warn({ type: ev.type }, 'non interruption detected'); + }); + await session.start({ agent, room: ctx.room, inputOptions: { - noiseCancellation: aic.audioEnhancement(), - // or for krisp use - // noiseCancellation: BackgroundVoiceCancellation(), + noiseCancellation: BackgroundVoiceCancellation(), }, }); diff --git a/plugins/google/src/beta/realtime/realtime_api.ts b/plugins/google/src/beta/realtime/realtime_api.ts index 83d8a5aa1..4b547cd3d 100644 --- a/plugins/google/src/beta/realtime/realtime_api.ts +++ b/plugins/google/src/beta/realtime/realtime_api.ts @@ -774,6 +774,8 @@ export class RealtimeSession extends llm.RealtimeSession { onmessage: (message: types.LiveServerMessage) => { this.onReceiveMessage(session, message); }, + // onerror is called for network-level errors (connection refused, DNS failure, TLS errors). + // Application-level errors (e.g., invalid model name) come through onclose with error codes. 
onerror: (error: ErrorEvent) => { this.#logger.error('Gemini Live session error:', error); if (!this.sessionShouldClose.isSet) { @@ -781,7 +783,15 @@ export class RealtimeSession extends llm.RealtimeSession { } }, onclose: (event: CloseEvent) => { - this.#logger.debug('Gemini Live session closed:', event.code, event.reason); + // Surface WebSocket close errors to the user instead of silently swallowing them + // Close code 1000 = normal closure, anything else is an error + if (event.code !== 1000) { + const errorMsg = event.reason || `WebSocket closed with code ${event.code}`; + this.#logger.error(`Gemini Live session error: ${errorMsg}`); + this.emitError(new Error(errorMsg), false); + } else { + this.#logger.debug('Gemini Live session closed:', event.code, event.reason); + } this.markCurrentGenerationDone(); }, }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b3ee8479d..a7b882c2c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -169,6 +169,9 @@ importers: livekit-server-sdk: specifier: ^2.14.1 version: 2.14.1 + ofetch: + specifier: ^1.5.1 + version: 1.5.1 openai: specifier: ^6.8.1 version: 6.8.1(ws@8.18.3)(zod@3.25.76) @@ -275,9 +278,6 @@ importers: '@livekit/noise-cancellation-node': specifier: ^0.1.9 version: 0.1.9 - '@livekit/plugins-ai-coustics': - specifier: 0.1.7 - version: 0.1.7(@livekit/rtc-node@0.13.24) '@livekit/rtc-node': specifier: 'catalog:' version: 0.13.24 @@ -1930,12 +1930,6 @@ packages: cpu: [x64] os: [win32] - '@livekit/plugins-ai-coustics@0.1.7': - resolution: {integrity: sha512-jScAdBttVdazsXvzK8v9lQdcBNZGCNM67kldtdpuXdGaT2X+aLqz4dTwRqnSSio99GfobGz/MMZ5H+3KLdy/9A==} - engines: {node: '>= 18'} - peerDependencies: - '@livekit/rtc-node': '*' - '@livekit/protocol@1.43.0': resolution: {integrity: sha512-WCJ97fa4CBqPDh8pzdszOm/2xmelJ3Dx2vjKBlyb9BzmPQx1LjzVciP6uYFFMCMdrq2l1mjFQBXEz8Z20UCkyw==} @@ -2644,9 +2638,6 @@ packages: '@types/tapable@1.0.6': resolution: {integrity: sha512-W+bw9ds02rAQaMvaLYxAbJ6cvguW/iJXNT6lTssS1ps6QdrMKttqEAMEG/b5CR8TZl3/L7/lH0ZV5nNR1LXikA==} - '@types/unzipper@0.10.11': - resolution: {integrity: sha512-D25im2zjyMCcgL9ag6N46+wbtJBnXIr7SI4zHf9eJD2Dw2tEB5e+p5MYkrxKIVRscs5QV0EhtU9rgXSPx90oJg==} - '@types/ws@8.5.10': resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==} @@ -2793,72 +2784,6 @@ packages: '@vitest/utils@4.0.17': resolution: {integrity: sha512-RG6iy+IzQpa9SB8HAFHJ9Y+pTzI+h8553MrciN9eC6TFBErqrQaTas4vG+MVj8S4uKk8uTT2p0vgZPnTdxd96w==} - '@yuuang/ffi-rs-android-arm64@1.3.1': - resolution: {integrity: sha512-V4nmlXdOYZEa7GOxSExVG95SLp8FE0iTq2yKeN54UlfNMr3Sik+1Ff57LcCv7qYcn4TBqnBAt5rT3FAM6T6caQ==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [android] - - '@yuuang/ffi-rs-darwin-arm64@1.3.1': - resolution: {integrity: sha512-YlnTMIyzfW3mAULC5ZA774nzQfFlYXM0rrfq/8ZzWt+IMbYk55a++jrI+6JeKV+1EqlDS3TFBEFtjdBNG94KzQ==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [darwin] - - '@yuuang/ffi-rs-darwin-x64@1.3.1': - resolution: {integrity: sha512-sI3LpQQ34SX4nyOHc5yxA7FSqs9qPEUMqW/y/wWo9cuyPpaHMFsi/BeOVYsnC0syp3FrY7gzn6RnD6PlXCktXg==} - engines: {node: '>= 12'} - cpu: [x64] - os: [darwin] - - '@yuuang/ffi-rs-linux-arm-gnueabihf@1.3.1': - resolution: {integrity: sha512-1WkcGkJTlwh4ZA59htKI+RXhiL3oKiYwLv7PO8LUf6FuADK73s5GcXp67iakKu243uYu+qGYr4RHco4ySddYhQ==} - engines: {node: '>= 12'} - cpu: [arm] - os: [linux] - - '@yuuang/ffi-rs-linux-arm64-gnu@1.3.1': - resolution: {integrity: sha512-J2PwqviycZxaEVA0Bwv38LqGDGSB9A1DPN4iYginYJZSvTvKW8kh7Tis0HbZrX1YDKnY8hi3lt0N0tCTNPDH5Q==} - 
engines: {node: '>= 12'} - cpu: [arm64] - os: [linux] - - '@yuuang/ffi-rs-linux-arm64-musl@1.3.1': - resolution: {integrity: sha512-Hn1W1hBPssTaqikU1Bqp1XUdDdOgbnYVIOtR++LVx66hhrtjf/xrIUQOhTm+NmOFDG16JUKXe1skfM4gpaqYwg==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [linux] - - '@yuuang/ffi-rs-linux-x64-gnu@1.3.1': - resolution: {integrity: sha512-kW6e+oCYZPvpH2ppPsffA18e1aLowtmWTRjVlyHtY04g/nQDepQvDUkkcvInh9fW5jLna7PjHvktW1tVgYIj2A==} - engines: {node: '>= 12'} - cpu: [x64] - os: [linux] - - '@yuuang/ffi-rs-linux-x64-musl@1.3.1': - resolution: {integrity: sha512-HTwblAzruUS16nQPrez3ozvEHm1Xxh8J8w7rZYrpmAcNl1hzyOT8z/hY70M9Rt9fOqQ4Ovgor9qVy/U3ZJo0ZA==} - engines: {node: '>= 12'} - cpu: [x64] - os: [linux] - - '@yuuang/ffi-rs-win32-arm64-msvc@1.3.1': - resolution: {integrity: sha512-WeZkGl2BP1U4tRhEQH+FXLQS52N8obp74smK5AAGOfzPAT1pHkq6+dVkC1QCSIt7dHJs7SPtlnQw+5DkdZYlWA==} - engines: {node: '>= 12'} - cpu: [arm64] - os: [win32] - - '@yuuang/ffi-rs-win32-ia32-msvc@1.3.1': - resolution: {integrity: sha512-rNGgMeCH5mdeHiMiJgt7wWXovZ+FHEfXhU9p4zZBH4n8M1/QnEsRUwlapISPLpILSGpoYS6iBuq9/fUlZY8Mhg==} - engines: {node: '>= 12'} - cpu: [x64, ia32] - os: [win32] - - '@yuuang/ffi-rs-win32-x64-msvc@1.3.1': - resolution: {integrity: sha512-dr2LcLD2CXo2a7BktlOpV68QhayqiI112KxIJC9tBgQO/Dkdg4CPsdqmvzzLhFo64iC5RLl2BT7M5lJImrfUWw==} - engines: {node: '>= 12'} - cpu: [x64] - os: [win32] - abort-controller@3.0.0: resolution: {integrity: sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==} engines: {node: '>=6.5'} @@ -3019,19 +2944,9 @@ packages: resolution: {integrity: sha512-pbnl5XzGBdrFU/wT4jqmJVPn2B6UHPBOhzMQkY/SPUPB6QtUXtmBHBIwCbXJol93mOpGMnQyP/+BB19q04xj7g==} engines: {node: '>=4'} - big-integer@1.6.52: - resolution: {integrity: sha512-QxD8cf2eVqJOOz63z6JIN9BzvVs/dlySa5HGSBH5xtR8dPteIRQnBxxKqkNTiT6jbDTF6jAfrd4oMcND9RGbQg==} - engines: {node: '>=0.6'} - bignumber.js@9.3.1: resolution: {integrity: sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==} - binary@0.3.0: - resolution: {integrity: sha512-D4H1y5KYwpJgK8wk1Cue5LLPgmwHKYSChkbspQg5JtVuR5ulGckxfR62H3AE9UDkdMC8yyXlqYihuz3Aqg2XZg==} - - bluebird@3.4.7: - resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} - boolean@3.2.0: resolution: {integrity: sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==} deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. 
@@ -3053,17 +2968,9 @@ packages: buffer-equal-constant-time@1.0.1: resolution: {integrity: sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==} - buffer-indexof-polyfill@1.0.2: - resolution: {integrity: sha512-I7wzHwA3t1/lwXQh+A5PbNvJxgfo5r3xulgpYDB5zckTu/Z9oUK9biouBKQUjEqzaz3HnAT6TYoovmE+GqSf7A==} - engines: {node: '>=0.10'} - buffer@6.0.3: resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} - buffers@0.1.1: - resolution: {integrity: sha512-9q/rDEGSb/Qsvv2qvzIzdluL5k7AaJOTrw23z9reQthrbF7is4CtlT0DXyO1oei2DCp4uojjzQ7igaSHp1kAEQ==} - engines: {node: '>=0.2.0'} - builtin-modules@3.3.0: resolution: {integrity: sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==} engines: {node: '>=6'} @@ -3113,9 +3020,6 @@ packages: resolution: {integrity: sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==} engines: {node: '>=18'} - chainsaw@0.1.0: - resolution: {integrity: sha512-75kWfWt6MEKNC8xYXIdRpDehRYY/tNSgwKaJq+dbbDcxORuVrrQ+SEHoWsniVn9XPYfP4gmdWIeDk/4YNp1rNQ==} - chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -3191,9 +3095,6 @@ packages: resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==} engines: {node: ^14.18.0 || >=16.10.0} - core-util-is@1.0.3: - resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} - cross-spawn@7.0.3: resolution: {integrity: sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==} engines: {node: '>= 8'} @@ -3289,6 +3190,9 @@ packages: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} + destr@2.0.5: + resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -3332,9 +3236,6 @@ packages: resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==} engines: {node: '>= 0.4'} - duplexer2@0.1.4: - resolution: {integrity: sha512-asLFVfWWtJ90ZyOUHMqk7/S2w2guQKxUI2itj3d92ADHhxUSbCMGi1f1cBcJ7xM1To+pE/Khbwo1yuNbMEPKeA==} - eastasianwidth@0.2.0: resolution: {integrity: sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==} @@ -3701,9 +3602,6 @@ packages: resolution: {integrity: sha512-7yAQpD2UMJzLi1Dqv7qFYnPbaPx7ZfFK6PiIxQ4PfkGPyNyl2Ugx+a/umUonmKqjhM4DnfbMvdX6otXq83soQQ==} engines: {node: ^12.20 || >= 14.13} - ffi-rs@1.3.1: - resolution: {integrity: sha512-ZyNXL9fnclnZV+waQmWB9JrfbIEyxQa1OWtMrHOrAgcC04PgP5hBMG5TdhVN8N4uT/eul8zCFMVnJUukAFFlXA==} - file-entry-cache@6.0.1: resolution: {integrity: sha512-7Gps/XWymbLk2QLYK4NzpMOrYjMhdIxXuIvy2QBsLE6ljuodKvdkWs/cpyJJ3CVIVpH0Oi1Hvg1ovbMzLdFBBg==} engines: {node: ^10.12.0 || >=12.0.0} @@ -3770,11 +3668,6 @@ packages: engines: {node: ^8.16.0 || ^10.6.0 || >=11.0.0} os: [darwin] - fstream@1.0.12: - resolution: {integrity: sha512-WvJ193OHa0GHPEL+AycEJgxvBEwyfRkN1vhjca23OaPVMCaLCXTd5qAu82AjTcgP1UJmytkOKb63Ypde7raDIg==} - engines: {node: '>=0.6'} - 
deprecated: This package is no longer supported. - function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} @@ -3816,9 +3709,6 @@ packages: resolution: {integrity: sha512-g0QYk1dZBxGwk+Ngc+ltRH2IBp2f7zBkBMBJZCDerh6EhlhSR6+9irMCuT/09zD6qkarHUSn529sK/yL4S27mg==} engines: {node: '>= 0.4'} - get-symbol-from-current-process-h@1.0.2: - resolution: {integrity: sha512-syloC6fsCt62ELLrr1VKBM1ggOpMdetX9hTrdW77UQdcApPHLmf7CI7OKcN1c9kYuNxKcDe4iJ4FY9sX3aw2xw==} - get-tsconfig@4.7.5: resolution: {integrity: sha512-ZCuZCnlqNzjb4QprAzXKdpp/gh6KTxSJuw3IBsPnV/7fV4NxC9ckB+vPTt8w7fJA0TaSD7c55BR47JD6MEDyDw==} @@ -4115,9 +4005,6 @@ packages: resolution: {integrity: sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==} engines: {node: '>=0.10.0'} - isarray@1.0.0: - resolution: {integrity: sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==} - isarray@2.0.5: resolution: {integrity: sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==} @@ -4244,9 +4131,6 @@ packages: lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} - listenercount@1.0.1: - resolution: {integrity: sha512-3mk/Zag0+IJxeDrxSgaDPy4zZ3w05PRZeJNnlWhzFz5OkX49J4krc+A8X2d2M69vGMBEX0uyl8M+W+8gH+kBqQ==} - livekit-server-sdk@2.13.3: resolution: {integrity: sha512-ItSQ2gE1oz/Ev9mfBRdAw+P05rt/BaYRkldggKz0+3rh/Yt0ag0BLID3VrgCVFVRAQ2YEJKcJJyj5p4epIJ8QA==} engines: {node: '>=18'} @@ -4404,10 +4288,6 @@ packages: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} - mkdirp@0.5.6: - resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} - hasBin: true - mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -4445,14 +4325,14 @@ packages: natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} - node-addon-api@3.2.1: - resolution: {integrity: sha512-mmcei9JghVNDYydghQmeDX8KoAm0FAiYyIcUt/N4nhyAipB17pllZQDOJD2fotxABnt4Mdz+dKTO7eftLg4d0A==} - node-domexception@1.0.0: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: + resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4466,10 +4346,6 @@ packages: resolution: {integrity: sha512-dRB78srN/l6gqWulah9SrxeYnxeddIG30+GOqK/9OlLVyLg3HPnr6SqOWTWOXKRwC2eGYCkZ59NNuSgvSrpgOA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} - node-gyp-build@4.8.4: - resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==} - hasBin: true - npm-run-path@5.3.0: resolution: {integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==} engines: {node: ^12.20.0 || 
^14.13.1 || >=16.0.0} @@ -4512,6 +4388,9 @@ packages: obug@2.1.1: resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + ofetch@1.5.1: + resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} + on-exit-leak-free@2.1.2: resolution: {integrity: sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==} engines: {node: '>=14.0.0'} @@ -4753,9 +4632,6 @@ packages: resolution: {integrity: sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} - process-nextick-args@2.0.1: - resolution: {integrity: sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==} - process-warning@3.0.0: resolution: {integrity: sha512-mqn0kFRl0EoqhnL0GQ0veqFHyIN1yig9RHh/InzORTUiZHFRAur+aMtRkELNwGs9aNwKS6tg/An4NYBPGwvtzQ==} @@ -4803,9 +4679,6 @@ packages: resolution: {integrity: sha512-VIMnQi/Z4HT2Fxuwg5KrY174U1VdUIASQVWXXyqtNRtxSr9IYkn1rsI6Tb6HsrHCmB7gVpNwX6JxPTHcH6IoTA==} engines: {node: '>=6'} - readable-stream@2.3.8: - resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==} - readable-stream@4.5.2: resolution: {integrity: sha512-yjavECdqeZ3GLXNgRXgeQEdz9fvDDkNKyHnbHRFtOr7/LcfgBcmct7t/ET+HaCTqfh06OzoAxrkN/IfjJBVe+g==} engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} @@ -4818,10 +4691,6 @@ packages: resolution: {integrity: sha512-57frrGM/OCTLqLOAh0mhVA9VBMHd+9U7Zb2THMGdBUoZVOtGbJzjxsYGDJ3A9AYYCP4hn6y1TVbaOfzWtm5GFg==} engines: {node: '>= 12.13.0'} - ref-napi@3.0.3: - resolution: {integrity: sha512-LiMq/XDGcgodTYOMppikEtJelWsKQERbLQsYm0IOOnzhwE9xYZC7x8txNnFC9wJNOkPferQI4vD4ZkC0mDyrOA==} - engines: {node: '>= 10.0'} - reflect.getprototypeof@1.0.6: resolution: {integrity: sha512-fmfw4XgoDke3kdI6h4xcUz1dG8uaiv5q9gcEwLS4Pnth2kxT+GZ7YehS1JTMGBQmtV7Y4GFGbs2re2NqhdozUg==} engines: {node: '>= 0.4'} @@ -4863,11 +4732,6 @@ packages: resolution: {integrity: sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} - rimraf@2.7.1: - resolution: {integrity: sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w==} - deprecated: Rimraf versions prior to v4 are no longer supported - hasBin: true - rimraf@3.0.2: resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} deprecated: Rimraf versions prior to v4 are no longer supported @@ -4903,9 +4767,6 @@ packages: resolution: {integrity: sha512-vj6RsCsWBCf19jIeHEfkRMw8DPiBb+DMXklQ/1SGDHOMlHdPUkZXFQ2YdplS23zESTijAcurb1aSgJA3AgMu1Q==} engines: {node: '>=0.4'} - safe-buffer@5.1.2: - resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==} - safe-buffer@5.2.1: resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==} @@ -4967,9 +4828,6 @@ packages: resolution: {integrity: sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==} engines: {node: '>= 0.4'} - setimmediate@1.0.5: - resolution: {integrity: sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==} - sharp@0.34.5: resolution: {integrity: 
sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} @@ -5082,9 +4940,6 @@ packages: resolution: {integrity: sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==} engines: {node: '>= 0.4'} - string_decoder@1.1.1: - resolution: {integrity: sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==} - string_decoder@1.3.0: resolution: {integrity: sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==} @@ -5231,9 +5086,6 @@ packages: tr46@1.0.1: resolution: {integrity: sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==} - traverse@0.3.9: - resolution: {integrity: sha512-iawgk0hLP3SxGKDfnDJf8wTz4p2qImnyihM5Hh/sGvQ3K37dPi/w8sRhdNIxYA1TwFwc5mDhIJq+O0RsvXBKdQ==} - tree-kill@1.2.2: resolution: {integrity: sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==} hasBin: true @@ -5385,29 +5237,22 @@ packages: ufo@1.5.3: resolution: {integrity: sha512-Y7HYmWaFwPUmkoQCUIAYpKqkOf+SbVj/2fJJZ4RJMCfZp0rTGwRbzQD+HghfnhKOjL9E01okqz+ncJskGYfBNw==} + ufo@1.6.3: + resolution: {integrity: sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==} + unbox-primitive@1.0.2: resolution: {integrity: sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==} undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} - uniffi-bindgen-react-native@0.29.3-1: - resolution: {integrity: sha512-o6gXZsAh55yuvhwF2WSFdIHV4phyfWcCmg4DuyfJWJ7CvUz1UcIz2S4u9SmXAz1jsuqvu6Xc9hexrRBB0a5osg==} - hasBin: true - universalify@0.1.2: resolution: {integrity: sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==} engines: {node: '>= 4.0.0'} - unzipper@0.10.11: - resolution: {integrity: sha512-+BrAq2oFqWod5IESRjL3S8baohbevGcVA+teAIOYWM3pDVdseogqbzhhvvmiyQrUNKFUnDMtELW3X8ykbyDCJw==} - uri-js@4.4.1: resolution: {integrity: sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==} - util-deprecate@1.0.2: - resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - uuid@11.1.0: resolution: {integrity: sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==} hasBin: true @@ -6544,21 +6389,6 @@ snapshots: '@livekit/noise-cancellation-win32-x64@0.1.9': optional: true - '@livekit/plugins-ai-coustics@0.1.7(@livekit/rtc-node@0.13.24)': - dependencies: - '@livekit/rtc-node': 0.13.24 - '@types/unzipper': 0.10.11 - ffi-rs: 1.3.1 - node-fetch: 3.3.2 - pino: 9.6.0 - pino-pretty: 13.0.0 - ref-napi: 3.0.3 - tsx: 4.21.0 - uniffi-bindgen-react-native: 0.29.3-1 - unzipper: 0.10.11 - transitivePeerDependencies: - - supports-color - '@livekit/protocol@1.43.0': dependencies: '@bufbuild/protobuf': 1.10.1 @@ -7263,10 +7093,6 @@ snapshots: '@types/tapable@1.0.6': {} - '@types/unzipper@0.10.11': - dependencies: - '@types/node': 22.19.1 - '@types/ws@8.5.10': dependencies: '@types/node': 22.19.1 @@ -7482,39 +7308,6 @@ snapshots: '@vitest/pretty-format': 4.0.17 tinyrainbow: 3.0.3 - '@yuuang/ffi-rs-android-arm64@1.3.1': - optional: true - - '@yuuang/ffi-rs-darwin-arm64@1.3.1': - optional: true - - '@yuuang/ffi-rs-darwin-x64@1.3.1': - 
optional: true - - '@yuuang/ffi-rs-linux-arm-gnueabihf@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-arm64-gnu@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-arm64-musl@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-x64-gnu@1.3.1': - optional: true - - '@yuuang/ffi-rs-linux-x64-musl@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-arm64-msvc@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-ia32-msvc@1.3.1': - optional: true - - '@yuuang/ffi-rs-win32-x64-msvc@1.3.1': - optional: true - abort-controller@3.0.0: dependencies: event-target-shim: 5.0.1 @@ -7684,17 +7477,8 @@ snapshots: dependencies: is-windows: 1.0.2 - big-integer@1.6.52: {} - bignumber.js@9.3.1: {} - binary@0.3.0: - dependencies: - buffers: 0.1.1 - chainsaw: 0.1.0 - - bluebird@3.4.7: {} - boolean@3.2.0: {} brace-expansion@1.1.11: @@ -7716,15 +7500,11 @@ snapshots: buffer-equal-constant-time@1.0.1: {} - buffer-indexof-polyfill@1.0.2: {} - buffer@6.0.3: dependencies: base64-js: 1.5.1 ieee754: 1.2.1 - buffers@0.1.1: {} - builtin-modules@3.3.0: {} builtins@5.1.0: @@ -7782,10 +7562,6 @@ snapshots: chai@6.2.2: {} - chainsaw@0.1.0: - dependencies: - traverse: 0.3.9 - chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7846,8 +7622,6 @@ snapshots: consola@3.4.2: {} - core-util-is@1.0.3: {} - cross-spawn@7.0.3: dependencies: path-key: 3.1.1 @@ -7926,6 +7700,8 @@ snapshots: dequal@2.0.3: {} + destr@2.0.5: {} + detect-indent@6.1.0: {} detect-libc@2.1.2: {} @@ -7958,10 +7734,6 @@ snapshots: es-errors: 1.3.0 gopd: 1.2.0 - duplexer2@0.1.4: - dependencies: - readable-stream: 2.3.8 - eastasianwidth@0.2.0: {} ecdsa-sig-formatter@1.0.11: @@ -8543,20 +8315,6 @@ snapshots: node-domexception: 1.0.0 web-streams-polyfill: 3.3.3 - ffi-rs@1.3.1: - optionalDependencies: - '@yuuang/ffi-rs-android-arm64': 1.3.1 - '@yuuang/ffi-rs-darwin-arm64': 1.3.1 - '@yuuang/ffi-rs-darwin-x64': 1.3.1 - '@yuuang/ffi-rs-linux-arm-gnueabihf': 1.3.1 - '@yuuang/ffi-rs-linux-arm64-gnu': 1.3.1 - '@yuuang/ffi-rs-linux-arm64-musl': 1.3.1 - '@yuuang/ffi-rs-linux-x64-gnu': 1.3.1 - '@yuuang/ffi-rs-linux-x64-musl': 1.3.1 - '@yuuang/ffi-rs-win32-arm64-msvc': 1.3.1 - '@yuuang/ffi-rs-win32-ia32-msvc': 1.3.1 - '@yuuang/ffi-rs-win32-x64-msvc': 1.3.1 - file-entry-cache@6.0.1: dependencies: flat-cache: 3.2.0 @@ -8632,13 +8390,6 @@ snapshots: fsevents@2.3.3: optional: true - fstream@1.0.12: - dependencies: - graceful-fs: 4.2.11 - inherits: 2.0.4 - mkdirp: 0.5.6 - rimraf: 2.7.1 - function-bind@1.1.2: {} function.prototype.name@1.1.6: @@ -8703,8 +8454,6 @@ snapshots: es-errors: 1.3.0 get-intrinsic: 1.2.4 - get-symbol-from-current-process-h@1.0.2: {} - get-tsconfig@4.7.5: dependencies: resolve-pkg-maps: 1.0.0 @@ -8995,8 +8744,6 @@ snapshots: is-windows@1.0.2: {} - isarray@1.0.0: {} - isarray@2.0.5: {} isexe@2.0.0: {} @@ -9122,8 +8869,6 @@ snapshots: lines-and-columns@1.2.4: {} - listenercount@1.0.1: {} - livekit-server-sdk@2.13.3: dependencies: '@bufbuild/protobuf': 1.10.1 @@ -9266,10 +9011,6 @@ snapshots: minipass: 7.1.2 rimraf: 5.0.10 - mkdirp@0.5.6: - dependencies: - minimist: 1.2.8 - mkdirp@3.0.1: {} mlly@1.7.0: @@ -9299,10 +9040,10 @@ snapshots: natural-compare@1.4.0: {} - node-addon-api@3.2.1: {} - node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: dependencies: whatwg-url: 5.0.0 @@ -9313,8 +9054,6 @@ snapshots: fetch-blob: 3.2.0 formdata-polyfill: 4.0.10 - node-gyp-build@4.8.4: {} - npm-run-path@5.3.0: dependencies: path-key: 4.0.0 @@ -9365,6 +9104,12 @@ snapshots: obug@2.1.1: {} + ofetch@1.5.1: + dependencies: + destr: 2.0.5 + 
node-fetch-native: 1.6.7 + ufo: 1.6.3 + on-exit-leak-free@2.1.2: {} once@1.4.0: @@ -9615,8 +9360,6 @@ snapshots: ansi-styles: 5.2.0 react-is: 18.3.1 - process-nextick-args@2.0.1: {} - process-warning@3.0.0: {} process-warning@4.0.1: {} @@ -9670,16 +9413,6 @@ snapshots: pify: 4.0.1 strip-bom: 3.0.0 - readable-stream@2.3.8: - dependencies: - core-util-is: 1.0.3 - inherits: 2.0.4 - isarray: 1.0.0 - process-nextick-args: 2.0.1 - safe-buffer: 5.1.2 - string_decoder: 1.1.1 - util-deprecate: 1.0.2 - readable-stream@4.5.2: dependencies: abort-controller: 3.0.0 @@ -9692,15 +9425,6 @@ snapshots: real-require@0.2.0: {} - ref-napi@3.0.3: - dependencies: - debug: 4.4.1 - get-symbol-from-current-process-h: 1.0.2 - node-addon-api: 3.2.1 - node-gyp-build: 4.8.4 - transitivePeerDependencies: - - supports-color - reflect.getprototypeof@1.0.6: dependencies: call-bind: 1.0.7 @@ -9753,10 +9477,6 @@ snapshots: reusify@1.0.4: {} - rimraf@2.7.1: - dependencies: - glob: 7.2.3 - rimraf@3.0.2: dependencies: glob: 7.2.3 @@ -9864,8 +9584,6 @@ snapshots: has-symbols: 1.0.3 isarray: 2.0.5 - safe-buffer@5.1.2: {} - safe-buffer@5.2.1: {} safe-regex-test@1.0.3: @@ -9918,8 +9636,6 @@ snapshots: functions-have-names: 1.2.3 has-property-descriptors: 1.0.2 - setimmediate@1.0.5: {} - sharp@0.34.5: dependencies: '@img/colour': 1.0.0 @@ -10066,10 +9782,6 @@ snapshots: define-properties: 1.2.1 es-object-atoms: 1.0.0 - string_decoder@1.1.1: - dependencies: - safe-buffer: 5.1.2 - string_decoder@1.3.0: dependencies: safe-buffer: 5.2.1 @@ -10201,8 +9913,6 @@ snapshots: dependencies: punycode: 2.3.1 - traverse@0.3.9: {} - tree-kill@1.2.2: {} true-case-path@2.2.1: {} @@ -10326,6 +10036,7 @@ snapshots: get-tsconfig: 4.7.5 optionalDependencies: fsevents: 2.3.3 + optional: true turbo-darwin-64@1.13.3: optional: true @@ -10414,6 +10125,8 @@ snapshots: ufo@1.5.3: {} + ufo@1.6.3: {} + unbox-primitive@1.0.2: dependencies: call-bind: 1.0.7 @@ -10423,29 +10136,12 @@ snapshots: undici-types@6.21.0: {} - uniffi-bindgen-react-native@0.29.3-1: {} - universalify@0.1.2: {} - unzipper@0.10.11: - dependencies: - big-integer: 1.6.52 - binary: 0.3.0 - bluebird: 3.4.7 - buffer-indexof-polyfill: 1.0.2 - duplexer2: 0.1.4 - fstream: 1.0.12 - graceful-fs: 4.2.11 - listenercount: 1.0.1 - readable-stream: 2.3.8 - setimmediate: 1.0.5 - uri-js@4.4.1: dependencies: punycode: 2.3.1 - util-deprecate@1.0.2: {} - uuid@11.1.0: {} validator@13.12.0: {}
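
The transcript-holding behavior added to AudioRecognition above reduces to one decision per STT event, plus a flush once the ignore window expires. The sketch below is illustrative only: the helper name shouldHold and the reduced HeldEventTiming shape are stand-ins, not part of the change, and the actual check lives in AudioRecognition.shouldHoldSttEvent.

// Minimal sketch of the per-event hold decision used while interruption
// detection is enabled. Timing values are assumed to share the same timebase
// as ignoreUserTranscriptUntil, mirroring the fields of the same names above.
interface HeldEventTiming {
  startTime: number; // first alternative's start time
  endTime: number;   // first alternative's end time
}

function shouldHold(
  timing: HeldEventTiming | undefined,
  agentSpeaking: boolean,
  inputStartedAt: number | undefined,
  ignoreUserTranscriptUntil: number | undefined,
): boolean {
  // While the agent is speaking, every incoming transcript is buffered.
  if (agentSpeaking) return true;

  // With no ignore window active, nothing is held.
  if (ignoreUserTranscriptUntil === undefined) return false;

  // Events without timing info stay buffered until a timestamped event releases them.
  if (timing === undefined) return true;

  // Hold only when the event demonstrably ended before the ignore window expired.
  return (
    inputStartedAt !== undefined &&
    timing.startTime !== timing.endTime &&
    timing.endTime > 0 &&
    timing.endTime + inputStartedAt < ignoreUserTranscriptUntil
  );
}

Once the window is set by onEndOfAgentSpeech, flushHeldTranscripts re-emits held events starting from the first one whose end time outlives the window, drops the earlier ones, and then clears both the buffer and the window.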
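
Similarly, the migration rules in turn_config/utils.ts come down to a short precedence ladder, which the tests above pin down: options over voiceOptions over built-in defaults, with an explicit turnHandling block overriding the legacy flat fields it replaces. A brief usage sketch under those assumptions (the import path is written as if the caller sits in agents/src/voice; the numeric values are only for illustration):

import { migrateLegacyOptions } from './turn_config/utils.js';

// options > voiceOptions > defaults; explicit turnHandling beats legacy flat fields.
const { options } = migrateLegacyOptions({
  voiceOptions: { minInterruptionDuration: 1.0 },            // legacy bag, lowest precedence
  options: {
    minInterruptionDuration: 2.0,                             // legacy field, higher precedence
    turnHandling: { interruption: { minDuration: 3.0 } },     // explicit config wins
  },
});

// options.turnHandling.interruption.minDuration === 3.0
// Anything left unspecified is filled in by mergeWithDefaults from
// defaultInterruptionConfig, defaultEndpointingConfig and
// defaultTurnHandlingConfig, since stripUndefined keeps undefined fields
// from shadowing those defaults when spread.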