From 9f67b4fabd8bedd28f7436a7ff7170c9371731d5 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Fri, 16 Jan 2026 15:28:08 +0100 Subject: [PATCH 01/25] wip http + ws transport for barge in --- agents/package.json | 1 + agents/src/utils/http_transport.ts | 61 ++++++ agents/src/utils/ws_transport.test.ts | 282 ++++++++++++++++++++++++++ agents/src/utils/ws_transport.ts | 22 ++ pnpm-lock.yaml | 27 +++ 5 files changed, 393 insertions(+) create mode 100644 agents/src/utils/http_transport.ts create mode 100644 agents/src/utils/ws_transport.test.ts create mode 100644 agents/src/utils/ws_transport.ts diff --git a/agents/package.json b/agents/package.json index 51d539b6f..62cfcb0f7 100644 --- a/agents/package.json +++ b/agents/package.json @@ -69,6 +69,7 @@ "heap-js": "^2.6.0", "json-schema": "^0.4.0", "livekit-server-sdk": "^2.14.1", + "ofetch": "^1.5.1", "openai": "^6.8.1", "pidusage": "^4.0.1", "pino": "^8.19.0", diff --git a/agents/src/utils/http_transport.ts b/agents/src/utils/http_transport.ts new file mode 100644 index 000000000..8a7750ec6 --- /dev/null +++ b/agents/src/utils/http_transport.ts @@ -0,0 +1,61 @@ +import { ofetch } from 'ofetch'; + +export interface PostOptions { + baseUrl: string; + token: string; + signal?: AbortSignal; + timeout?: number; +} + +export interface PredictOptions { + threshold: number; + minFrames: number; +} + +export interface PredictEndpointResponse { + created_at: number; + is_bargein: boolean; + probabilities: number[]; +} + +export interface PredictResponse { + createdAt: number; + isBargein: boolean; + probabilities: number[]; + predictionDuration: number; +} + +export async function predict( + data: Uint8Array, + predictOptions: PredictOptions, + options: PostOptions, +): Promise { + const createdAt = performance.now(); + const url = new URL(`/bargein`, options.baseUrl); + url.searchParams.append('threshold', predictOptions.threshold.toString()); + url.searchParams.append('min_frames', predictOptions.minFrames.toFixed()); + url.searchParams.append('created_at', createdAt.toFixed()); + + const { created_at, is_bargein, probabilities } = await ofetch( + url.toString(), + { + retry: 1, + retryDelay: 100, + headers: { + 'Content-Type': 'application/octet-stream', + Authorization: `Bearer ${options.token}`, + }, + signal: options.signal, + timeout: options.timeout, + method: 'POST', + body: data, + }, + ); + + return { + createdAt: created_at, + isBargein: is_bargein, + probabilities, + predictionDuration: (performance.now() - createdAt) / 1e9, + }; +} diff --git a/agents/src/utils/ws_transport.test.ts b/agents/src/utils/ws_transport.test.ts new file mode 100644 index 000000000..77c5fdc91 --- /dev/null +++ b/agents/src/utils/ws_transport.test.ts @@ -0,0 +1,282 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { WebSocket, WebSocketServer } from 'ws'; +import { webSocketStream } from './ws_transport.js'; + +describe('webSocketStream', () => { + describe('readable stream', () => { + it('receives messages from the WebSocket', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + wss.on('connection', (serverWs) => { + serverWs.send('hello'); + serverWs.send('world'); + serverWs.close(); + }); + + const { readable } = webSocketStream(`ws://localhost:${port}`); + const reader = readable.getReader(); + + const messages: string[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + messages.push(Buffer.from(value).toString()); + } + } finally { + reader.releaseLock(); + } + + expect(messages).toEqual(['hello', 'world']); + + wss.close(); + }); + + it('handles binary messages', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + const binaryData = new Uint8Array([1, 2, 3, 4, 5]); + + wss.on('connection', (serverWs) => { + serverWs.send(binaryData); + serverWs.close(); + }); + + const { readable } = webSocketStream(`ws://localhost:${port}`); + const reader = readable.getReader(); + + const chunks: Uint8Array[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(new Uint8Array(value)); + } + } finally { + reader.releaseLock(); + } + + expect(chunks).toHaveLength(1); + expect(Array.from(chunks[0]!)).toEqual([1, 2, 3, 4, 5]); + + wss.close(); + }); + + it('handles empty stream when connection closes immediately', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + wss.on('connection', (serverWs) => { + serverWs.close(); + }); + const { readable } = webSocketStream(`ws://localhost:${port}`); + const reader = readable.getReader(); + + const chunks: Uint8Array[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(value); + } + } finally { + reader.releaseLock(); + } + + expect(chunks).toEqual([]); + + wss.close(); + }); + }); + + describe('writable stream', () => { + it('sends messages through the WebSocket', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + const ws = new WebSocket(`ws://localhost:${port}`); + + const connected = new Promise((resolve) => { + ws.on('open', resolve); + }); + + const messagesReceived: string[] = []; + const serverClosed = new Promise((resolve) => { + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + messagesReceived.push(data.toString()); + }); + serverWs.on('close', resolve); + }); + }); + + await connected; + const { writable } = webSocketStream(`ws://localhost:${port}`); + const writer = writable.getWriter(); + + await writer.write(new TextEncoder().encode('hello')); + await writer.write(new TextEncoder().encode('world')); + await 
writer.close(); + + await serverClosed; + + expect(messagesReceived).toEqual(['hello', 'world']); + + wss.close(); + }); + + it('sends binary data through the WebSocket', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + const chunksReceived: Buffer[] = []; + const serverClosed = new Promise((resolve) => { + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + chunksReceived.push(Buffer.from(data as Buffer)); + }); + serverWs.on('close', resolve); + }); + }); + + const { writable } = webSocketStream(`ws://localhost:${port}`); + const writer = writable.getWriter(); + + const binaryData = new Uint8Array([10, 20, 30, 40, 50]); + await writer.write(binaryData); + await writer.close(); + + await serverClosed; + + expect(chunksReceived).toHaveLength(1); + expect(Array.from(chunksReceived[0]!)).toEqual([10, 20, 30, 40, 50]); + + wss.close(); + }); + + it('buffers writes if readyState is CONNECTING', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + const { writable } = webSocketStream(`ws://localhost:${port}`); + const writer = writable.getWriter(); + + const messagesReceived: string[] = []; + const serverClosed = new Promise((resolve) => { + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + messagesReceived.push(data.toString()); + }); + serverWs.on('close', resolve); + }); + }); + + // These writes should be buffered + await writer.write(new TextEncoder().encode('buffered message')); + await writer.close(); + + await serverClosed; + + expect(messagesReceived).toEqual(['buffered message']); + + wss.close(); + }); + }); + + describe('bidirectional communication', () => { + it('supports echo pattern with readable and writable', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + // Server echoes messages back + wss.on('connection', (serverWs) => { + serverWs.on('message', (data) => { + serverWs.send(data); + }); + }); + + const { readable, writable } = webSocketStream(`ws://localhost:${port}`); + const writer = writable.getWriter(); + const reader = readable.getReader(); + + // Send messages + await writer.write(new TextEncoder().encode('ping1')); + await writer.write(new TextEncoder().encode('ping2')); + + // Read echoed responses + const { value: response1 } = await reader.read(); + const { value: response2 } = await reader.read(); + + expect(Buffer.from(response1!).toString()).toBe('ping1'); + expect(Buffer.from(response2!).toString()).toBe('ping2'); + + reader.releaseLock(); + await writer.close(); + + wss.close(); + }); + }); + + describe('error handling', () => { + it('readable stream ends when WebSocket closes unexpectedly', async () => { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + + const port = (wss.address() as { port: number }).port; + + wss.on('connection', (serverWs) => { + serverWs.send('before close'); + // Terminate connection abruptly + serverWs.terminate(); + }); + + const { readable } = webSocketStream(`ws://localhost:${port}`); 
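+      // Note: the server closes the connection immediately on connect, so this
+      // stream is expected to end (read() resolving with done=true) without
+      // ever emitting a chunk.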
+ const reader = readable.getReader(); + + const chunks: string[] = []; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(Buffer.from(value).toString()); + } + } catch (error) { + console.error(error); + // Connection terminated, stream may error + } finally { + reader.releaseLock(); + } + + // Should have received the message sent before termination + expect(chunks).toContain('before close'); + + wss.close(); + }); + }); +}); diff --git a/agents/src/utils/ws_transport.ts b/agents/src/utils/ws_transport.ts new file mode 100644 index 000000000..4af4f906b --- /dev/null +++ b/agents/src/utils/ws_transport.ts @@ -0,0 +1,22 @@ +import { Readable, Writable } from 'node:stream'; +import WebSocket, { createWebSocketStream } from 'ws'; + +export function webSocketStream(wsUrl: string) { + const ws = new WebSocket(wsUrl); + const duplex = createWebSocketStream(ws); + duplex.on('error', console.error); + + // End the write side when the read side ends to properly close the stream. + // This is needed because Readable.toWeb() waits for both sides of the duplex + // to close before signaling done on the ReadableStream. + duplex.on('end', () => { + duplex.end(); + }); + + // Convert the writable side + const writable = Writable.toWeb(duplex); + // Convert the readable side + const readable = Readable.toWeb(duplex); + + return { readable, writable, close: ws.close }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1dce72646..6dc766f85 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -163,6 +163,9 @@ importers: livekit-server-sdk: specifier: ^2.14.1 version: 2.14.1 + ofetch: + specifier: ^1.5.1 + version: 1.5.1 openai: specifier: ^6.8.1 version: 6.8.1(ws@8.18.3)(zod@3.25.76) @@ -3145,6 +3148,9 @@ packages: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} + destr@2.0.5: + resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -4285,6 +4291,9 @@ packages: engines: {node: '>=10.5.0'} deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: + resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4340,6 +4349,9 @@ packages: obug@2.1.1: resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + ofetch@1.5.1: + resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} + on-exit-leak-free@2.1.2: resolution: {integrity: sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==} engines: {node: '>=14.0.0'} @@ -5179,6 +5191,9 @@ packages: ufo@1.5.3: resolution: {integrity: sha512-Y7HYmWaFwPUmkoQCUIAYpKqkOf+SbVj/2fJJZ4RJMCfZp0rTGwRbzQD+HghfnhKOjL9E01okqz+ncJskGYfBNw==} + ufo@1.6.3: + resolution: {integrity: sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==} + unbox-primitive@1.0.2: resolution: {integrity: 
sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==} @@ -7639,6 +7654,8 @@ snapshots: dequal@2.0.3: {} + destr@2.0.5: {} + detect-indent@6.1.0: {} detect-libc@2.0.4: {} @@ -8981,6 +8998,8 @@ snapshots: node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: dependencies: whatwg-url: 5.0.0 @@ -9041,6 +9060,12 @@ snapshots: obug@2.1.1: {} + ofetch@1.5.1: + dependencies: + destr: 2.0.5 + node-fetch-native: 1.6.7 + ufo: 1.6.3 + on-exit-leak-free@2.1.2: {} once@1.4.0: @@ -10048,6 +10073,8 @@ snapshots: ufo@1.5.3: {} + ufo@1.6.3: {} + unbox-primitive@1.0.2: dependencies: call-bind: 1.0.7 From cf3d72347c4f069de50eac7b0418a1e325b074cb Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 13:51:01 +0100 Subject: [PATCH 02/25] refactor --- .../AdaptiveInterruptionDetector.ts | 119 ++++++ .../interruption/InterruptionStream.ts | 350 ++++++++++++++++++ agents/src/inference/interruption/defaults.ts | 33 ++ agents/src/inference/interruption/errors.ts | 0 .../interruption}/http_transport.ts | 4 +- .../inference/interruption/interruption.ts | 87 +++++ agents/src/inference/utils.test.ts | 31 ++ agents/src/inference/utils.ts | 15 + agents/src/stream/stream_channel.ts | 8 +- 9 files changed, 643 insertions(+), 4 deletions(-) create mode 100644 agents/src/inference/interruption/AdaptiveInterruptionDetector.ts create mode 100644 agents/src/inference/interruption/InterruptionStream.ts create mode 100644 agents/src/inference/interruption/defaults.ts create mode 100644 agents/src/inference/interruption/errors.ts rename agents/src/{utils => inference/interruption}/http_transport.ts (96%) create mode 100644 agents/src/inference/interruption/interruption.ts create mode 100644 agents/src/inference/utils.test.ts diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts new file mode 100644 index 000000000..a2181cdda --- /dev/null +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -0,0 +1,119 @@ +import type { TypedEventEmitter } from '@livekit/typed-emitter'; +import EventEmitter from 'events'; +import { + DEFAULT_BASE_URL, + FRAMES_PER_SECOND, + SAMPLE_RATE, + interruptionOptionDefaults, +} from './defaults.js'; +import type { InterruptionDetectionError } from './interruption.js'; + +type InterruptionCallbacks = { + interruptionDetected: () => void; + overlapSpeechDetected: () => void; + error: (error: InterruptionDetectionError) => void; +}; + +export interface InterruptionOptions { + sampleRate: number; + threshold: number; + minFrames: number; + maxAudioDuration: number; + audioPrefixDuration: number; + detectionInterval: number; + inferenceTimeout: number; + minInterruptionDuration: number; + baseUrl: string; + apiKey: string; + apiSecret: string; + useProxy: boolean; +} + +export type AdaptiveInterruptionDetectorOptions = Partial; + +export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { + options: InterruptionOptions; + private label: string; + private streams: WeakSet; // TODO: Union of InterruptionHttpStream | InterruptionWebSocketStream + + constructor(options: AdaptiveInterruptionDetectorOptions = {}) { + super(); + + const { + maxAudioDuration, + baseUrl, + apiKey, + apiSecret, + useProxy: useProxyArg, + audioPrefixDuration, + threshold, + detectionInterval, + inferenceTimeout, + minInterruptionDuration, + } = { ...interruptionOptionDefaults, ...options }; + + if (maxAudioDuration 
> 3.0) { + throw new Error('maxAudioDuration must be less than or equal to 3.0 seconds'); + } + + const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? DEFAULT_BASE_URL; + let lkApiKey = apiKey ?? ''; + let lkApiSecret = apiSecret ?? ''; + let useProxy: boolean; + + // use LiveKit credentials if using the default base URL (inference) + if (lkBaseUrl === DEFAULT_BASE_URL) { + lkApiKey = + apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? ''; + if (!lkApiKey) { + throw new Error( + 'apiKey is required, either as argument or set LIVEKIT_API_KEY environmental variable', + ); + } + + lkApiSecret = + apiSecret ?? + process.env.LIVEKIT_INFERENCE_API_SECRET ?? + process.env.LIVEKIT_API_SECRET ?? + ''; + if (!lkApiSecret) { + throw new Error( + 'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable', + ); + } + + useProxy = true; + } else { + useProxy = useProxyArg ?? false; + } + + this.options = { + sampleRate: SAMPLE_RATE, + threshold, + minFrames: Math.ceil(minInterruptionDuration * FRAMES_PER_SECOND), + maxAudioDuration, + audioPrefixDuration, + detectionInterval, + inferenceTimeout, + baseUrl: lkBaseUrl, + apiKey: lkApiKey, + apiSecret: lkApiSecret, + useProxy, + minInterruptionDuration, + }; + + this.label = `${this.constructor.name}`; + this.streams = new WeakSet(); + + console.info('adaptive interruption detector initialized', { + baseUrl: this.options.baseUrl, + detectionInterval: this.options.detectionInterval, + audioPrefixDuration: this.options.audioPrefixDuration, + maxAudioDuration: this.options.maxAudioDuration, + minFrames: this.options.minFrames, + threshold: this.options.threshold, + inferenceTimeout: this.options.inferenceTimeout, + useProxy: this.options.useProxy, + }); + } +} diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts new file mode 100644 index 000000000..c0f7136b9 --- /dev/null +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -0,0 +1,350 @@ +import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; +import type { Span } from '@opentelemetry/sdk-trace-base'; +import { type ReadableStream, TransformStream, WritableStream } from 'stream/web'; +import { log } from '../../log.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { createAccessToken } from '../utils.js'; +import type { + AdaptiveInterruptionDetector, + InterruptionOptions, +} from './AdaptiveInterruptionDetector.js'; +import { apiConnectDefaults } from './defaults.js'; +import { predictHTTP } from './http_transport.js'; +import { + InterruptionCacheEntry, + type InterruptionDetectionError, + type InterruptionEvent, + InterruptionEventType, +} from './interruption.js'; + +export interface AgentSpeechStarted { + type: 'agent-speech-started'; +} + +export interface AgentSpeechEnded { + type: 'agent-speech-ended'; +} + +export interface OverlapSpeechStarted { + type: 'overlap-speech-started'; + speechDuration: number; + userSpeakingSpan: Span; +} + +export interface OverlapSpeechEnded { + type: 'overlap-speech-ended'; +} + +export interface Flush { + type: 'flush'; +} + +export type InterruptionSentinel = + | AgentSpeechStarted + | AgentSpeechEnded + | OverlapSpeechStarted + | OverlapSpeechEnded + | Flush; + +export class InterruptionStreamSentinel { + static speechStarted(): AgentSpeechEnded { + return { type: 'agent-speech-ended' }; + } + + static speechEnded(): AgentSpeechEnded { + 
return { type: 'agent-speech-ended' }; + } + + static overlapSpeechStarted( + speechDuration: number, + userSpeakingSpan: Span, + ): OverlapSpeechStarted { + return { type: 'overlap-speech-started', speechDuration, userSpeakingSpan }; + } + + static overlapSpeechEnded(): OverlapSpeechEnded { + return { type: 'overlap-speech-ended' }; + } + + static flush(): Flush { + return { type: 'flush' }; + } +} + +export interface ApiConnectOptions { + maxRetries: number; + retryInterval: number; + timeout: number; +} + +abstract class InterruptionStreamBase { + private inputStream: StreamChannel; + + private eventStream: StreamChannel; + + private resampler?: AudioResampler; + + private userSpeakingSpan: Span | undefined; + + private overlapSpeechStartedAt: number | undefined; + + private options: InterruptionOptions; + + private apiOptions: ApiConnectOptions; + + private model: AdaptiveInterruptionDetector; + + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { + this.inputStream = createStreamChannel< + InterruptionSentinel | AudioFrame, + InterruptionDetectionError + >(); + + this.eventStream = createStreamChannel(); + + this.model = model; + this.options = model.options; + this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + } + + private setupTransform() { + let agentSpeechStarted = false; + let startIdx = 0; + let accumulatedSamples = 0; + let overlapSpeechStarted = false; + const cache = new Map(); // TODO limit cache size + const inferenceS16Data = new Int16Array( + Math.ceil(this.options.maxAudioDuration * this.options.sampleRate), + ).fill(0); + + const transformer = new TransformStream( + { + transform: (chunk, controller) => { + if (chunk instanceof AudioFrame) { + if (!agentSpeechStarted) { + return; + } + if (this.options.sampleRate !== chunk.sampleRate) { + controller.error('the sample rate of the input frames must be consistent'); + return; + } + const result = writeToInferenceS16Data( + chunk, + startIdx, + inferenceS16Data, + this.options.maxAudioDuration, + ); + startIdx = result.startIdx; + accumulatedSamples += result.samplesWritten; + + // Send data for inference when enough samples accumulated during overlap + if ( + accumulatedSamples >= + Math.floor(this.options.detectionInterval * this.options.sampleRate) && + overlapSpeechStarted + ) { + // Send a copy of the audio data up to startIdx for inference + const audioSlice = inferenceS16Data.slice(0, startIdx); + // TODO: send to data channel - dataChan.send(audioSlice); + accumulatedSamples = 0; + controller.enqueue(audioSlice); + } + } else if (chunk.type === 'agent-speech-started') { + log().debug('agent speech started'); + + agentSpeechStarted = true; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'agent-speech-ended') { + log().debug('agent speech ended'); + + agentSpeechStarted = false; + overlapSpeechStarted = false; + accumulatedSamples = 0; + startIdx = 0; + cache.clear(); + } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { + this.userSpeakingSpan = chunk.userSpeakingSpan; + log().debug('overlap speech started, starting interruption inference'); + overlapSpeechStarted = true; + accumulatedSamples = 0; + const shiftSize = Math.min( + startIdx, + Math.round(chunk.speechDuration * this.options.sampleRate), + ); + // Shift the buffer: copy the last `shiftSize` samples before startIdx + // to the beginning of the buffer. 
This preserves recent audio context + // (the user's speech that occurred just before overlap was detected). + inferenceS16Data.copyWithin(0, startIdx - shiftSize, startIdx); + startIdx = shiftSize; + cache.clear(); + } else if (chunk.type === 'overlap-speech-ended') { + log().debug('overlap speech ended'); + + if (overlapSpeechStarted) { + this.userSpeakingSpan = undefined; + let latestEntry = Array.from(cache.values()).at(-1); + if (!latestEntry) { + log().debug('no request made for overlap speech'); + latestEntry = InterruptionCacheEntry.default(); + } else { + cache.delete(latestEntry.createdAt); + } + const event: InterruptionEvent = { + type: InterruptionEventType.OVERLAP_SPEECH_ENDED, + timestamp: Date.now(), + isInterruption: false, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + speechInput: latestEntry.speechInput, + probabilities: latestEntry.probabilities, + totalDuration: latestEntry.totalDuration, + detectionDelay: latestEntry.detectionDelay, + predictionDuration: latestEntry.predictionDuration, + probability: latestEntry.probability, + }; + this.eventStream.write(event); + } + } else if (chunk.type === 'flush') { + log().debug('flushing'); + // do nothing + } + }, + }, + { highWaterMark: Number.MAX_SAFE_INTEGER }, + { highWaterMark: Number.MAX_SAFE_INTEGER }, + ); + + const httpPostWriter = new WritableStream( + { + // Implement the sink + write: async (chunk) => { + if (this.overlapSpeechStartedAt) { + return; + } + await predictHTTP( + chunk, + { threshold: this.options.threshold, minFrames: this.options.minFrames }, + { + baseUrl: this.options.baseUrl, + timeout: this.options.inferenceTimeout, + token: await createAccessToken(), + }, + ); + }, + close() { + const listItem = document.createElement('li'); + listItem.textContent = `[MESSAGE RECEIVED] ${result}`; + list.appendChild(listItem); + }, + abort(err) { + console.log('Sink error:', err); + }, + }, + { highWaterMark: Number.MAX_SAFE_INTEGER }, + ); + + this.inputStream.stream().pipeThrough(transformer).pipeTo(httpPostWriter); + } + + private ensureInputNotEnded() { + if (this.inputStream.closed) { + throw new Error('input stream is closed'); + } + } + + private ensureStreamsNotEnded() { + this.ensureInputNotEnded(); + } + + private getResamplerFor(inputSampleRate: number): AudioResampler { + if (!this.resampler) { + this.resampler = new AudioResampler(inputSampleRate, this.options.sampleRate); + } + return this.resampler; + } + + get stream(): ReadableStream { + return this.eventStream.stream(); + } + + async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { + this.ensureStreamsNotEnded(); + if (!(frame instanceof AudioFrame)) { + return this.inputStream.write(frame); + } else if (this.options.sampleRate !== frame.sampleRate) { + const resampler = this.getResamplerFor(frame.sampleRate); + if (resampler.inputRate !== frame.sampleRate) { + throw new Error('the sample rate of the input frames must be consistent'); + } + for (const resampledFrame of resampler.push(frame)) { + await this.inputStream.write(resampledFrame); + } + } else { + await this.inputStream.write(frame); + } + } + + async flush(): Promise { + this.ensureStreamsNotEnded(); + this.inputStream.write(InterruptionStreamSentinel.flush()); + } + + async endInput(): Promise { + await this.flush(); + await this.inputStream.close(); + } + + async close(): Promise { + if (!this.inputStream.closed) await this.inputStream.close(); + } +} + +/** + * Write the audio frame to the output data array and return the new start index + * and the number 
of samples written. + */ +function writeToInferenceS16Data( + frame: AudioFrame, + startIdx: number, + outData: Int16Array, + maxAudioDuration: number, +): { startIdx: number; samplesWritten: number } { + const maxWindowSize = Math.floor(maxAudioDuration * frame.sampleRate); + + if (frame.samplesPerChannel > outData.length) { + throw new Error('frame samples are greater than the max window size'); + } + + // Shift the data to the left if the window would overflow + const shift = startIdx + frame.samplesPerChannel - maxWindowSize; + if (shift > 0) { + outData.copyWithin(0, shift, startIdx); + startIdx -= shift; + } + + // Get the frame data as Int16Array + const frameData = new Int16Array( + frame.data.buffer, + frame.data.byteOffset, + frame.samplesPerChannel * frame.channels, + ); + + if (frame.channels > 1) { + // Mix down multiple channels to mono by averaging + for (let i = 0; i < frame.samplesPerChannel; i++) { + let sum = 0; + for (let ch = 0; ch < frame.channels; ch++) { + sum += frameData[i * frame.channels + ch] ?? 0; + } + outData[startIdx + i] = Math.floor(sum / frame.channels); + } + } else { + // Single channel - copy directly + outData.set(frameData, startIdx); + } + + startIdx += frame.samplesPerChannel; + return { startIdx, samplesWritten: frame.samplesPerChannel }; +} diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts new file mode 100644 index 000000000..e5e2ba6b3 --- /dev/null +++ b/agents/src/inference/interruption/defaults.ts @@ -0,0 +1,33 @@ +import type { InterruptionOptions } from './AdaptiveInterruptionDetector.js'; +import type { ApiConnectOptions } from './InterruptionStream.js'; + +export const MIN_INTERRUPTION_DURATION = 0.025 * 2; // 25ms per frame, 2 consecutive frames +export const THRESHOLD = 0.65; +export const MAX_AUDIO_DURATION = 3.0; +export const AUDIO_PREFIX_DURATION = 0.5; +export const DETECTION_INTERVAL = 0.1; +export const REMOTE_INFERENCE_TIMEOUT = 1.0; +export const SAMPLE_RATE = 16000; +export const FRAMES_PER_SECOND = 40; +export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; + +export const apiConnectDefaults: ApiConnectOptions = { + maxRetries: 3, + retryInterval: 2_000, + timeout: 10_000, +} as const; + +export const interruptionOptionDefaults: InterruptionOptions = { + sampleRate: SAMPLE_RATE, + threshold: THRESHOLD, + minFrames: Math.ceil(MIN_INTERRUPTION_DURATION * FRAMES_PER_SECOND), + maxAudioDuration: MAX_AUDIO_DURATION, + audioPrefixDuration: AUDIO_PREFIX_DURATION, + detectionInterval: DETECTION_INTERVAL, + inferenceTimeout: 10_000, + baseUrl: DEFAULT_BASE_URL, + apiKey: process.env.LIVEKIT_API_KEY || '', + apiSecret: process.env.LIVEKIT_API_SECRET || '', + useProxy: false, + minInterruptionDuration: MIN_INTERRUPTION_DURATION, +} as const; diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts new file mode 100644 index 000000000..e69de29bb diff --git a/agents/src/utils/http_transport.ts b/agents/src/inference/interruption/http_transport.ts similarity index 96% rename from agents/src/utils/http_transport.ts rename to agents/src/inference/interruption/http_transport.ts index 8a7750ec6..dc2a9ddd4 100644 --- a/agents/src/utils/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -25,8 +25,8 @@ export interface PredictResponse { predictionDuration: number; } -export async function predict( - data: Uint8Array, +export async function predictHTTP( + data: Int16Array, predictOptions: 
PredictOptions, options: PostOptions, ): Promise { diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts new file mode 100644 index 000000000..f13767120 --- /dev/null +++ b/agents/src/inference/interruption/interruption.ts @@ -0,0 +1,87 @@ +import { slidingWindowMinMax } from '../utils.js'; + +export enum InterruptionEventType { + INTERRUPTION = 'interruption', + OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', +} +export interface InterruptionEvent { + type: InterruptionEventType; + timestamp: number; + isInterruption: boolean; + totalDuration: number; + predictionDuration: number; + detectionDelay: number; + overlapSpeechStartedAt?: number; + speechInput?: Int16Array; + probabilities?: Float32Array; + probability: number; +} + +export class InterruptionDetectionError extends Error { + readonly type = 'InterruptionDetectionError'; + + readonly timestamp: number; + readonly label: string; + readonly recoverable: boolean; + + constructor(message: string, timestamp: number, label: string, recoverable: boolean) { + super(message); + this.name = 'InterruptionDetectionError'; + this.timestamp = timestamp; + this.label = label; + this.recoverable = recoverable; + } + + toString(): string { + return `${this.name}: ${this.message} (label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`; + } +} + +function estimateProbability( + probabilities: Float32Array, + windowSize: number = MIN_INTERRUPTION_DURATION, +): number { + const minWindow = Math.ceil(windowSize / 0.025); // 25ms per frame + if (probabilities.length < minWindow) { + return 0; + } + + return slidingWindowMinMax(probabilities, windowSize); +} + +/** + * Typed cache entry for interruption inference results. + */ +export class InterruptionCacheEntry { + readonly createdAt: number; + readonly totalDuration: number; + readonly predictionDuration: number; + readonly detectionDelay: number; + readonly speechInput?: Int16Array; + readonly probabilities?: Float32Array; + readonly isInterruption?: boolean; + readonly probability: number; + + constructor(params: { + createdAt: number; + speechInput?: Int16Array; + totalDuration?: number; + predictionDuration?: number; + detectionDelay?: number; + probabilities?: Float32Array; + isInterruption?: boolean; + }) { + this.createdAt = params.createdAt; + this.totalDuration = params.totalDuration ?? 0; + this.predictionDuration = params.predictionDuration ?? 0; + this.detectionDelay = params.detectionDelay ?? 0; + this.speechInput = params.speechInput; + this.probabilities = params.probabilities; + this.isInterruption = params.isInterruption; + this.probability = this.probabilities ? estimateProbability(this.probabilities) : 0; + } + + static default(): InterruptionCacheEntry { + return new InterruptionCacheEntry({ createdAt: 0 }); + } +} diff --git a/agents/src/inference/utils.test.ts b/agents/src/inference/utils.test.ts new file mode 100644 index 000000000..bcd2fe9a8 --- /dev/null +++ b/agents/src/inference/utils.test.ts @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { slidingWindowMinMax } from './utils.js'; + +describe('slidingWindowMinMax', () => { + it('returns -Infinity when array is shorter than window size', () => { + expect(slidingWindowMinMax([0.5, 0.6], 3)).toBe(-Infinity); + expect(slidingWindowMinMax([], 1)).toBe(-Infinity); + }); + + it('returns the max value when window size is 1', () => { + // With window size 1, min of each window is the element itself, + // so max of mins is just the max of the array + expect(slidingWindowMinMax([0.1, 0.5, 0.3, 0.8, 0.2], 1)).toBe(0.8); + }); + + it('finds the best sustained probability across windows', () => { + // Windows of size 3: [0.2, 0.8, 0.7], [0.8, 0.7, 0.3], [0.7, 0.3, 0.9] + // Mins: 0.2, 0.3, 0.3 + // Max of mins: 0.3 + expect(slidingWindowMinMax([0.2, 0.8, 0.7, 0.3, 0.9], 3)).toBe(0.3); + }); + + it('returns the single element when array length equals window size', () => { + // Only one window covering the entire array, return min of that window + expect(slidingWindowMinMax([0.5, 0.9, 0.7], 3)).toBe(0.5); + expect(slidingWindowMinMax([0.8], 1)).toBe(0.8); + }); +}); diff --git a/agents/src/inference/utils.ts b/agents/src/inference/utils.ts index b3b772ef6..38c9faa5f 100644 --- a/agents/src/inference/utils.ts +++ b/agents/src/inference/utils.ts @@ -64,3 +64,18 @@ export async function connectWs( socket.once('close', onClose); }); } + +export function slidingWindowMinMax(probabilities: Float32Array, minWindow: number): number { + if (probabilities.length < minWindow) { + return -Infinity; + } + + let maxOfMins = -Infinity; + + for (let i = 0; i <= probabilities.length - minWindow; i++) { + const windowMin = Math.min(...probabilities.slice(i, i + minWindow)); + maxOfMins = Math.max(maxOfMins, windowMin); + } + + return maxOfMins; +} diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 1fb68bab2..546cf93ff 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -4,14 +4,15 @@ import type { ReadableStream } from 'node:stream/web'; import { IdentityTransform } from './identity_transform.js'; -export interface StreamChannel { +export interface StreamChannel { write(chunk: T): Promise; close(): Promise; stream(): ReadableStream; + abort(error: E): Promise; readonly closed: boolean; } -export function createStreamChannel(): StreamChannel { +export function createStreamChannel(): StreamChannel { const transform = new IdentityTransform(); const writer = transform.writable.getWriter(); let isClosed = false; @@ -19,6 +20,9 @@ export function createStreamChannel(): StreamChannel { return { write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, + abort: (error: E) => { + return writer.abort(error); + }, close: async () => { try { const result = await writer.close(); From 738d1a5e7b918b6d0b97cd4c959d9c65abfbc07c Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 14:11:55 +0100 Subject: [PATCH 03/25] type errors resolved --- .../interruption/InterruptionStream.ts | 54 ++++++++++++++++--- .../inference/interruption/http_transport.ts | 4 +- .../inference/interruption/interruption.ts | 1 + agents/src/telemetry/trace_types.ts | 7 +++ 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index c0f7136b9..1d9d9ca4f 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts 
+++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -1,5 +1,6 @@ import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Span } from '@opentelemetry/sdk-trace-base'; +import { traceTypes } from 'agents/src/telemetry/index.js'; import { type ReadableStream, TransformStream, WritableStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; @@ -77,7 +78,18 @@ export interface ApiConnectOptions { timeout: number; } -abstract class InterruptionStreamBase { +function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { + span.setAttribute( + traceTypes.ATTR_IS_INTERRUPTION, + (entry.isInterruption ?? false).toString().toLowerCase(), + ); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDuration); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDuration); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelay); +} + +export class InterruptionStreamBase { private inputStream: StreamChannel; private eventStream: StreamChannel; @@ -220,23 +232,51 @@ abstract class InterruptionStreamBase { { // Implement the sink write: async (chunk) => { - if (this.overlapSpeechStartedAt) { + if (!this.overlapSpeechStartedAt) { return; } - await predictHTTP( + const resp = await predictHTTP( chunk, { threshold: this.options.threshold, minFrames: this.options.minFrames }, { baseUrl: this.options.baseUrl, timeout: this.options.inferenceTimeout, - token: await createAccessToken(), + token: await createAccessToken(this.options.apiKey, this.options.apiSecret), }, ); + console.log('received inference response', resp); + const { createdAt, isBargein, probabilities, predictionDuration } = resp; + const entry = new InterruptionCacheEntry({ + createdAt, + probabilities, + isInterruption: isBargein, + speechInput: chunk, + totalDuration: (performance.now() - createdAt) / 1e9, + detectionDelay: Date.now() - this.overlapSpeechStartedAt, + predictionDuration, + }); + cache.set(createdAt, entry); + if (overlapSpeechStarted && entry.isInterruption) { + if (this.userSpeakingSpan) { + this.updateUserSpeakingSpan(this.userSpeakingSpan, entry); + } + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + isInterruption: entry.isInterruption, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + totalDuration: entry.totalDuration, + predictionDuration: entry.predictionDuration, + detectionDelay: entry.detectionDelay, + probability: entry.probability, + }; + this.eventStream.write(event); + } }, close() { - const listItem = document.createElement('li'); - listItem.textContent = `[MESSAGE RECEIVED] ${result}`; - list.appendChild(listItem); + console.log('closing http writer'); }, abort(err) { console.log('Sink error:', err); diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index dc2a9ddd4..c1f22a569 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -21,7 +21,7 @@ export interface PredictEndpointResponse { export interface PredictResponse { createdAt: number; isBargein: boolean; - probabilities: number[]; + probabilities: Float32Array; 
predictionDuration: number; } @@ -55,7 +55,7 @@ export async function predictHTTP( return { createdAt: created_at, isBargein: is_bargein, - probabilities, + probabilities: new Float32Array(probabilities), predictionDuration: (performance.now() - createdAt) / 1e9, }; } diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts index f13767120..e415f6d98 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/interruption.ts @@ -1,4 +1,5 @@ import { slidingWindowMinMax } from '../utils.js'; +import { MIN_INTERRUPTION_DURATION } from './defaults.js'; export enum InterruptionEventType { INTERRUPTION = 'interruption', diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index db76f7bc1..7220ec03a 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -51,6 +51,13 @@ export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence'; export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay'; export const ATTR_END_OF_TURN_DELAY = 'lk.end_of_turn_delay'; +// Adaptive Interruption attributes +export const ATTR_IS_INTERRUPTION = 'lk.is_interruption'; +export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability'; +export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration'; +export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration'; +export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay'; + // metrics export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; From b3638e9b5b99c1d866bb78b58e8f80828e9eb845 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 14:41:55 +0100 Subject: [PATCH 04/25] more wiring --- agents/src/index.ts | 2 ++ .../AdaptiveInterruptionDetector.ts | 25 ++++++++++++++++++- .../interruption/InterruptionStream.ts | 8 +++--- agents/src/inference/interruption/index.ts | 1 + 4 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 agents/src/inference/interruption/index.ts diff --git a/agents/src/index.ts b/agents/src/index.ts index 57ace0c7a..e4fd2859b 100644 --- a/agents/src/index.ts +++ b/agents/src/index.ts @@ -36,4 +36,6 @@ export * from './vad.js'; export * from './version.js'; export * from './worker.js'; +export * from './inference/interruption/index.js'; + export { cli, inference, ipc, llm, metrics, stream, stt, telemetry, tokenize, tts, voice }; diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index a2181cdda..69bf9b6d0 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -1,12 +1,18 @@ import type { TypedEventEmitter } from '@livekit/typed-emitter'; import EventEmitter from 'events'; +import { type ReadableStream, TransformStream } from 'stream/web'; +import { InterruptionStreamBase } from './InterruptionStream.js'; import { DEFAULT_BASE_URL, FRAMES_PER_SECOND, SAMPLE_RATE, interruptionOptionDefaults, } from './defaults.js'; -import type { InterruptionDetectionError } from './interruption.js'; +import { + type InterruptionDetectionError, + type InterruptionEvent, + InterruptionEventType, +} from './interruption.js'; type InterruptionCallbacks = { interruptionDetected: () => void; @@ -116,4 +122,21 @@ export class 
AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ useProxy: this.options.useProxy, }); } + + stream(): ReadableStream { + const httpStream = new InterruptionStreamBase(this, {}); + this.streams.add(httpStream); + const transformer = new TransformStream({ + transform: (chunk, controller) => { + if (chunk.type === InterruptionEventType.INTERRUPTION) { + this.emit('interruptionDetected'); // TODO payload + } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { + this.emit('overlapSpeechDetected'); // TODO payload + } + controller.enqueue(chunk); + }, + }); + const stream = httpStream.stream.pipeThrough(transformer); + return stream; + } } diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 1d9d9ca4f..ef4d4b682 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -48,8 +48,8 @@ export type InterruptionSentinel = | Flush; export class InterruptionStreamSentinel { - static speechStarted(): AgentSpeechEnded { - return { type: 'agent-speech-ended' }; + static speechStarted(): AgentSpeechStarted { + return { type: 'agent-speech-started' }; } static speechEnded(): AgentSpeechEnded { @@ -117,6 +117,8 @@ export class InterruptionStreamBase { this.model = model; this.options = model.options; this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + + this.setupTransform(); } private setupTransform() { @@ -258,7 +260,7 @@ export class InterruptionStreamBase { cache.set(createdAt, entry); if (overlapSpeechStarted && entry.isInterruption) { if (this.userSpeakingSpan) { - this.updateUserSpeakingSpan(this.userSpeakingSpan, entry); + updateUserSpeakingSpan(this.userSpeakingSpan, entry); } const event: InterruptionEvent = { type: InterruptionEventType.INTERRUPTION, diff --git a/agents/src/inference/interruption/index.ts b/agents/src/inference/interruption/index.ts new file mode 100644 index 000000000..b8a4ed715 --- /dev/null +++ b/agents/src/inference/interruption/index.ts @@ -0,0 +1 @@ +export * from './AdaptiveInterruptionDetector.js'; From df7bb86163b619158883332007382ab9a9fc372c Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 14:45:54 +0100 Subject: [PATCH 05/25] exports and overlap handling --- agents/src/inference/interruption/InterruptionStream.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index ef4d4b682..2a90d91ed 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -314,6 +314,9 @@ export class InterruptionStreamBase { async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { this.ensureStreamsNotEnded(); if (!(frame instanceof AudioFrame)) { + if (frame.type === 'overlap-speech-started') { + this.overlapSpeechStartedAt = Date.now() - frame.speechDuration; + } return this.inputStream.write(frame); } else if (this.options.sampleRate !== frame.sampleRate) { const resampler = this.getResamplerFor(frame.sampleRate); From 1f715c94ef3aa4faa6afab78e0d4432fe9fa53b4 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 14:48:06 +0100 Subject: [PATCH 06/25] thx claude --- .../interruption/AdaptiveInterruptionDetector.ts | 9 +++++++++ agents/src/inference/interruption/InterruptionStream.ts | 4 +++- agents/src/inference/interruption/index.ts | 3 +++ 3 files changed, 15 
insertions(+), 1 deletion(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 69bf9b6d0..04c5741e4 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -139,4 +139,13 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ const stream = httpStream.stream.pipeThrough(transformer); return stream; } + + updateOptions(options: { threshold?: number; minInterruptionDuration?: number }): void { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + } + if (options.minInterruptionDuration !== undefined) { + this.options.minFrames = Math.ceil(options.minInterruptionDuration * FRAMES_PER_SECOND); + } + } } diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 2a90d91ed..d1f1defac 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -184,9 +184,11 @@ export class InterruptionStreamBase { log().debug('overlap speech started, starting interruption inference'); overlapSpeechStarted = true; accumulatedSamples = 0; + // Include both speech duration and audio prefix duration for context const shiftSize = Math.min( startIdx, - Math.round(chunk.speechDuration * this.options.sampleRate), + Math.round(chunk.speechDuration * this.options.sampleRate) + + Math.round(this.options.audioPrefixDuration * this.options.sampleRate), ); // Shift the buffer: copy the last `shiftSize` samples before startIdx // to the beginning of the buffer. This preserves recent audio context diff --git a/agents/src/inference/interruption/index.ts b/agents/src/inference/interruption/index.ts index b8a4ed715..0d0bc4c4a 100644 --- a/agents/src/inference/interruption/index.ts +++ b/agents/src/inference/interruption/index.ts @@ -1 +1,4 @@ export * from './AdaptiveInterruptionDetector.js'; +export * from './interruption.js'; +export { InterruptionStreamSentinel } from './InterruptionStream.js'; +export type { InterruptionSentinel } from './InterruptionStream.js'; From 049ca1726bbb6f3ed749c9b889dbd794effe7b4a Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 15:43:20 +0100 Subject: [PATCH 07/25] more wip --- .changeset/config.json | 7 - .../AdaptiveInterruptionDetector.ts | 19 +- .../interruption/InterruptionStream.ts | 4 +- agents/src/voice/agent_activity.ts | 61 +++++++ agents/src/voice/agent_session.ts | 24 ++- agents/src/voice/audio_recognition.ts | 162 +++++++++++++++++- examples/src/adaptive_interruption.ts | 106 ++++++++++++ 7 files changed, 369 insertions(+), 14 deletions(-) create mode 100644 examples/src/adaptive_interruption.ts diff --git a/.changeset/config.json b/.changeset/config.json index af66336b2..6e26590ab 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -8,13 +8,6 @@ ], "commit": false, "ignore": ["livekit-agents-examples"], - "fixed": [ - [ - "@livekit/agents", - "@livekit/agents-plugin-*", - "@livekit/agents-plugins-test" - ] - ], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 04c5741e4..89c2a7b0b 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ 
b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -123,9 +123,24 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ }); } + /** + * Creates a new InterruptionStreamBase for internal use. + * The stream can receive audio frames and sentinels via pushFrame(). + * Use this when you need direct access to the stream for pushing frames. + */ + createStream(): InterruptionStreamBase { + const stream = new InterruptionStreamBase(this, {}); + this.streams.add(stream); + return stream; + } + + /** + * Creates a new interruption stream and returns a ReadableStream of InterruptionEvents. + * This is a convenience method for consuming interruption events without needing + * to manage the underlying stream directly. + */ stream(): ReadableStream { - const httpStream = new InterruptionStreamBase(this, {}); - this.streams.add(httpStream); + const httpStream = this.createStream(); const transformer = new TransformStream({ transform: (chunk, controller) => { if (chunk.type === InterruptionEventType.INTERRUPTION) { diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index d1f1defac..fc1d5333e 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -1,6 +1,6 @@ import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; -import type { Span } from '@opentelemetry/sdk-trace-base'; -import { traceTypes } from 'agents/src/telemetry/index.js'; +import type { Span } from '@opentelemetry/api'; +import { traceTypes } from '../../telemetry/index.js'; import { type ReadableStream, TransformStream, WritableStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 3a0713329..3300f68f1 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -41,6 +41,8 @@ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js import { splitWords } from '../tokenize/basic/word.js'; import { TTS, type TTSError } from '../tts/tts.js'; import { Future, Task, cancelAndWait, waitFor } from '../utils.js'; +import type { InterruptionEvent } from '../inference/interruption/interruption.js'; +import { InterruptionEventType } from '../inference/interruption/interruption.js'; import { VAD, type VADEvent } from '../vad.js'; import type { Agent, ModelSettings } from './agent.js'; import { StopResponse, asyncLocalStorage } from './agent.js'; @@ -112,6 +114,24 @@ export class AgentActivity implements RecognitionHooks { _mainTask?: Task; _userTurnCompletedTask?: Promise; + /** + * Notify that agent started speaking. + * This enables interruption detection in AudioRecognition. + * @internal + */ + notifyAgentSpeechStarted(): void { + this.audioRecognition?.onStartOfAgentSpeech(); + } + + /** + * Notify that agent stopped speaking. + * This disables interruption detection in AudioRecognition. + * @internal + */ + notifyAgentSpeechEnded(): void { + this.audioRecognition?.onEndOfAgentSpeech(); + } + constructor(agent: Agent, agentSession: AgentSession) { this.agent = agent; this.agentSession = agentSession; @@ -292,6 +312,7 @@ export class AgentActivity implements RecognitionHooks { // Disable stt node if stt is not provided stt: this.stt ? 
(...args) => this.agent.sttNode(...args) : undefined,
       vad: this.vad,
+      interruptionDetector: this.agentSession.interruptionDetector,
       turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection,
       turnDetectionMode: this.turnDetectionMode,
       minEndpointingDelay: this.agentSession.options.minEndpointingDelay,
@@ -697,6 +718,46 @@ export class AgentActivity implements RecognitionHooks {
     }
   }
 
+  onInterruption(ev: InterruptionEvent): void {
+    if (ev.type !== InterruptionEventType.INTERRUPTION) {
+      // Only handle actual interruptions, not overlap_speech_ended events
+      return;
+    }
+
+    this.logger.info(
+      {
+        probability: ev.probability,
+        detectionDelay: ev.detectionDelay,
+        totalDuration: ev.totalDuration,
+      },
+      'adaptive interruption detected',
+    );
+
+    // Similar to onVADInferenceDone but triggered by the adaptive interruption detector
+    if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') {
+      return;
+    }
+
+    if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
+      return;
+    }
+
+    this.realtimeSession?.startUserActivity();
+
+    if (
+      this._currentSpeech &&
+      !this._currentSpeech.interrupted &&
+      this._currentSpeech.allowInterruptions
+    ) {
+      this.logger.info(
+        { 'speech id': this._currentSpeech.id },
+        'speech interrupted by adaptive interruption detector',
+      );
+      this.realtimeSession?.interrupt();
+      this._currentSpeech.interrupt();
+    }
+  }
+
   onInterimTranscript(ev: SpeechEvent): void {
     if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
       // skip stt transcription if userTranscription is enabled on the realtime model
diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
index ad349a122..bb8325a80 100644
--- a/agents/src/voice/agent_session.ts
+++ b/agents/src/voice/agent_session.ts
@@ -15,6 +15,7 @@ import {
   type STTModelString,
   type TTSModelString,
 } from '../inference/index.js';
+import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
 import { type JobContext, getJobContext } from '../job.js';
 import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
 import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
@@ -106,6 +107,7 @@ export type AgentSessionOptions = {
   vad?: VAD;
   llm?: LLM | RealtimeModel | LLMModels;
   tts?: TTS | TTSModelString;
+  interruptionDetector?: AdaptiveInterruptionDetector;
   userData?: UserData;
   voiceOptions?: Partial;
   connOptions?: SessionConnectOptions;
@@ -167,6 +169,8 @@ export class AgentSession<
   /** @internal - Timestamp when the session started (milliseconds) */
   _startedAt?: number;
 
+  interruptionDetector?: AdaptiveInterruptionDetector;
+
   constructor(opts: AgentSessionOptions) {
     super();
 
@@ -176,6 +180,7 @@ export class AgentSession<
       llm,
       tts,
       turnDetection,
+      interruptionDetector,
       userData,
       voiceOptions = defaultVoiceOptions,
       connOptions,
@@ -212,6 +217,7 @@ export class AgentSession<
     }
 
     this.turnDetection = turnDetection;
+    this.interruptionDetector = interruptionDetector;
     this._userData = userData;
 
     // configurable IO
@@ -637,6 +643,8 @@ export class AgentSession<
       return;
     }
 
+    const oldState = this._agentState;
+
    if (state === 'speaking') {
       // Reset error counts when agent starts speaking
       this.llmErrorCounts = 0;
@@ -651,13 +659,25 @@ export class AgentSession<
         // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available
         // (Ref: Python agent_session.py line 1161-1164)
       }
+
+      // Notify AudioRecognition that agent started speaking (for interruption detection)
+      this.activity?.notifyAgentSpeechStarted();
+    } else if (oldState === 'speaking') {
+      // Agent stopped speaking
+      if (this.agentSpeakingSpan !== undefined) {
+        // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
+        this.agentSpeakingSpan.end();
+        this.agentSpeakingSpan = undefined;
+      }
+
+      // Notify AudioRecognition that agent stopped speaking (for interruption detection)
+      this.activity?.notifyAgentSpeechEnded();
     } else if (this.agentSpeakingSpan !== undefined) {
-      // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available
+      // Non-speaking to non-speaking transition but span is still open
       this.agentSpeakingSpan.end();
       this.agentSpeakingSpan = undefined;
     }
 
-    const oldState = this._agentState;
     this._agentState = state;
 
     // Handle user away timer based on state changes
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts
index 0382b1fd5..8206cffe7 100644
--- a/agents/src/voice/audio_recognition.ts
+++ b/agents/src/voice/audio_recognition.ts
@@ -5,6 +5,12 @@ import { AudioFrame } from '@livekit/rtc-node';
 import type { Context, Span } from '@opentelemetry/api';
 import type { WritableStreamDefaultWriter } from 'node:stream/web';
 import { ReadableStream } from 'node:stream/web';
+import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js';
+import {
+  InterruptionStreamBase,
+  InterruptionStreamSentinel,
+} from '../inference/interruption/InterruptionStream.js';
+import type { InterruptionEvent } from '../inference/interruption/interruption.js';
 import { type ChatContext } from '../llm/chat_context.js';
 import { log } from '../log.js';
 import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js';
@@ -39,6 +45,7 @@ export interface RecognitionHooks {
   onFinalTranscript: (ev: SpeechEvent) => void;
   onEndOfTurn: (info: EndOfTurnInfo) => Promise;
   onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void;
+  onInterruption: (ev: InterruptionEvent) => void;
 
   retrieveChatCtx: () => ChatContext;
 }
@@ -53,6 +60,7 @@ export interface AudioRecognitionOptions {
   recognitionHooks: RecognitionHooks;
   stt?: STTNode;
   vad?: VAD;
+  interruptionDetector?: AdaptiveInterruptionDetector;
   turnDetector?: _TurnDetector;
   turnDetectionMode?: Exclude;
   minEndpointingDelay: number;
@@ -88,6 +96,7 @@ export class AudioRecognition {
 
   private vadInputStream: ReadableStream;
   private sttInputStream: ReadableStream;
+  private interruptionInputStream: ReadableStream;
   private silenceAudioTransform = new IdentityTransform();
   private silenceAudioWriter: WritableStreamDefaultWriter;
 
@@ -96,11 +105,19 @@ export class AudioRecognition {
   private commitUserTurnTask?: Task;
   private vadTask?: Task;
   private sttTask?: Task;
+  private interruptionTask?: Task;
+
+  // interruption detection
+  private interruptionDetector?: AdaptiveInterruptionDetector;
+  private interruptionStream?: InterruptionStreamBase;
+  private interruptionEnabled = false;
+  private agentSpeaking = false;
 
   constructor(opts: AudioRecognitionOptions) {
     this.hooks = opts.recognitionHooks;
     this.stt = opts.stt;
     this.vad = opts.vad;
+    this.interruptionDetector = opts.interruptionDetector;
     this.turnDetector = opts.turnDetector;
     this.turnDetectionMode = opts.turnDetectionMode;
     this.minEndpointingDelay = opts.minEndpointingDelay;
@@ -108,10 +125,15 @@ export class AudioRecognition {
     this.lastLanguage = undefined;
     this.rootSpanContext = opts.rootSpanContext;
 
+    // Interruption detection is only enabled if both detector and VAD are provided
+    this.interruptionEnabled = this.interruptionDetector !== undefined && this.vad !== undefined;
+
     this.deferredInputStream = new DeferredReadableStream();
-    const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee();
+    const [vadInputStream, rest] = this.deferredInputStream.stream.tee();
+    const [sttInputStream, interruptionInputStream] = rest.tee();
     this.vadInputStream = vadInputStream;
     this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable);
+    this.interruptionInputStream = interruptionInputStream;
     this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter();
   }
 
@@ -135,6 +157,15 @@ export class AudioRecognition {
     this.sttTask.result.catch((err) => {
       this.logger.error(`Error running STT task: ${err}`);
     });
+
+    if (this.interruptionEnabled && this.interruptionDetector) {
+      this.interruptionTask = Task.from(({ signal }) =>
+        this.createInterruptionTask(this.interruptionDetector!, signal),
+      );
+      this.interruptionTask.result.catch((err) => {
+        this.logger.error(`Error running interruption task: ${err}`);
+      });
+    }
   }
 
   private async onSTTEvent(ev: SpeechEvent) {
@@ -577,6 +608,11 @@ export class AudioRecognition {
           this.sampleRate = ev.frames[0].sampleRate;
         }
 
+        // If agent is speaking, user speech is overlap - trigger interruption detection
+        if (this.agentSpeaking && this.interruptionEnabled) {
+          this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan);
+        }
+
        this.bounceEOUTask?.cancel();
        break;
      case VADEventType.INFERENCE_DONE:
@@ -597,6 +633,11 @@ export class AudioRecognition {
        // when VAD fires END_OF_SPEECH, it already waited for the silence_duration
        this.speaking = false;
 
+        // If we were in overlap speech (agent speaking + user speaking), end it
+        if (this.agentSpeaking && this.interruptionEnabled) {
+          this.onEndOfOverlapSpeech();
+        }
+
        if (
          this.vadBaseTurnDetection ||
          (this.turnDetectionMode === 'stt' && this.userTurnCommitted)
@@ -614,6 +655,123 @@ export class AudioRecognition {
     }
   }
 
+  private async createInterruptionTask(
+    interruptionDetector: AdaptiveInterruptionDetector,
+    signal: AbortSignal,
+  ) {
+    // Create the interruption stream from the detector
+    this.interruptionStream = interruptionDetector.createStream();
+
+    // Forward audio frames to the interruption stream
+    const reader = this.interruptionInputStream.getReader();
+
+    const forwardTask = (async () => {
+      try {
+        while (!signal.aborted) {
+          const { done, value: frame } = await reader.read();
+          if (done) break;
+          await this.interruptionStream?.pushFrame(frame);
+        }
+      } catch (e) {
+        if (!signal.aborted) {
+          this.logger.error(e, 'Error forwarding audio to interruption stream');
+        }
+      } finally {
+        reader.releaseLock();
+      }
+    })();
+
+    // Read interruption events from the stream
+    const eventStream = this.interruptionStream.stream;
+    const eventReader = eventStream.getReader();
+
+    const abortHandler = () => {
+      eventReader.releaseLock();
+      this.interruptionStream?.close();
+      signal.removeEventListener('abort', abortHandler);
+    };
+    signal.addEventListener('abort', abortHandler);
+
+    try {
+      while (!signal.aborted) {
+        const { done, value: ev } = await eventReader.read();
+        if (done) break;
+
+        this.logger.debug({ type: ev.type, probability: ev.probability }, 'Interruption event');
+        this.hooks.onInterruption(ev);
+      }
+    } catch (e) {
+      if (!signal.aborted) {
+        this.logger.error(e, 'Error in interruption task');
+      }
+    } finally {
+      this.logger.debug('Interruption task closed');
+      await forwardTask;
+    }
+  }
+
+  /**
+   * Called when the agent starts speaking.
+   * Enables interruption detection by sending the agent-speech-started sentinel.
+   */
+  onStartOfAgentSpeech(): void {
+    this.agentSpeaking = true;
+
+    if (!this.interruptionEnabled || !this.interruptionStream) {
+      return;
+    }
+
+    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechStarted());
+  }
+
+  /**
+   * Called when the agent stops speaking.
+   * Disables interruption detection by sending the agent-speech-ended sentinel.
+   */
+  onEndOfAgentSpeech(): void {
+    if (!this.interruptionEnabled || !this.interruptionStream) {
+      this.agentSpeaking = false;
+      return;
+    }
+
+    this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechEnded());
+
+    if (this.agentSpeaking) {
+      // No interruption was detected, end the overlap inference (idempotent)
+      this.onEndOfOverlapSpeech();
+    }
+
+    this.agentSpeaking = false;
+  }
+
+  /**
+   * Called when user starts speaking while agent is speaking (overlap speech).
+   * This triggers the interruption detection inference.
+   */
+  onStartOfOverlapSpeech(speechDuration: number, userSpeakingSpan?: Span): void {
+    if (!this.interruptionEnabled || !this.interruptionStream) {
+      return;
+    }
+
+    if (this.agentSpeaking && userSpeakingSpan) {
+      this.interruptionStream.pushFrame(
+        InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, userSpeakingSpan),
+      );
+    }
+  }
+
+  /**
+   * Called when user stops speaking during overlap.
+   * This ends the interruption detection inference for this overlap period.
+   */
+  onEndOfOverlapSpeech(): void {
+    if (!this.interruptionEnabled || !this.interruptionStream) {
+      return;
+    }
+
+    this.interruptionStream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded());
+  }
+
   setInputAudioStream(audioStream: ReadableStream) {
     this.deferredInputStream.setSource(audioStream);
   }
@@ -686,6 +844,8 @@ export class AudioRecognition {
     await this.sttTask?.cancelAndWait();
     await this.vadTask?.cancelAndWait();
     await this.bounceEOUTask?.cancelAndWait();
+    await this.interruptionTask?.cancelAndWait();
+    await this.interruptionStream?.close();
   }
 
   private _endUserTurnSpan({
diff --git a/examples/src/adaptive_interruption.ts b/examples/src/adaptive_interruption.ts
new file mode 100644
index 000000000..4b02e688b
--- /dev/null
+++ b/examples/src/adaptive_interruption.ts
@@ -0,0 +1,106 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * This example demonstrates how to use the AdaptiveInterruptionDetector
+ * for detecting user interruptions during agent speech.
+ *
+ * The detector analyzes overlapping speech (when user speaks while agent is speaking)
+ * and determines whether the user intends to interrupt or is just providing backchannel
+ * feedback (like "uh-huh", "okay", etc).
+ *
+ * The interruption detection is integrated into AudioRecognition and works automatically
+ * when the detector is provided along with VAD. It:
+ * 1. Forwards audio frames to the detector when the agent is speaking
+ * 2. Triggers overlap detection when VAD detects user speech during agent speech
+ * 3. Emits interruption events that can be handled to stop/pause agent speech
+ */
+import {
+  AdaptiveInterruptionDetector,
+  type JobContext,
+  type JobProcess,
+  WorkerOptions,
+  cli,
+  defineAgent,
+  log,
+  voice,
+} from '@livekit/agents';
+import * as silero from '@livekit/agents-plugin-silero';
+import { fileURLToPath } from 'node:url';
+
+export default defineAgent({
+  prewarm: async (proc: JobProcess) => {
+    proc.userData.vad = await silero.VAD.load();
+  },
+  entry: async (ctx: JobContext) => {
+    const logger = log();
+    const vad = ctx.proc.userData.vad as silero.VAD;
+
+    await ctx.connect();
+
+    // Create the adaptive interruption detector with custom options
+    const interruptionDetector = new AdaptiveInterruptionDetector({
+      // Threshold for interruption classification (0-1)
+      // Higher = less sensitive, lower = more sensitive
+      threshold: 0.65,
+      // Minimum duration of overlap speech to consider as potential interruption
+      minInterruptionDuration: 0.05,
+      // Maximum audio duration to analyze (including prefix)
+      maxAudioDuration: 3.0,
+      // Audio context to include before overlap started
+      audioPrefixDuration: 0.5,
+      // How often to run inference during overlap
+      detectionInterval: 0.1,
+    });
+
+    // Listen for interruption events on the detector (optional - for logging/metrics)
+    interruptionDetector.on('interruptionDetected', () => {
+      logger.info('Interruption detected via detector event');
+    });
+
+    interruptionDetector.on('overlapSpeechDetected', () => {
+      logger.debug('Overlap speech ended without interruption (backchannel)');
+    });
+
+    // Create the agent
+    const agent = new voice.Agent({
+      instructions: `You are a helpful assistant that demonstrates interruption detection.
+        Speak naturally and respond to the user. When you are interrupted,
+        you will stop speaking and listen to the user.`,
+    });
+
+    // Create the session with interruption detection enabled
+    // The detector is passed to AgentSession which wires it through to AudioRecognition
+    const session = new voice.AgentSession({
+      llm: 'openai/gpt-4.1-mini',
+      stt: 'deepgram/nova-3',
+      tts: 'cartesia/sonic-2:c45bc5ec-dc68-4feb-8829-6e6b2748095d',
+      vad,
+      // Pass the interruption detector
+      interruptionDetector,
+    });
+
+    // Start the session
+    await session.start({
+      agent,
+      room: ctx.room,
+    });
+
+    // // Example: Dynamically adjust threshold based on context
+    // // This could be useful to adapt to different conversation styles
+    // setTimeout(() => {
+    //   logger.info('Adjusting interruption threshold for more sensitive detection');
+    //   interruptionDetector.updateOptions({
+    //     threshold: 0.5, // More sensitive to interruptions
+    //     minInterruptionDuration: 0.03, // Detect shorter interruptions
+    //   });
+    // }, 30000);
+
+    session.say(
+      'Hello! I can detect when you want to interrupt me versus when you are just saying things like uh-huh or okay. 
Try talking while I am speaking to see how it works!', + ); + }, +}); + +cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); From 0af5c0ace89376cbfc65e8a6b304632e453112d1 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Tue, 20 Jan 2026 15:44:07 +0100 Subject: [PATCH 08/25] changeset --- .changeset/shiny-eels-throw.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/shiny-eels-throw.md diff --git a/.changeset/shiny-eels-throw.md b/.changeset/shiny-eels-throw.md new file mode 100644 index 000000000..df3e21f67 --- /dev/null +++ b/.changeset/shiny-eels-throw.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': patch +--- + +barge in From 094b1a0aeb52b5def043e6bf1e316ddf315ead41 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Wed, 21 Jan 2026 09:42:48 +0100 Subject: [PATCH 09/25] local testing --- agents/src/inference/interruption/defaults.ts | 2 +- examples/src/adaptive_interruption.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index e5e2ba6b3..2d6eeae3c 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -9,7 +9,7 @@ export const DETECTION_INTERVAL = 0.1; export const REMOTE_INFERENCE_TIMEOUT = 1.0; export const SAMPLE_RATE = 16000; export const FRAMES_PER_SECOND = 40; -export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; +export const DEFAULT_BASE_URL = 'http://localhost:8080'; export const apiConnectDefaults: ApiConnectOptions = { maxRetries: 3, diff --git a/examples/src/adaptive_interruption.ts b/examples/src/adaptive_interruption.ts index 4b02e688b..b0c6906fe 100644 --- a/examples/src/adaptive_interruption.ts +++ b/examples/src/adaptive_interruption.ts @@ -60,7 +60,7 @@ export default defineAgent({ }); interruptionDetector.on('overlapSpeechDetected', () => { - logger.debug('Overlap speech ended without interruption (backchannel)'); + logger.info('Overlap speech ended without interruption (backchannel)'); }); // Create the agent From 732d7b4465f9fa6ab818d543352c42a0f0acbe8d Mon Sep 17 00:00:00 2001 From: lukasIO Date: Wed, 21 Jan 2026 16:19:58 +0100 Subject: [PATCH 10/25] smaller bugfixes --- .../AdaptiveInterruptionDetector.ts | 42 ++++++++----------- .../interruption/InterruptionStream.ts | 16 +++---- .../inference/interruption/http_transport.ts | 6 +-- .../inference/interruption/interruption.ts | 10 ++--- agents/src/inference/utils.ts | 2 +- agents/src/voice/audio_recognition.ts | 7 ++-- examples/src/adaptive_interruption.ts | 3 ++ 7 files changed, 41 insertions(+), 45 deletions(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 89c2a7b0b..589a3ff64 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -1,6 +1,7 @@ import type { TypedEventEmitter } from '@livekit/typed-emitter'; +import { log } from 'agents/src/log.js'; import EventEmitter from 'events'; -import { type ReadableStream, TransformStream } from 'stream/web'; +import { TransformStream } from 'stream/web'; import { InterruptionStreamBase } from './InterruptionStream.js'; import { DEFAULT_BASE_URL, @@ -129,30 +130,21 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ * Use this when you need direct access to the stream for pushing frames. 
*/ createStream(): InterruptionStreamBase { - const stream = new InterruptionStreamBase(this, {}); - this.streams.add(stream); - return stream; - } - - /** - * Creates a new interruption stream and returns a ReadableStream of InterruptionEvents. - * This is a convenience method for consuming interruption events without needing - * to manage the underlying stream directly. - */ - stream(): ReadableStream { - const httpStream = this.createStream(); - const transformer = new TransformStream({ - transform: (chunk, controller) => { - if (chunk.type === InterruptionEventType.INTERRUPTION) { - this.emit('interruptionDetected'); // TODO payload - } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { - this.emit('overlapSpeechDetected'); // TODO payload - } - controller.enqueue(chunk); - }, - }); - const stream = httpStream.stream.pipeThrough(transformer); - return stream; + const streamBase = new InterruptionStreamBase(this, {}); + this.streams.add(streamBase); + // const transformer = new TransformStream({ + // transform: (chunk, controller) => { + // log().info('adaptive interruption detection stream transformer', chunk); + // if (chunk.type === InterruptionEventType.INTERRUPTION) { + // this.emit('interruptionDetected'); // TODO payload + // } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { + // this.emit('overlapSpeechDetected'); // TODO payload + // } + // controller.enqueue(chunk); + // }, + // }); + // streamBase.stream().pipeThrough(transformer); + return streamBase; } updateOptions(options: { threshold?: number; minInterruptionDuration?: number }): void { diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index fc1d5333e..fab9f9a97 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -1,9 +1,9 @@ import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Span } from '@opentelemetry/api'; -import { traceTypes } from '../../telemetry/index.js'; import { type ReadableStream, TransformStream, WritableStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { traceTypes } from '../../telemetry/index.js'; import { createAccessToken } from '../utils.js'; import type { AdaptiveInterruptionDetector, @@ -164,7 +164,7 @@ export class InterruptionStreamBase { controller.enqueue(audioSlice); } } else if (chunk.type === 'agent-speech-started') { - log().debug('agent speech started'); + log().info('agent speech started'); agentSpeechStarted = true; overlapSpeechStarted = false; @@ -172,7 +172,7 @@ export class InterruptionStreamBase { startIdx = 0; cache.clear(); } else if (chunk.type === 'agent-speech-ended') { - log().debug('agent speech ended'); + log().info('agent speech ended'); agentSpeechStarted = false; overlapSpeechStarted = false; @@ -181,7 +181,7 @@ export class InterruptionStreamBase { cache.clear(); } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { this.userSpeakingSpan = chunk.userSpeakingSpan; - log().debug('overlap speech started, starting interruption inference'); + log().info('overlap speech started, starting interruption inference'); overlapSpeechStarted = true; accumulatedSamples = 0; // Include both speech duration and audio prefix duration for context @@ -197,7 +197,7 @@ export class InterruptionStreamBase { startIdx = shiftSize; 
cache.clear(); } else if (chunk.type === 'overlap-speech-ended') { - log().debug('overlap speech ended'); + log().info('overlap speech ended'); if (overlapSpeechStarted) { this.userSpeakingSpan = undefined; @@ -248,14 +248,13 @@ export class InterruptionStreamBase { token: await createAccessToken(this.options.apiKey, this.options.apiSecret), }, ); - console.log('received inference response', resp); const { createdAt, isBargein, probabilities, predictionDuration } = resp; const entry = new InterruptionCacheEntry({ createdAt, probabilities, isInterruption: isBargein, speechInput: chunk, - totalDuration: (performance.now() - createdAt) / 1e9, + totalDuration: (performance.now() - createdAt) / 1000, detectionDelay: Date.now() - this.overlapSpeechStartedAt, predictionDuration, }); @@ -276,6 +275,7 @@ export class InterruptionStreamBase { detectionDelay: entry.detectionDelay, probability: entry.probability, }; + log().info(`emitting interruption event: ${event.type}`); this.eventStream.write(event); } }, @@ -309,7 +309,7 @@ export class InterruptionStreamBase { return this.resampler; } - get stream(): ReadableStream { + stream(): ReadableStream { return this.eventStream.stream(); } diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index c1f22a569..fc131ed4f 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -21,7 +21,7 @@ export interface PredictEndpointResponse { export interface PredictResponse { createdAt: number; isBargein: boolean; - probabilities: Float32Array; + probabilities: number[]; predictionDuration: number; } @@ -55,7 +55,7 @@ export async function predictHTTP( return { createdAt: created_at, isBargein: is_bargein, - probabilities: new Float32Array(probabilities), - predictionDuration: (performance.now() - createdAt) / 1e9, + probabilities, + predictionDuration: (performance.now() - createdAt) / 1000, }; } diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts index e415f6d98..52783c895 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/interruption.ts @@ -14,7 +14,7 @@ export interface InterruptionEvent { detectionDelay: number; overlapSpeechStartedAt?: number; speechInput?: Int16Array; - probabilities?: Float32Array; + probabilities?: number[]; probability: number; } @@ -39,7 +39,7 @@ export class InterruptionDetectionError extends Error { } function estimateProbability( - probabilities: Float32Array, + probabilities: number[], windowSize: number = MIN_INTERRUPTION_DURATION, ): number { const minWindow = Math.ceil(windowSize / 0.025); // 25ms per frame @@ -47,7 +47,7 @@ function estimateProbability( return 0; } - return slidingWindowMinMax(probabilities, windowSize); + return slidingWindowMinMax(probabilities, minWindow); } /** @@ -59,7 +59,7 @@ export class InterruptionCacheEntry { readonly predictionDuration: number; readonly detectionDelay: number; readonly speechInput?: Int16Array; - readonly probabilities?: Float32Array; + readonly probabilities?: number[]; readonly isInterruption?: boolean; readonly probability: number; @@ -69,7 +69,7 @@ export class InterruptionCacheEntry { totalDuration?: number; predictionDuration?: number; detectionDelay?: number; - probabilities?: Float32Array; + probabilities?: number[]; isInterruption?: boolean; }) { this.createdAt = params.createdAt; diff --git a/agents/src/inference/utils.ts 
b/agents/src/inference/utils.ts index 38c9faa5f..e898d4de1 100644 --- a/agents/src/inference/utils.ts +++ b/agents/src/inference/utils.ts @@ -65,7 +65,7 @@ export async function connectWs( }); } -export function slidingWindowMinMax(probabilities: Float32Array, minWindow: number): number { +export function slidingWindowMinMax(probabilities: number[], minWindow: number): number { if (probabilities.length < minWindow) { return -Infinity; } diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 8206cffe7..24c109891 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -609,7 +609,8 @@ export class AudioRecognition { } // If agent is speaking, user speech is overlap - trigger interruption detection - if (this.agentSpeaking && this.interruptionEnabled) { + if (this.agentSpeaking) { + // TODO re-enable check for this.interruptionEnabled this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan); } @@ -682,7 +683,7 @@ export class AudioRecognition { })(); // Read interruption events from the stream - const eventStream = this.interruptionStream.stream; + const eventStream = this.interruptionStream.stream(); const eventReader = eventStream.getReader(); const abortHandler = () => { @@ -697,7 +698,7 @@ export class AudioRecognition { const { done, value: ev } = await eventReader.read(); if (done) break; - this.logger.debug({ type: ev.type, probability: ev.probability }, 'Interruption event'); + this.logger.info({ type: ev.type, probability: ev.probability }, 'Interruption event'); this.hooks.onInterruption(ev); } } catch (e) { diff --git a/examples/src/adaptive_interruption.ts b/examples/src/adaptive_interruption.ts index b0c6906fe..6e6700f58 100644 --- a/examples/src/adaptive_interruption.ts +++ b/examples/src/adaptive_interruption.ts @@ -79,6 +79,9 @@ export default defineAgent({ vad, // Pass the interruption detector interruptionDetector, + voiceOptions: { + allowInterruptions: false, + }, }); // Start the session From d52d8afa1239dd588bd391d21b0272b9309bfe61 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Wed, 21 Jan 2026 17:38:14 +0100 Subject: [PATCH 11/25] more bug fixes and back pressure --- .../interruption/InterruptionStream.ts | 80 +++++++++++++------ 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index fab9f9a97..087a55abe 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -1,6 +1,6 @@ import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Span } from '@opentelemetry/api'; -import { type ReadableStream, TransformStream, WritableStream } from 'stream/web'; +import { type ReadableStream, TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; import { traceTypes } from '../../telemetry/index.js'; @@ -92,7 +92,7 @@ function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { export class InterruptionStreamBase { private inputStream: StreamChannel; - private eventStream: StreamChannel; + private eventStream: ReadableStream; private resampler?: AudioResampler; @@ -112,16 +112,14 @@ export class InterruptionStreamBase { InterruptionDetectionError >(); - this.eventStream = createStreamChannel(); - this.model = model; this.options = model.options; 
this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; - this.setupTransform(); + this.eventStream = this.setupTransform(); } - private setupTransform() { + private setupTransform(): ReadableStream { let agentSpeechStarted = false; let startIdx = 0; let accumulatedSamples = 0; @@ -131,7 +129,11 @@ export class InterruptionStreamBase { Math.ceil(this.options.maxAudioDuration * this.options.sampleRate), ).fill(0); - const transformer = new TransformStream( + // First transform: process input frames/sentinels and output audio slices or events + const audioTransformer = new TransformStream< + InterruptionSentinel | AudioFrame, + Int16Array | InterruptionEvent + >( { transform: (chunk, controller) => { if (chunk instanceof AudioFrame) { @@ -159,8 +161,13 @@ export class InterruptionStreamBase { ) { // Send a copy of the audio data up to startIdx for inference const audioSlice = inferenceS16Data.slice(0, startIdx); - // TODO: send to data channel - dataChan.send(audioSlice); accumulatedSamples = 0; + const sinceOverlapStart = this.overlapSpeechStartedAt + ? Date.now() - this.overlapSpeechStartedAt + : 0; + log().info( + `audioTransformer: enqueuing audio slice for inference, ${sinceOverlapStart}ms since overlap start, ${audioSlice.length} samples`, + ); controller.enqueue(audioSlice); } } else if (chunk.type === 'agent-speech-started') { @@ -220,7 +227,8 @@ export class InterruptionStreamBase { predictionDuration: latestEntry.predictionDuration, probability: latestEntry.probability, }; - this.eventStream.write(event); + controller.enqueue(event); + overlapSpeechStarted = false; } } else if (chunk.type === 'flush') { log().debug('flushing'); @@ -228,17 +236,31 @@ export class InterruptionStreamBase { } }, }, - { highWaterMark: Number.MAX_SAFE_INTEGER }, - { highWaterMark: Number.MAX_SAFE_INTEGER }, + { highWaterMark: 32 }, + { highWaterMark: 32 }, ); - const httpPostWriter = new WritableStream( + // Second transform: HTTP transport - converts audio slices to events, passes through existing events + const httpTransport = new TransformStream( { - // Implement the sink - write: async (chunk) => { + transform: async (chunk, controller) => { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + log().info( + `httpTransport: passing through event type=${chunk.type}, detectionDelay=${chunk.detectionDelay}ms`, + ); + controller.enqueue(chunk); + return; + } + if (!this.overlapSpeechStartedAt) { return; } + const httpStartTime = Date.now(); + const sinceOverlapStart = httpStartTime - this.overlapSpeechStartedAt; + log().info( + `httpTransport: starting HTTP prediction, ${sinceOverlapStart}ms since overlap start`, + ); const resp = await predictHTTP( chunk, { threshold: this.options.threshold, minFrames: this.options.minFrames }, @@ -248,7 +270,11 @@ export class InterruptionStreamBase { token: await createAccessToken(this.options.apiKey, this.options.apiSecret), }, ); + const httpDuration = Date.now() - httpStartTime; const { createdAt, isBargein, probabilities, predictionDuration } = resp; + log().info( + `httpTransport: HTTP prediction completed in ${httpDuration}ms, isBargein=${isBargein}, predictionDuration=${predictionDuration}ms`, + ); const entry = new InterruptionCacheEntry({ createdAt, probabilities, @@ -275,21 +301,20 @@ export class InterruptionStreamBase { detectionDelay: entry.detectionDelay, probability: entry.probability, }; - log().info(`emitting interruption event: ${event.type}`); - this.eventStream.write(event); + log().info( + `httpTransport: 
emitting interruption event, detectionDelay=${entry.detectionDelay}ms, totalDuration=${(entry.totalDuration * 1000).toFixed(0)}ms`, + ); + overlapSpeechStarted = false; + controller.enqueue(event); } }, - close() { - console.log('closing http writer'); - }, - abort(err) { - console.log('Sink error:', err); - }, }, - { highWaterMark: Number.MAX_SAFE_INTEGER }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, ); - this.inputStream.stream().pipeThrough(transformer).pipeTo(httpPostWriter); + // Pipeline: input -> audioTransformer -> httpTransport -> eventStream + return this.inputStream.stream().pipeThrough(audioTransformer).pipeThrough(httpTransport); } private ensureInputNotEnded() { @@ -310,7 +335,7 @@ export class InterruptionStreamBase { } stream(): ReadableStream { - return this.eventStream.stream(); + return this.eventStream; } async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { @@ -318,6 +343,11 @@ export class InterruptionStreamBase { if (!(frame instanceof AudioFrame)) { if (frame.type === 'overlap-speech-started') { this.overlapSpeechStartedAt = Date.now() - frame.speechDuration; + log().info( + `pushFrame: overlap-speech-started, speechDuration=${frame.speechDuration}ms, overlapSpeechStartedAt set to ${this.overlapSpeechStartedAt}`, + ); + } else { + log().info(`pushFrame: sentinel type=${frame.type}`); } return this.inputStream.write(frame); } else if (this.options.sampleRate !== frame.sampleRate) { From 4c4dbc87f1dc66ef8af791173b67f5e9f3a174a4 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Wed, 21 Jan 2026 19:25:45 +0100 Subject: [PATCH 12/25] better logging --- .../AdaptiveInterruptionDetector.ts | 35 +++++++------- .../interruption/InterruptionStream.ts | 46 +++++-------------- 2 files changed, 27 insertions(+), 54 deletions(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 589a3ff64..133387156 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -1,7 +1,6 @@ import type { TypedEventEmitter } from '@livekit/typed-emitter'; -import { log } from 'agents/src/log.js'; import EventEmitter from 'events'; -import { TransformStream } from 'stream/web'; +import { log } from '../../log.js'; import { InterruptionStreamBase } from './InterruptionStream.js'; import { DEFAULT_BASE_URL, @@ -9,11 +8,7 @@ import { SAMPLE_RATE, interruptionOptionDefaults, } from './defaults.js'; -import { - type InterruptionDetectionError, - type InterruptionEvent, - InterruptionEventType, -} from './interruption.js'; +import { type InterruptionDetectionError } from './interruption.js'; type InterruptionCallbacks = { interruptionDetected: () => void; @@ -40,7 +35,7 @@ export type AdaptiveInterruptionDetectorOptions = Partial; export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { options: InterruptionOptions; - private label: string; + private logger = log(); private streams: WeakSet; // TODO: Union of InterruptionHttpStream | InterruptionWebSocketStream constructor(options: AdaptiveInterruptionDetectorOptions = {}) { @@ -109,19 +104,21 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ minInterruptionDuration, }; - this.label = `${this.constructor.name}`; this.streams = new WeakSet(); - console.info('adaptive interruption detector initialized', { - baseUrl: this.options.baseUrl, - detectionInterval: 
this.options.detectionInterval, - audioPrefixDuration: this.options.audioPrefixDuration, - maxAudioDuration: this.options.maxAudioDuration, - minFrames: this.options.minFrames, - threshold: this.options.threshold, - inferenceTimeout: this.options.inferenceTimeout, - useProxy: this.options.useProxy, - }); + this.logger.debug( + { + baseUrl: this.options.baseUrl, + detectionInterval: this.options.detectionInterval, + audioPrefixDuration: this.options.audioPrefixDuration, + maxAudioDuration: this.options.maxAudioDuration, + minFrames: this.options.minFrames, + threshold: this.options.threshold, + inferenceTimeout: this.options.inferenceTimeout, + useProxy: this.options.useProxy, + }, + 'adaptive interruption detector initialized', + ); } /** diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 087a55abe..76e2b6ec2 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -106,6 +106,8 @@ export class InterruptionStreamBase { private model: AdaptiveInterruptionDetector; + private logger = log(); + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { this.inputStream = createStreamChannel< InterruptionSentinel | AudioFrame, @@ -162,25 +164,17 @@ export class InterruptionStreamBase { // Send a copy of the audio data up to startIdx for inference const audioSlice = inferenceS16Data.slice(0, startIdx); accumulatedSamples = 0; - const sinceOverlapStart = this.overlapSpeechStartedAt - ? Date.now() - this.overlapSpeechStartedAt - : 0; - log().info( - `audioTransformer: enqueuing audio slice for inference, ${sinceOverlapStart}ms since overlap start, ${audioSlice.length} samples`, - ); controller.enqueue(audioSlice); } } else if (chunk.type === 'agent-speech-started') { - log().info('agent speech started'); - + this.logger.debug('agent speech started'); agentSpeechStarted = true; overlapSpeechStarted = false; accumulatedSamples = 0; startIdx = 0; cache.clear(); } else if (chunk.type === 'agent-speech-ended') { - log().info('agent speech ended'); - + this.logger.debug('agent speech ended'); agentSpeechStarted = false; overlapSpeechStarted = false; accumulatedSamples = 0; @@ -188,7 +182,7 @@ export class InterruptionStreamBase { cache.clear(); } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { this.userSpeakingSpan = chunk.userSpeakingSpan; - log().info('overlap speech started, starting interruption inference'); + this.logger.debug('overlap speech started, starting interruption inference'); overlapSpeechStarted = true; accumulatedSamples = 0; // Include both speech duration and audio prefix duration for context @@ -204,13 +198,12 @@ export class InterruptionStreamBase { startIdx = shiftSize; cache.clear(); } else if (chunk.type === 'overlap-speech-ended') { - log().info('overlap speech ended'); - + this.logger.debug('overlap speech ended'); if (overlapSpeechStarted) { this.userSpeakingSpan = undefined; let latestEntry = Array.from(cache.values()).at(-1); if (!latestEntry) { - log().debug('no request made for overlap speech'); + this.logger.debug('no request made for overlap speech'); latestEntry = InterruptionCacheEntry.default(); } else { cache.delete(latestEntry.createdAt); @@ -231,8 +224,7 @@ export class InterruptionStreamBase { overlapSpeechStarted = false; } } else if (chunk.type === 'flush') { - log().debug('flushing'); - // do nothing + // no-op } }, }, @@ -246,9 +238,6 @@ export class 
InterruptionStreamBase { transform: async (chunk, controller) => { // Pass through InterruptionEvents unchanged if (!(chunk instanceof Int16Array)) { - log().info( - `httpTransport: passing through event type=${chunk.type}, detectionDelay=${chunk.detectionDelay}ms`, - ); controller.enqueue(chunk); return; } @@ -256,11 +245,6 @@ export class InterruptionStreamBase { if (!this.overlapSpeechStartedAt) { return; } - const httpStartTime = Date.now(); - const sinceOverlapStart = httpStartTime - this.overlapSpeechStartedAt; - log().info( - `httpTransport: starting HTTP prediction, ${sinceOverlapStart}ms since overlap start`, - ); const resp = await predictHTTP( chunk, { threshold: this.options.threshold, minFrames: this.options.minFrames }, @@ -270,11 +254,7 @@ export class InterruptionStreamBase { token: await createAccessToken(this.options.apiKey, this.options.apiSecret), }, ); - const httpDuration = Date.now() - httpStartTime; const { createdAt, isBargein, probabilities, predictionDuration } = resp; - log().info( - `httpTransport: HTTP prediction completed in ${httpDuration}ms, isBargein=${isBargein}, predictionDuration=${predictionDuration}ms`, - ); const entry = new InterruptionCacheEntry({ createdAt, probabilities, @@ -301,8 +281,9 @@ export class InterruptionStreamBase { detectionDelay: entry.detectionDelay, probability: entry.probability, }; - log().info( - `httpTransport: emitting interruption event, detectionDelay=${entry.detectionDelay}ms, totalDuration=${(entry.totalDuration * 1000).toFixed(0)}ms`, + this.logger.debug( + { detectionDelay: entry.detectionDelay, totalDuration: entry.totalDuration }, + 'interruption detected', ); overlapSpeechStarted = false; controller.enqueue(event); @@ -343,11 +324,6 @@ export class InterruptionStreamBase { if (!(frame instanceof AudioFrame)) { if (frame.type === 'overlap-speech-started') { this.overlapSpeechStartedAt = Date.now() - frame.speechDuration; - log().info( - `pushFrame: overlap-speech-started, speechDuration=${frame.speechDuration}ms, overlapSpeechStartedAt set to ${this.overlapSpeechStartedAt}`, - ); - } else { - log().info(`pushFrame: sentinel type=${frame.type}`); } return this.inputStream.write(frame); } else if (this.options.sampleRate !== frame.sampleRate) { From c27f8dcd51da731ca85f9f9b5e0eb256847f0a71 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Wed, 21 Jan 2026 20:13:19 +0100 Subject: [PATCH 13/25] refactor and update naming --- .../AdaptiveInterruptionDetector.ts | 42 +-- .../interruption/InterruptionStream.ts | 123 +++---- agents/src/inference/interruption/defaults.ts | 21 +- .../inference/interruption/http_transport.ts | 118 +++++- .../inference/interruption/interruption.ts | 6 +- .../interruption}/ws_transport.test.ts | 0 .../inference/interruption/ws_transport.ts | 346 ++++++++++++++++++ agents/src/utils/ws_transport.ts | 22 -- 8 files changed, 542 insertions(+), 136 deletions(-) rename agents/src/{utils => inference/interruption}/ws_transport.test.ts (100%) create mode 100644 agents/src/inference/interruption/ws_transport.ts delete mode 100644 agents/src/utils/ws_transport.ts diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 133387156..c20da246d 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -20,11 +20,11 @@ export interface InterruptionOptions { sampleRate: number; threshold: number; minFrames: number; - 
maxAudioDuration: number; - audioPrefixDuration: number; - detectionInterval: number; + maxAudioDurationInS: number; + audioPrefixDurationInS: number; + detectionIntervalInS: number; inferenceTimeout: number; - minInterruptionDuration: number; + minInterruptionDurationInS: number; baseUrl: string; apiKey: string; apiSecret: string; @@ -42,20 +42,20 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ super(); const { - maxAudioDuration, + maxAudioDurationInS, baseUrl, apiKey, apiSecret, useProxy: useProxyArg, - audioPrefixDuration, + audioPrefixDurationInS, threshold, - detectionInterval, + detectionIntervalInS, inferenceTimeout, - minInterruptionDuration, + minInterruptionDurationInS, } = { ...interruptionOptionDefaults, ...options }; - if (maxAudioDuration > 3.0) { - throw new Error('maxAudioDuration must be less than or equal to 3.0 seconds'); + if (maxAudioDurationInS > 3.0) { + throw new Error('maxAudioDurationInS must be less than or equal to 3.0 seconds'); } const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? DEFAULT_BASE_URL; @@ -92,16 +92,16 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ this.options = { sampleRate: SAMPLE_RATE, threshold, - minFrames: Math.ceil(minInterruptionDuration * FRAMES_PER_SECOND), - maxAudioDuration, - audioPrefixDuration, - detectionInterval, + minFrames: Math.ceil(minInterruptionDurationInS * FRAMES_PER_SECOND), + maxAudioDurationInS, + audioPrefixDurationInS, + detectionIntervalInS, inferenceTimeout, baseUrl: lkBaseUrl, apiKey: lkApiKey, apiSecret: lkApiSecret, useProxy, - minInterruptionDuration, + minInterruptionDurationInS, }; this.streams = new WeakSet(); @@ -109,9 +109,9 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ this.logger.debug( { baseUrl: this.options.baseUrl, - detectionInterval: this.options.detectionInterval, - audioPrefixDuration: this.options.audioPrefixDuration, - maxAudioDuration: this.options.maxAudioDuration, + detectionIntervalInS: this.options.detectionIntervalInS, + audioPrefixDurationInS: this.options.audioPrefixDurationInS, + maxAudioDurationInS: this.options.maxAudioDurationInS, minFrames: this.options.minFrames, threshold: this.options.threshold, inferenceTimeout: this.options.inferenceTimeout, @@ -144,12 +144,12 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ return streamBase; } - updateOptions(options: { threshold?: number; minInterruptionDuration?: number }): void { + updateOptions(options: { threshold?: number; minInterruptionDurationInS?: number }): void { if (options.threshold !== undefined) { this.options.threshold = options.threshold; } - if (options.minInterruptionDuration !== undefined) { - this.options.minFrames = Math.ceil(options.minInterruptionDuration * FRAMES_PER_SECOND); + if (options.minInterruptionDurationInS !== undefined) { + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); } } } diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 76e2b6ec2..7071ebb00 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -4,19 +4,19 @@ import { type ReadableStream, TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; import { traceTypes } from 
'../../telemetry/index.js'; -import { createAccessToken } from '../utils.js'; import type { AdaptiveInterruptionDetector, InterruptionOptions, } from './AdaptiveInterruptionDetector.js'; import { apiConnectDefaults } from './defaults.js'; -import { predictHTTP } from './http_transport.js'; +import { createHttpTransport } from './http_transport.js'; import { InterruptionCacheEntry, type InterruptionDetectionError, type InterruptionEvent, InterruptionEventType, } from './interruption.js'; +import { createWsTransport } from './ws_transport.js'; export interface AgentSpeechStarted { type: 'agent-speech-started'; @@ -28,7 +28,7 @@ export interface AgentSpeechEnded { export interface OverlapSpeechStarted { type: 'overlap-speech-started'; - speechDuration: number; + speechDurationInS: number; userSpeakingSpan: Span; } @@ -57,10 +57,10 @@ export class InterruptionStreamSentinel { } static overlapSpeechStarted( - speechDuration: number, + speechDurationInS: number, userSpeakingSpan: Span, ): OverlapSpeechStarted { - return { type: 'overlap-speech-started', speechDuration, userSpeakingSpan }; + return { type: 'overlap-speech-started', speechDurationInS, userSpeakingSpan }; } static overlapSpeechEnded(): OverlapSpeechEnded { @@ -126,11 +126,29 @@ export class InterruptionStreamBase { let startIdx = 0; let accumulatedSamples = 0; let overlapSpeechStarted = false; - const cache = new Map(); // TODO limit cache size + const cache = new Map(); const inferenceS16Data = new Int16Array( - Math.ceil(this.options.maxAudioDuration * this.options.sampleRate), + Math.ceil(this.options.maxAudioDurationInS * this.options.sampleRate), ).fill(0); + // State accessors for transport + const getState = () => ({ + overlapSpeechStarted, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + cache, + }); + const setState = (partial: { overlapSpeechStarted?: boolean }) => { + if (partial.overlapSpeechStarted !== undefined) { + overlapSpeechStarted = partial.overlapSpeechStarted; + } + }; + const handleSpanUpdate = (entry: InterruptionCacheEntry) => { + if (this.userSpeakingSpan) { + updateUserSpeakingSpan(this.userSpeakingSpan, entry); + this.userSpeakingSpan = undefined; + } + }; + // First transform: process input frames/sentinels and output audio slices or events const audioTransformer = new TransformStream< InterruptionSentinel | AudioFrame, @@ -150,7 +168,7 @@ export class InterruptionStreamBase { chunk, startIdx, inferenceS16Data, - this.options.maxAudioDuration, + this.options.maxAudioDurationInS, ); startIdx = result.startIdx; accumulatedSamples += result.samplesWritten; @@ -158,7 +176,7 @@ export class InterruptionStreamBase { // Send data for inference when enough samples accumulated during overlap if ( accumulatedSamples >= - Math.floor(this.options.detectionInterval * this.options.sampleRate) && + Math.floor(this.options.detectionIntervalInS * this.options.sampleRate) && overlapSpeechStarted ) { // Send a copy of the audio data up to startIdx for inference @@ -188,8 +206,8 @@ export class InterruptionStreamBase { // Include both speech duration and audio prefix duration for context const shiftSize = Math.min( startIdx, - Math.round(chunk.speechDuration * this.options.sampleRate) + - Math.round(this.options.audioPrefixDuration * this.options.sampleRate), + Math.round(chunk.speechDurationInS * this.options.sampleRate) + + Math.round(this.options.audioPrefixDurationInS * this.options.sampleRate), ); // Shift the buffer: copy the last `shiftSize` samples before startIdx // to the beginning of the buffer. 
This preserves recent audio context @@ -232,70 +250,23 @@ export class InterruptionStreamBase { { highWaterMark: 32 }, ); - // Second transform: HTTP transport - converts audio slices to events, passes through existing events - const httpTransport = new TransformStream( - { - transform: async (chunk, controller) => { - // Pass through InterruptionEvents unchanged - if (!(chunk instanceof Int16Array)) { - controller.enqueue(chunk); - return; - } - - if (!this.overlapSpeechStartedAt) { - return; - } - const resp = await predictHTTP( - chunk, - { threshold: this.options.threshold, minFrames: this.options.minFrames }, - { - baseUrl: this.options.baseUrl, - timeout: this.options.inferenceTimeout, - token: await createAccessToken(this.options.apiKey, this.options.apiSecret), - }, - ); - const { createdAt, isBargein, probabilities, predictionDuration } = resp; - const entry = new InterruptionCacheEntry({ - createdAt, - probabilities, - isInterruption: isBargein, - speechInput: chunk, - totalDuration: (performance.now() - createdAt) / 1000, - detectionDelay: Date.now() - this.overlapSpeechStartedAt, - predictionDuration, - }); - cache.set(createdAt, entry); - if (overlapSpeechStarted && entry.isInterruption) { - if (this.userSpeakingSpan) { - updateUserSpeakingSpan(this.userSpeakingSpan, entry); - } - const event: InterruptionEvent = { - type: InterruptionEventType.INTERRUPTION, - timestamp: Date.now(), - overlapSpeechStartedAt: this.overlapSpeechStartedAt, - isInterruption: entry.isInterruption, - speechInput: entry.speechInput, - probabilities: entry.probabilities, - totalDuration: entry.totalDuration, - predictionDuration: entry.predictionDuration, - detectionDelay: entry.detectionDelay, - probability: entry.probability, - }; - this.logger.debug( - { detectionDelay: entry.detectionDelay, totalDuration: entry.totalDuration }, - 'interruption detected', - ); - overlapSpeechStarted = false; - controller.enqueue(event); - } - }, - }, - { highWaterMark: 2 }, - { highWaterMark: 2 }, - ); - - // Pipeline: input -> audioTransformer -> httpTransport -> eventStream - return this.inputStream.stream().pipeThrough(audioTransformer).pipeThrough(httpTransport); + // Second transform: transport layer (HTTP or WebSocket based on useProxy) + const transportOptions = { + baseUrl: this.options.baseUrl, + apiKey: this.options.apiKey, + apiSecret: this.options.apiSecret, + sampleRate: this.options.sampleRate, + threshold: this.options.threshold, + minFrames: this.options.minFrames, + timeout: this.options.inferenceTimeout, + }; + + const transport = this.options.useProxy + ? 
createWsTransport(transportOptions, getState, setState, handleSpanUpdate) + : createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + + // Pipeline: input -> audioTransformer -> transport -> eventStream + return this.inputStream.stream().pipeThrough(audioTransformer).pipeThrough(transport); } private ensureInputNotEnded() { @@ -323,7 +294,7 @@ export class InterruptionStreamBase { this.ensureStreamsNotEnded(); if (!(frame instanceof AudioFrame)) { if (frame.type === 'overlap-speech-started') { - this.overlapSpeechStartedAt = Date.now() - frame.speechDuration; + this.overlapSpeechStartedAt = Date.now() - frame.speechDurationInS * 1000; } return this.inputStream.write(frame); } else if (this.options.sampleRate !== frame.sampleRate) { diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index 2d6eeae3c..60529b7d2 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -1,14 +1,15 @@ import type { InterruptionOptions } from './AdaptiveInterruptionDetector.js'; import type { ApiConnectOptions } from './InterruptionStream.js'; -export const MIN_INTERRUPTION_DURATION = 0.025 * 2; // 25ms per frame, 2 consecutive frames +export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames export const THRESHOLD = 0.65; -export const MAX_AUDIO_DURATION = 3.0; -export const AUDIO_PREFIX_DURATION = 0.5; -export const DETECTION_INTERVAL = 0.1; -export const REMOTE_INFERENCE_TIMEOUT = 1.0; +export const MAX_AUDIO_DURATION_IN_S = 3.0; +export const AUDIO_PREFIX_DURATION_IN_S = 0.5; +export const DETECTION_INTERVAL_IN_S = 0.1; +export const REMOTE_INFERENCE_TIMEOUT_IN_S = 1.0; export const SAMPLE_RATE = 16000; export const FRAMES_PER_SECOND = 40; +export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame export const DEFAULT_BASE_URL = 'http://localhost:8080'; export const apiConnectDefaults: ApiConnectOptions = { @@ -20,14 +21,14 @@ export const apiConnectDefaults: ApiConnectOptions = { export const interruptionOptionDefaults: InterruptionOptions = { sampleRate: SAMPLE_RATE, threshold: THRESHOLD, - minFrames: Math.ceil(MIN_INTERRUPTION_DURATION * FRAMES_PER_SECOND), - maxAudioDuration: MAX_AUDIO_DURATION, - audioPrefixDuration: AUDIO_PREFIX_DURATION, - detectionInterval: DETECTION_INTERVAL, + minFrames: Math.ceil(MIN_INTERRUPTION_DURATION_IN_S * FRAMES_PER_SECOND), + maxAudioDurationInS: MAX_AUDIO_DURATION_IN_S, + audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S, + detectionIntervalInS: DETECTION_INTERVAL_IN_S, inferenceTimeout: 10_000, baseUrl: DEFAULT_BASE_URL, apiKey: process.env.LIVEKIT_API_KEY || '', apiSecret: process.env.LIVEKIT_API_SECRET || '', useProxy: false, - minInterruptionDuration: MIN_INTERRUPTION_DURATION, + minInterruptionDurationInS: MIN_INTERRUPTION_DURATION_IN_S, } as const; diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index fc131ed4f..82f9726b6 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -1,4 +1,13 @@ import { ofetch } from 'ofetch'; +import { TransformStream } from 'stream/web'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import type { ApiConnectOptions } from './InterruptionStream.js'; +import { + InterruptionCacheEntry, + type InterruptionEvent, + InterruptionEventType, +} from './interruption.js'; export interface 
PostOptions { baseUrl: string; @@ -22,13 +31,14 @@ export interface PredictResponse { createdAt: number; isBargein: boolean; probabilities: number[]; - predictionDuration: number; + predictionDurationInS: number; } export async function predictHTTP( data: Int16Array, predictOptions: PredictOptions, options: PostOptions, + apiOptions: ApiConnectOptions, ): Promise { const createdAt = performance.now(); const url = new URL(`/bargein`, options.baseUrl); @@ -39,8 +49,11 @@ export async function predictHTTP( const { created_at, is_bargein, probabilities } = await ofetch( url.toString(), { - retry: 1, - retryDelay: 100, + retry: apiOptions.maxRetries, + retryDelay: () => { + // TODO backoff + return apiOptions.retryInterval; + }, headers: { 'Content-Type': 'application/octet-stream', Authorization: `Bearer ${options.token}`, @@ -56,6 +69,103 @@ export async function predictHTTP( createdAt: created_at, isBargein: is_bargein, probabilities, - predictionDuration: (performance.now() - createdAt) / 1000, + predictionDurationInS: (performance.now() - createdAt) / 1000, }; } + +export interface HttpTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + threshold: number; + minFrames: number; + timeout: number; +} + +export interface HttpTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: Map; +} + +/** + * Creates an HTTP transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * Each audio slice triggers an HTTP POST request. + */ +export function createHttpTransport( + options: HttpTransportOptions, + getState: () => HttpTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): TransformStream { + const logger = log(); + + return new TransformStream( + { + async transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + const resp = await predictHTTP( + chunk, + { threshold: options.threshold, minFrames: options.minFrames }, + { + baseUrl: options.baseUrl, + timeout: options.timeout, + token: await createAccessToken(options.apiKey, options.apiSecret), + }, + ); + + const { createdAt, isBargein, probabilities, predictionDurationInS } = resp; + const entry = new InterruptionCacheEntry({ + createdAt, + probabilities, + isInterruption: isBargein, + speechInput: chunk, + totalDuration: (performance.now() - createdAt) / 1000, + detectionDelay: Date.now() - state.overlapSpeechStartedAt, + predictionDuration: predictionDurationInS, + }); + state.cache.set(createdAt, entry); + + if (state.overlapSpeechStarted && entry.isInterruption) { + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + isInterruption: entry.isInterruption, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + totalDuration: entry.totalDuration, + predictionDuration: entry.predictionDuration, + detectionDelay: entry.detectionDelay, + probability: entry.probability, + }; + logger.debug( + { detectionDelay: entry.detectionDelay, totalDuration: entry.totalDuration }, + 'interruption detected', + ); + setState({ overlapSpeechStarted: 
false }); + controller.enqueue(event); + } + } catch (err) { + logger.error({ err }, 'Failed to send audio data over HTTP'); + } + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); +} diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts index 52783c895..23d04fc8c 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/interruption.ts @@ -1,5 +1,5 @@ import { slidingWindowMinMax } from '../utils.js'; -import { MIN_INTERRUPTION_DURATION } from './defaults.js'; +import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; export enum InterruptionEventType { INTERRUPTION = 'interruption', @@ -40,9 +40,9 @@ export class InterruptionDetectionError extends Error { function estimateProbability( probabilities: number[], - windowSize: number = MIN_INTERRUPTION_DURATION, + windowSizeInS: number = MIN_INTERRUPTION_DURATION_IN_S, ): number { - const minWindow = Math.ceil(windowSize / 0.025); // 25ms per frame + const minWindow = Math.ceil(windowSizeInS / FRAME_DURATION_IN_S); if (probabilities.length < minWindow) { return 0; } diff --git a/agents/src/utils/ws_transport.test.ts b/agents/src/inference/interruption/ws_transport.test.ts similarity index 100% rename from agents/src/utils/ws_transport.test.ts rename to agents/src/inference/interruption/ws_transport.test.ts diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts new file mode 100644 index 000000000..e4b649bb5 --- /dev/null +++ b/agents/src/inference/interruption/ws_transport.ts @@ -0,0 +1,346 @@ +import { Readable, Writable } from 'node:stream'; +import { TransformStream } from 'stream/web'; +import WebSocket, { createWebSocketStream } from 'ws'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { + InterruptionCacheEntry, + type InterruptionEvent, + InterruptionEventType, +} from './interruption.js'; + +// WebSocket message types +const MSG_SESSION_CREATE = 'session.create'; +const MSG_SESSION_CLOSE = 'session.close'; +const MSG_SESSION_CREATED = 'session.created'; +const MSG_SESSION_CLOSED = 'session.closed'; +const MSG_INTERRUPTION_DETECTED = 'bargein_detected'; +const MSG_INFERENCE_DONE = 'inference_done'; +const MSG_ERROR = 'error'; + +export interface WsTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; +} + +export interface WsTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: Map; +} + +interface WsMessage { + type: string; + created_at?: number; + probabilities?: number[]; + prediction_duration?: number; + is_bargein?: boolean; + error?: string; +} + +/** + * Creates a WebSocket connection and returns web-standard streams. 
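+ *
+ * Authentication uses a bearer token created from the configured apiKey and
+ * apiSecret; the returned promise rejects if the socket has not opened within
+ * the configured connection timeout.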
+ */ +async function connectWebSocket(options: WsTransportOptions): Promise<{ + readable: ReadableStream; + writable: WritableStream; + ws: WebSocket; +}> { + const baseUrl = options.baseUrl.replace(/^http/, 'ws'); + const url = `${baseUrl}/bargein`; + const token = await createAccessToken(options.apiKey, options.apiSecret); + + const ws = new WebSocket(url, { + headers: { Authorization: `Bearer ${token}` }, + }); + + await new Promise((resolve, reject) => { + const timeout = setTimeout( + () => reject(new Error('WebSocket connection timeout')), + options.timeout, + ); + ws.once('open', () => { + clearTimeout(timeout); + resolve(); + }); + ws.once('error', (err) => { + clearTimeout(timeout); + reject(err); + }); + }); + + const duplex = createWebSocketStream(ws); + duplex.on('error', (err) => log().error({ err }, 'WebSocket stream error')); + + // End the write side when the read side ends + duplex.on('end', () => duplex.end()); + + const writable = Writable.toWeb(duplex) as WritableStream; + const readable = Readable.toWeb(duplex) as ReadableStream; + + return { readable, writable, ws }; +} + +/** + * Creates a WebSocket transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * It maintains a persistent WebSocket connection. + */ +export function createWsTransport( + options: WsTransportOptions, + getState: () => WsTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, +): TransformStream { + const logger = log(); + let ws: WebSocket | null = null; + let writer: WritableStreamDefaultWriter | null = null; + let readerTask: Promise | null = null; + let outputController: TransformStreamDefaultController | null = null; + + async function ensureConnection(): Promise { + if (ws && ws.readyState === WebSocket.OPEN) return; + + const conn = await connectWebSocket(options); + ws = conn.ws; + writer = conn.writable.getWriter(); + + // Send session.create message + const sessionCreateMsg = JSON.stringify({ + type: MSG_SESSION_CREATE, + settings: { + sample_rate: options.sampleRate, + num_channels: 1, + threshold: options.threshold, + min_frames: options.minFrames, + encoding: 's16le', + }, + }); + await writer.write(new TextEncoder().encode(sessionCreateMsg)); + + // Start reading responses + readerTask = processResponses(conn.readable); + } + + async function processResponses(readable: ReadableStream): Promise { + const reader = readable.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + + // Process complete JSON messages (newline-delimited or single messages) + const lines = buffer.split('\n'); + buffer = lines.pop() ?? 
''; + + for (const line of lines) { + if (line.trim()) { + try { + const message: WsMessage = JSON.parse(line); + handleMessage(message); + } catch { + // Try parsing the whole buffer as a single message + try { + const message: WsMessage = JSON.parse(line); + handleMessage(message); + } catch { + logger.warn({ line }, 'Failed to parse WebSocket message'); + } + } + } + } + + // Also try parsing buffer as complete message (for non-newline-delimited) + if (buffer.trim()) { + try { + const message: WsMessage = JSON.parse(buffer); + handleMessage(message); + buffer = ''; + } catch { + // Incomplete message, keep buffering + } + } + } + } finally { + reader.releaseLock(); + } + } + + function handleMessage(message: WsMessage): void { + const state = getState(); + + switch (message.type) { + case MSG_SESSION_CREATED: + logger.debug('WebSocket session created'); + break; + + case MSG_INTERRUPTION_DETECTED: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStarted && state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDuration: (performance.now() - createdAt) / 1000, + probabilities: message.probabilities, + isInterruption: true, + predictionDuration: message.prediction_duration ?? 0, + detectionDelay: Date.now() - state.overlapSpeechStartedAt, + }); + state.cache.set(createdAt, entry); + + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + + logger.debug( + { + totalDuration: entry.totalDuration, + predictionDuration: entry.predictionDuration, + detectionDelay: entry.detectionDelay, + probability: entry.probability, + }, + 'interruption detected', + ); + + const event: InterruptionEvent = { + type: InterruptionEventType.INTERRUPTION, + timestamp: Date.now(), + isInterruption: true, + totalDuration: entry.totalDuration, + predictionDuration: entry.predictionDuration, + overlapSpeechStartedAt: state.overlapSpeechStartedAt, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + detectionDelay: entry.detectionDelay, + probability: entry.probability, + }; + + outputController?.enqueue(event); + setState({ overlapSpeechStarted: false }); + } + break; + } + + case MSG_INFERENCE_DONE: { + const createdAt = message.created_at ?? 0; + if (state.overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const entry = new InterruptionCacheEntry({ + createdAt, + speechInput: existing?.speechInput, + totalDuration: (performance.now() - createdAt) / 1000, + predictionDuration: message.prediction_duration ?? 0, + probabilities: message.probabilities, + isInterruption: message.is_bargein ?? 
false, + detectionDelay: Date.now() - state.overlapSpeechStartedAt, + }); + state.cache.set(createdAt, entry); + + logger.trace( + { + totalDuration: entry.totalDuration, + predictionDuration: entry.predictionDuration, + }, + 'interruption inference done', + ); + } + break; + } + + case MSG_SESSION_CLOSED: + logger.debug('WebSocket session closed'); + break; + + case MSG_ERROR: + logger.error({ error: message.error }, 'WebSocket error message received'); + outputController?.error(new Error(`LiveKit Interruption error: ${message.error}`)); + break; + + default: + logger.warn({ type: message.type }, 'Received unexpected WebSocket message type'); + } + } + + async function sendAudioData(audioSlice: Int16Array): Promise { + await ensureConnection(); + if (!writer) throw new Error('WebSocket not connected'); + + const state = getState(); + const createdAt = performance.now(); + + // Store the audio data in cache + state.cache.set(createdAt, new InterruptionCacheEntry({ createdAt, speechInput: audioSlice })); + + // Create header: 8-byte little-endian uint64 timestamp (milliseconds as integer) + const header = new ArrayBuffer(8); + const view = new DataView(header); + const createdAtInt = Math.floor(createdAt); + view.setUint32(0, createdAtInt >>> 0, true); + view.setUint32(4, Math.floor(createdAtInt / 0x100000000) >>> 0, true); + + // Combine header and audio data + const audioBytes = new Uint8Array( + audioSlice.buffer, + audioSlice.byteOffset, + audioSlice.byteLength, + ); + const combined = new Uint8Array(8 + audioBytes.length); + combined.set(new Uint8Array(header), 0); + combined.set(audioBytes, 8); + + await writer.write(combined); + } + + async function close(): Promise { + if (writer && ws?.readyState === WebSocket.OPEN) { + const closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); + await writer.write(new TextEncoder().encode(closeMsg)); + writer.releaseLock(); + } + ws?.close(1000); + await readerTask; + } + + return new TransformStream( + { + start(controller) { + outputController = controller; + }, + + async transform(chunk, controller) { + // Pass through InterruptionEvents unchanged + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + if (!state.overlapSpeechStartedAt) return; + + try { + await sendAudioData(chunk); + } catch (err) { + logger.error({ err }, 'Failed to send audio data over WebSocket'); + } + }, + + async flush() { + await close(); + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); +} diff --git a/agents/src/utils/ws_transport.ts b/agents/src/utils/ws_transport.ts deleted file mode 100644 index 4af4f906b..000000000 --- a/agents/src/utils/ws_transport.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { Readable, Writable } from 'node:stream'; -import WebSocket, { createWebSocketStream } from 'ws'; - -export function webSocketStream(wsUrl: string) { - const ws = new WebSocket(wsUrl); - const duplex = createWebSocketStream(ws); - duplex.on('error', console.error); - - // End the write side when the read side ends to properly close the stream. - // This is needed because Readable.toWeb() waits for both sides of the duplex - // to close before signaling done on the ReadableStream. 
- duplex.on('end', () => { - duplex.end(); - }); - - // Convert the writable side - const writable = Writable.toWeb(duplex); - // Convert the readable side - const readable = Readable.toWeb(duplex); - - return { readable, writable, close: ws.close }; -} From aee3612137f7289d034641a6556c31180a8a93e8 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:02:33 +0100 Subject: [PATCH 14/25] renaming and update transport tests --- .../interruption/InterruptionStream.ts | 12 +- .../inference/interruption/http_transport.ts | 23 ++-- .../inference/interruption/interruption.ts | 24 ++-- .../interruption/ws_transport.test.ts | 127 ++++++------------ .../inference/interruption/ws_transport.ts | 52 +++---- 5 files changed, 103 insertions(+), 135 deletions(-) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 7071ebb00..1a2f6eb29 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -84,9 +84,9 @@ function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { (entry.isInterruption ?? false).toString().toLowerCase(), ); span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability); - span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDuration); - span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDuration); - span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelay); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelayInS); } export class InterruptionStreamBase { @@ -233,9 +233,9 @@ export class InterruptionStreamBase { overlapSpeechStartedAt: this.overlapSpeechStartedAt, speechInput: latestEntry.speechInput, probabilities: latestEntry.probabilities, - totalDuration: latestEntry.totalDuration, - detectionDelay: latestEntry.detectionDelay, - predictionDuration: latestEntry.predictionDuration, + totalDurationInS: latestEntry.totalDurationInS, + detectionDelayInS: latestEntry.detectionDelayInS, + predictionDurationInS: latestEntry.predictionDurationInS, probability: latestEntry.probability, }; controller.enqueue(event); diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 82f9726b6..7ab424c68 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -2,7 +2,6 @@ import { ofetch } from 'ofetch'; import { TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; -import type { ApiConnectOptions } from './InterruptionStream.js'; import { InterruptionCacheEntry, type InterruptionEvent, @@ -38,7 +37,6 @@ export async function predictHTTP( data: Int16Array, predictOptions: PredictOptions, options: PostOptions, - apiOptions: ApiConnectOptions, ): Promise { const createdAt = performance.now(); const url = new URL(`/bargein`, options.baseUrl); @@ -49,10 +47,10 @@ export async function predictHTTP( const { created_at, is_bargein, probabilities } = await ofetch( url.toString(), { - retry: apiOptions.maxRetries, + retry: 1, retryDelay: () => { // TODO backoff - return apiOptions.retryInterval; + return 500; }, 
headers: { 'Content-Type': 'application/octet-stream', @@ -131,9 +129,9 @@ export function createHttpTransport( probabilities, isInterruption: isBargein, speechInput: chunk, - totalDuration: (performance.now() - createdAt) / 1000, - detectionDelay: Date.now() - state.overlapSpeechStartedAt, - predictionDuration: predictionDurationInS, + totalDurationInS: (performance.now() - createdAt) / 1000, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, + predictionDurationInS, }); state.cache.set(createdAt, entry); @@ -148,13 +146,16 @@ export function createHttpTransport( isInterruption: entry.isInterruption, speechInput: entry.speechInput, probabilities: entry.probabilities, - totalDuration: entry.totalDuration, - predictionDuration: entry.predictionDuration, - detectionDelay: entry.detectionDelay, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, probability: entry.probability, }; logger.debug( - { detectionDelay: entry.detectionDelay, totalDuration: entry.totalDuration }, + { + detectionDelayInS: entry.detectionDelayInS, + totalDurationInS: entry.totalDurationInS, + }, 'interruption detected', ); setState({ overlapSpeechStarted: false }); diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts index 23d04fc8c..0d5e23e55 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/interruption.ts @@ -9,9 +9,9 @@ export interface InterruptionEvent { type: InterruptionEventType; timestamp: number; isInterruption: boolean; - totalDuration: number; - predictionDuration: number; - detectionDelay: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; overlapSpeechStartedAt?: number; speechInput?: Int16Array; probabilities?: number[]; @@ -55,9 +55,9 @@ function estimateProbability( */ export class InterruptionCacheEntry { readonly createdAt: number; - readonly totalDuration: number; - readonly predictionDuration: number; - readonly detectionDelay: number; + readonly totalDurationInS: number; + readonly predictionDurationInS: number; + readonly detectionDelayInS: number; readonly speechInput?: Int16Array; readonly probabilities?: number[]; readonly isInterruption?: boolean; @@ -66,16 +66,16 @@ export class InterruptionCacheEntry { constructor(params: { createdAt: number; speechInput?: Int16Array; - totalDuration?: number; - predictionDuration?: number; - detectionDelay?: number; + totalDurationInS?: number; + predictionDurationInS?: number; + detectionDelayInS?: number; probabilities?: number[]; isInterruption?: boolean; }) { this.createdAt = params.createdAt; - this.totalDuration = params.totalDuration ?? 0; - this.predictionDuration = params.predictionDuration ?? 0; - this.detectionDelay = params.detectionDelay ?? 0; + this.totalDurationInS = params.totalDurationInS ?? 0; + this.predictionDurationInS = params.predictionDurationInS ?? 0; + this.detectionDelayInS = params.detectionDelayInS ?? 
0; this.speechInput = params.speechInput; this.probabilities = params.probabilities; this.isInterruption = params.isInterruption; diff --git a/agents/src/inference/interruption/ws_transport.test.ts b/agents/src/inference/interruption/ws_transport.test.ts index 77c5fdc91..e44f62fdb 100644 --- a/agents/src/inference/interruption/ws_transport.test.ts +++ b/agents/src/inference/interruption/ws_transport.test.ts @@ -3,16 +3,31 @@ // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from 'vitest'; import { WebSocket, WebSocketServer } from 'ws'; -import { webSocketStream } from './ws_transport.js'; +import { webSocketToStream } from './ws_transport.js'; -describe('webSocketStream', () => { +/** Helper to create a WebSocket server and return its port */ +async function createServer(): Promise<{ wss: WebSocketServer; port: number }> { + const wss = await new Promise((resolve) => { + const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); + }); + const port = (wss.address() as { port: number }).port; + return { wss, port }; +} + +/** Helper to create a connected WebSocket client */ +async function createClient(port: number): Promise { + const ws = new WebSocket(`ws://localhost:${port}`); + // await new Promise((resolve, reject) => { + // ws.once('open', resolve); + // ws.once('error', reject); + // }); + return ws; +} + +describe('webSocketToStream', () => { describe('readable stream', () => { it('receives messages from the WebSocket', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); wss.on('connection', (serverWs) => { serverWs.send('hello'); @@ -20,7 +35,8 @@ describe('webSocketStream', () => { serverWs.close(); }); - const { readable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); const reader = readable.getReader(); const messages: string[] = []; @@ -40,11 +56,7 @@ describe('webSocketStream', () => { }); it('handles binary messages', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); const binaryData = new Uint8Array([1, 2, 3, 4, 5]); @@ -53,7 +65,8 @@ describe('webSocketStream', () => { serverWs.close(); }); - const { readable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); const reader = readable.getReader(); const chunks: Uint8Array[] = []; @@ -74,16 +87,14 @@ describe('webSocketStream', () => { }); it('handles empty stream when connection closes immediately', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); wss.on('connection', (serverWs) => { serverWs.close(); }); - const { readable } = webSocketStream(`ws://localhost:${port}`); + + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); const reader = readable.getReader(); const chunks: Uint8Array[] = []; @@ -105,16 +116,7 @@ describe('webSocketStream', () => { 
describe('writable stream', () => { it('sends messages through the WebSocket', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; - const ws = new WebSocket(`ws://localhost:${port}`); - - const connected = new Promise((resolve) => { - ws.on('open', resolve); - }); + const { wss, port } = await createServer(); const messagesReceived: string[] = []; const serverClosed = new Promise((resolve) => { @@ -126,8 +128,8 @@ describe('webSocketStream', () => { }); }); - await connected; - const { writable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { writable } = webSocketToStream(ws); const writer = writable.getWriter(); await writer.write(new TextEncoder().encode('hello')); @@ -142,11 +144,7 @@ describe('webSocketStream', () => { }); it('sends binary data through the WebSocket', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); const chunksReceived: Buffer[] = []; const serverClosed = new Promise((resolve) => { @@ -158,7 +156,8 @@ describe('webSocketStream', () => { }); }); - const { writable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { writable } = webSocketToStream(ws); const writer = writable.getWriter(); const binaryData = new Uint8Array([10, 20, 30, 40, 50]); @@ -172,46 +171,11 @@ describe('webSocketStream', () => { wss.close(); }); - - it('buffers writes if readyState is CONNECTING', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; - - const { writable } = webSocketStream(`ws://localhost:${port}`); - const writer = writable.getWriter(); - - const messagesReceived: string[] = []; - const serverClosed = new Promise((resolve) => { - wss.on('connection', (serverWs) => { - serverWs.on('message', (data) => { - messagesReceived.push(data.toString()); - }); - serverWs.on('close', resolve); - }); - }); - - // These writes should be buffered - await writer.write(new TextEncoder().encode('buffered message')); - await writer.close(); - - await serverClosed; - - expect(messagesReceived).toEqual(['buffered message']); - - wss.close(); - }); }); describe('bidirectional communication', () => { it('supports echo pattern with readable and writable', async () => { - const wss = await new Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); // Server echoes messages back wss.on('connection', (serverWs) => { @@ -220,7 +184,8 @@ describe('webSocketStream', () => { }); }); - const { readable, writable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { readable, writable } = webSocketToStream(ws); const writer = writable.getWriter(); const reader = readable.getReader(); @@ -244,11 +209,7 @@ describe('webSocketStream', () => { describe('error handling', () => { it('readable stream ends when WebSocket closes unexpectedly', async () => { - const wss = await new 
Promise((resolve) => { - const server: WebSocketServer = new WebSocketServer({ port: 0 }, () => resolve(server)); - }); - - const port = (wss.address() as { port: number }).port; + const { wss, port } = await createServer(); wss.on('connection', (serverWs) => { serverWs.send('before close'); @@ -256,7 +217,8 @@ describe('webSocketStream', () => { serverWs.terminate(); }); - const { readable } = webSocketStream(`ws://localhost:${port}`); + const ws = await createClient(port); + const { readable } = webSocketToStream(ws); const reader = readable.getReader(); const chunks: string[] = []; @@ -266,8 +228,7 @@ describe('webSocketStream', () => { if (done) break; chunks.push(Buffer.from(value).toString()); } - } catch (error) { - console.error(error); + } catch { // Connection terminated, stream may error } finally { reader.releaseLock(); diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index e4b649bb5..083e1669d 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -43,6 +43,19 @@ interface WsMessage { error?: string; } +export function webSocketToStream(ws: WebSocket) { + const duplex = createWebSocketStream(ws); + duplex.on('error', (err) => log().error({ err }, 'WebSocket stream error')); + + // End the write side when the read side ends + duplex.on('end', () => duplex.end()); + + const writable = Writable.toWeb(duplex) as WritableStream; + const readable = Readable.toWeb(duplex) as ReadableStream; + + return { readable, writable }; +} + /** * Creates a WebSocket connection and returns web-standard streams. */ @@ -59,6 +72,8 @@ async function connectWebSocket(options: WsTransportOptions): Promise<{ headers: { Authorization: `Bearer ${token}` }, }); + const { readable, writable } = webSocketToStream(ws); + await new Promise((resolve, reject) => { const timeout = setTimeout( () => reject(new Error('WebSocket connection timeout')), @@ -74,15 +89,6 @@ async function connectWebSocket(options: WsTransportOptions): Promise<{ }); }); - const duplex = createWebSocketStream(ws); - duplex.on('error', (err) => log().error({ err }, 'WebSocket stream error')); - - // End the write side when the read side ends - duplex.on('end', () => duplex.end()); - - const writable = Writable.toWeb(duplex) as WritableStream; - const readable = Readable.toWeb(duplex) as ReadableStream; - return { readable, writable, ws }; } @@ -192,11 +198,11 @@ export function createWsTransport( const entry = new InterruptionCacheEntry({ createdAt, speechInput: existing?.speechInput, - totalDuration: (performance.now() - createdAt) / 1000, + totalDurationInS: (performance.now() - createdAt) / 1000, probabilities: message.probabilities, isInterruption: true, - predictionDuration: message.prediction_duration ?? 0, - detectionDelay: Date.now() - state.overlapSpeechStartedAt, + predictionDurationInS: message.prediction_duration ?? 
0, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, }); state.cache.set(createdAt, entry); @@ -206,9 +212,9 @@ export function createWsTransport( logger.debug( { - totalDuration: entry.totalDuration, - predictionDuration: entry.predictionDuration, - detectionDelay: entry.detectionDelay, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, probability: entry.probability, }, 'interruption detected', @@ -218,12 +224,12 @@ export function createWsTransport( type: InterruptionEventType.INTERRUPTION, timestamp: Date.now(), isInterruption: true, - totalDuration: entry.totalDuration, - predictionDuration: entry.predictionDuration, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, overlapSpeechStartedAt: state.overlapSpeechStartedAt, speechInput: entry.speechInput, probabilities: entry.probabilities, - detectionDelay: entry.detectionDelay, + detectionDelayInS: entry.detectionDelayInS, probability: entry.probability, }; @@ -240,18 +246,18 @@ export function createWsTransport( const entry = new InterruptionCacheEntry({ createdAt, speechInput: existing?.speechInput, - totalDuration: (performance.now() - createdAt) / 1000, - predictionDuration: message.prediction_duration ?? 0, + totalDurationInS: (performance.now() - createdAt) / 1000, + predictionDurationInS: message.prediction_duration ?? 0, probabilities: message.probabilities, isInterruption: message.is_bargein ?? false, - detectionDelay: Date.now() - state.overlapSpeechStartedAt, + detectionDelayInS: (Date.now() - state.overlapSpeechStartedAt) / 1000, }); state.cache.set(createdAt, entry); logger.trace( { - totalDuration: entry.totalDuration, - predictionDuration: entry.predictionDuration, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, }, 'interruption inference done', ); From 1f3c315edf3941c8d78d9911b49f6415d2b501d8 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:30:44 +0100 Subject: [PATCH 15/25] add missing features --- .../AdaptiveInterruptionDetector.ts | 88 +++++++++--- .../interruption/InterruptionStream.ts | 51 +++++-- agents/src/inference/interruption/defaults.ts | 17 ++- .../inference/interruption/http_transport.ts | 15 +- .../inference/interruption/interruption.ts | 129 ++++++++++++++++-- .../inference/interruption/ws_transport.ts | 87 +++++++++--- agents/src/voice/agent_activity.ts | 4 +- 7 files changed, 324 insertions(+), 67 deletions(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index c20da246d..0e137b15c 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -8,11 +8,12 @@ import { SAMPLE_RATE, interruptionOptionDefaults, } from './defaults.js'; -import { type InterruptionDetectionError } from './interruption.js'; +import { type InterruptionDetectionError, type InterruptionEvent } from './interruption.js'; type InterruptionCallbacks = { - interruptionDetected: () => void; - overlapSpeechDetected: () => void; + userInterruptionDetected: (event: InterruptionEvent) => void; + userNonInterruptionDetected: (event: InterruptionEvent) => void; + overlapSpeechEnded: (event: InterruptionEvent) => void; error: (error: InterruptionDetectionError) => void; }; @@ -35,8 +36,10 @@ export type 
AdaptiveInterruptionDetectorOptions = Partial; export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { options: InterruptionOptions; + private readonly _label: string; private logger = log(); - private streams: WeakSet; // TODO: Union of InterruptionHttpStream | InterruptionWebSocketStream + // Use Set instead of WeakSet to allow iteration for propagating option updates + private streams: Set = new Set(); constructor(options: AdaptiveInterruptionDetectorOptions = {}) { super(); @@ -46,7 +49,6 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ baseUrl, apiKey, apiSecret, - useProxy: useProxyArg, audioPrefixDurationInS, threshold, detectionIntervalInS, @@ -86,7 +88,8 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ useProxy = true; } else { - useProxy = useProxyArg ?? false; + // Force useProxy to false for custom URLs (matching Python behavior) + useProxy = false; } this.options = { @@ -104,7 +107,7 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ minInterruptionDurationInS, }; - this.streams = new WeakSet(); + this._label = `${this.constructor.name}`; this.logger.debug( { @@ -121,6 +124,41 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ ); } + /** + * The model identifier for this detector. + */ + get model(): string { + return 'adaptive interruption'; + } + + /** + * The provider identifier for this detector. + */ + get provider(): string { + return 'livekit'; + } + + /** + * The label for this detector instance. + */ + get label(): string { + return this._label; + } + + /** + * The sample rate used for audio processing. + */ + get sampleRate(): number { + return this.options.sampleRate; + } + + /** + * Emit an error event from the detector. + */ + emitError(error: InterruptionDetectionError): void { + this.emit('error', error); + } + /** * Creates a new InterruptionStreamBase for internal use. * The stream can receive audio frames and sentinels via pushFrame(). @@ -129,27 +167,37 @@ export class AdaptiveInterruptionDetector extends (EventEmitter as new () => Typ createStream(): InterruptionStreamBase { const streamBase = new InterruptionStreamBase(this, {}); this.streams.add(streamBase); - // const transformer = new TransformStream({ - // transform: (chunk, controller) => { - // log().info('adaptive interruption detection stream transformer', chunk); - // if (chunk.type === InterruptionEventType.INTERRUPTION) { - // this.emit('interruptionDetected'); // TODO payload - // } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { - // this.emit('overlapSpeechDetected'); // TODO payload - // } - // controller.enqueue(chunk); - // }, - // }); - // streamBase.stream().pipeThrough(transformer); return streamBase; } - updateOptions(options: { threshold?: number; minInterruptionDurationInS?: number }): void { + /** + * Remove a stream from tracking (called when stream is closed). + */ + removeStream(stream: InterruptionStreamBase): void { + this.streams.delete(stream); + } + + /** + * Update options for the detector and propagate to all active streams. + * For WebSocket streams, this triggers a reconnection with new settings. 
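+   *
+   * Example (illustrative values only): `await detector.updateOptions({ threshold: 0.8 })`
+   * updates the stored options and awaits the corresponding update on every
+   * active stream.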
+ */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { if (options.threshold !== undefined) { this.options.threshold = options.threshold; } if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); } + + // Propagate option updates to all active streams (matching Python behavior) + const updatePromises: Promise[] = []; + for (const stream of this.streams) { + updatePromises.push(stream.updateOptions(options)); + } + await Promise.all(updatePromises); } } diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 1a2f6eb29..7d0bd8142 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -8,9 +8,10 @@ import type { AdaptiveInterruptionDetector, InterruptionOptions, } from './AdaptiveInterruptionDetector.js'; -import { apiConnectDefaults } from './defaults.js'; +import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; import { createHttpTransport } from './http_transport.js'; import { + BoundedCache, InterruptionCacheEntry, type InterruptionDetectionError, type InterruptionEvent, @@ -108,6 +109,9 @@ export class InterruptionStreamBase { private logger = log(); + // Store reconnect function for WebSocket transport + private wsReconnect?: () => Promise; + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { this.inputStream = createStreamChannel< InterruptionSentinel | AudioFrame, @@ -115,18 +119,39 @@ export class InterruptionStreamBase { >(); this.model = model; - this.options = model.options; + this.options = { ...model.options }; this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; this.eventStream = this.setupTransform(); } + /** + * Update stream options. For WebSocket transport, this triggers a reconnection. 
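+   * Only `threshold` and `minInterruptionDurationInS` can be changed at runtime;
+   * `minFrames` is recomputed from the new duration using FRAMES_PER_SECOND.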
+ */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + } + // Trigger WebSocket reconnection if using proxy (WebSocket transport) + if (this.options.useProxy && this.wsReconnect) { + await this.wsReconnect(); + } + } + private setupTransform(): ReadableStream { let agentSpeechStarted = false; let startIdx = 0; let accumulatedSamples = 0; let overlapSpeechStarted = false; - const cache = new Map(); + // Use BoundedCache with max_len=10 to prevent unbounded memory growth + const cache = new BoundedCache(10); const inferenceS16Data = new Int16Array( Math.ceil(this.options.maxAudioDurationInS * this.options.sampleRate), ).fill(0); @@ -219,12 +244,14 @@ export class InterruptionStreamBase { this.logger.debug('overlap speech ended'); if (overlapSpeechStarted) { this.userSpeakingSpan = undefined; - let latestEntry = Array.from(cache.values()).at(-1); + // Use pop with predicate to get only completed requests (matching Python behavior) + // This ensures we don't return incomplete/in-flight requests as the "final" result + let latestEntry = cache.pop( + (entry) => entry.totalDurationInS !== undefined && entry.totalDurationInS > 0, + ); if (!latestEntry) { this.logger.debug('no request made for overlap speech'); latestEntry = InterruptionCacheEntry.default(); - } else { - cache.delete(latestEntry.createdAt); } const event: InterruptionEvent = { type: InterruptionEventType.OVERLAP_SPEECH_ENDED, @@ -259,11 +286,17 @@ export class InterruptionStreamBase { threshold: this.options.threshold, minFrames: this.options.minFrames, timeout: this.options.inferenceTimeout, + maxRetries: this.apiOptions.maxRetries, }; - const transport = this.options.useProxy - ? createWsTransport(transportOptions, getState, setState, handleSpanUpdate) - : createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + let transport: TransformStream; + if (this.options.useProxy) { + const wsResult = createWsTransport(transportOptions, getState, setState, handleSpanUpdate); + transport = wsResult.transport; + this.wsReconnect = wsResult.reconnect; + } else { + transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + } // Pipeline: input -> audioTransformer -> transport -> eventStream return this.inputStream.stream().pipeThrough(audioTransformer).pipeThrough(transport); diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index 60529b7d2..1a2beeb08 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -10,7 +10,7 @@ export const REMOTE_INFERENCE_TIMEOUT_IN_S = 1.0; export const SAMPLE_RATE = 16000; export const FRAMES_PER_SECOND = 40; export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame -export const DEFAULT_BASE_URL = 'http://localhost:8080'; +export const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; export const apiConnectDefaults: ApiConnectOptions = { maxRetries: 3, @@ -18,6 +18,21 @@ export const apiConnectDefaults: ApiConnectOptions = { timeout: 10_000, } as const; +/** + * Calculate the retry interval using exponential backoff with jitter. 
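+ * Roughly: attempt 0 waits [base, 1.25*base), attempt 1 waits [2*base, 2.5*base),
+ * attempt 2 waits [4*base, 5*base), where base defaults to apiConnectDefaults.retryInterval.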
+ * Matches the Python implementation's _interval_for_retry behavior. + */ +export function intervalForRetry( + attempt: number, + baseInterval: number = apiConnectDefaults.retryInterval, +): number { + // Exponential backoff: baseInterval * 2^attempt with some jitter + const exponentialDelay = baseInterval * Math.pow(2, attempt); + // Add jitter (0-25% of the delay) + const jitter = exponentialDelay * Math.random() * 0.25; + return exponentialDelay + jitter; +} + export const interruptionOptionDefaults: InterruptionOptions = { sampleRate: SAMPLE_RATE, threshold: THRESHOLD, diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 7ab424c68..b2419a514 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -2,7 +2,9 @@ import { ofetch } from 'ofetch'; import { TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; +import { intervalForRetry } from './defaults.js'; import { + BoundedCache, InterruptionCacheEntry, type InterruptionEvent, InterruptionEventType, @@ -13,6 +15,7 @@ export interface PostOptions { token: string; signal?: AbortSignal; timeout?: number; + maxRetries?: number; } export interface PredictOptions { @@ -44,13 +47,15 @@ export async function predictHTTP( url.searchParams.append('min_frames', predictOptions.minFrames.toFixed()); url.searchParams.append('created_at', createdAt.toFixed()); + let retryCount = 0; const { created_at, is_bargein, probabilities } = await ofetch( url.toString(), { - retry: 1, + retry: options.maxRetries ?? 3, retryDelay: () => { - // TODO backoff - return 500; + const delay = intervalForRetry(retryCount); + retryCount++; + return delay; }, headers: { 'Content-Type': 'application/octet-stream', @@ -78,12 +83,13 @@ export interface HttpTransportOptions { threshold: number; minFrames: number; timeout: number; + maxRetries?: number; } export interface HttpTransportState { overlapSpeechStarted: boolean; overlapSpeechStartedAt: number | undefined; - cache: Map; + cache: BoundedCache; } /** @@ -119,6 +125,7 @@ export function createHttpTransport( { baseUrl: options.baseUrl, timeout: options.timeout, + maxRetries: options.maxRetries, token: await createAccessToken(options.apiKey, options.apiSecret), }, ); diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/interruption.ts index 0d5e23e55..5c17ff534 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/interruption.ts @@ -1,6 +1,111 @@ import { slidingWindowMinMax } from '../utils.js'; import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; +/** + * A bounded cache that automatically evicts the oldest entries when the cache exceeds max size. + * Uses FIFO eviction strategy. 
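+ *
+ * For example, with `maxLen = 2`, inserting a third key evicts the first key
+ * that was inserted (the insertion order of the underlying Map).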
+ */ +export class BoundedCache { + private cache: Map = new Map(); + private readonly maxLen: number; + + constructor(maxLen: number = 10) { + this.maxLen = maxLen; + } + + set(key: K, value: V): void { + this.cache.set(key, value); + if (this.cache.size > this.maxLen) { + // Remove the oldest entry (first inserted) + const firstKey = this.cache.keys().next().value as K; + this.cache.delete(firstKey); + } + } + + get(key: K): V | undefined { + return this.cache.get(key); + } + + has(key: K): boolean { + return this.cache.has(key); + } + + delete(key: K): boolean { + return this.cache.delete(key); + } + + /** + * Get existing entry and update it, or create a new one using factory. + * Updates the entry with the provided partial fields. + */ + setOrUpdate( + key: K, + factory: () => T, + updates: Partial<{ [P in keyof T]: T[P] }>, + ): T { + let entry = this.cache.get(key) as T | undefined; + if (entry === undefined) { + entry = factory(); + this.set(key, entry); + } + // Apply updates to the entry + for (const [field, value] of Object.entries(updates)) { + if (value !== undefined) { + (entry as Record)[field] = value; + } + } + return entry; + } + + /** + * Pop the last entry that matches the predicate, or return undefined. + * Only removes and returns the matching entry, preserving others. + */ + pop(predicate?: (value: V) => boolean): V | undefined { + if (predicate === undefined) { + // Pop the last (most recent) entry + const keys = Array.from(this.cache.keys()); + if (keys.length === 0) return undefined; + const lastKey = keys[keys.length - 1]!; + const value = this.cache.get(lastKey); + this.cache.delete(lastKey); + return value; + } + + // Find the last entry matching the predicate (iterating in reverse) + const keys = Array.from(this.cache.keys()); + for (let i = keys.length - 1; i >= 0; i--) { + const key = keys[i]!; + const value = this.cache.get(key)!; + if (predicate(value)) { + this.cache.delete(key); + return value; + } + } + return undefined; + } + + clear(): void { + this.cache.clear(); + } + + get size(): number { + return this.cache.size; + } + + values(): IterableIterator { + return this.cache.values(); + } + + keys(): IterableIterator { + return this.cache.keys(); + } + + entries(): IterableIterator<[K, V]> { + return this.cache.entries(); + } +} + export enum InterruptionEventType { INTERRUPTION = 'interruption', OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', @@ -52,16 +157,16 @@ function estimateProbability( /** * Typed cache entry for interruption inference results. + * Mutable to support setOrUpdate pattern from Python's _BoundedCache. */ export class InterruptionCacheEntry { - readonly createdAt: number; - readonly totalDurationInS: number; - readonly predictionDurationInS: number; - readonly detectionDelayInS: number; - readonly speechInput?: Int16Array; - readonly probabilities?: number[]; - readonly isInterruption?: boolean; - readonly probability: number; + createdAt: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + speechInput?: Int16Array; + probabilities?: number[]; + isInterruption?: boolean; constructor(params: { createdAt: number; @@ -79,7 +184,13 @@ export class InterruptionCacheEntry { this.speechInput = params.speechInput; this.probabilities = params.probabilities; this.isInterruption = params.isInterruption; - this.probability = this.probabilities ? estimateProbability(this.probabilities) : 0; + } + + /** + * The conservative estimated probability of the interruption event. 
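+   * Returns 0 when no probabilities have been recorded; otherwise the value is
+   * derived from a sliding min/max window over the per-frame probabilities
+   * (see estimateProbability).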
+ */ + get probability(): number { + return this.probabilities ? estimateProbability(this.probabilities) : 0; } static default(): InterruptionCacheEntry { diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 083e1669d..3d76ff205 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -3,7 +3,9 @@ import { TransformStream } from 'stream/web'; import WebSocket, { createWebSocketStream } from 'ws'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; +import { intervalForRetry } from './defaults.js'; import { + type BoundedCache, InterruptionCacheEntry, type InterruptionEvent, InterruptionEventType, @@ -26,12 +28,13 @@ export interface WsTransportOptions { threshold: number; minFrames: number; timeout: number; + maxRetries?: number; } export interface WsTransportState { overlapSpeechStarted: boolean; overlapSpeechStartedAt: number | undefined; - cache: Map; + cache: BoundedCache; } interface WsMessage { @@ -92,18 +95,24 @@ async function connectWebSocket(options: WsTransportOptions): Promise<{ return { readable, writable, ws }; } +export interface WsTransportResult { + transport: TransformStream; + reconnect: () => Promise; +} + /** * Creates a WebSocket transport TransformStream for interruption detection. * * This transport receives Int16Array audio slices and outputs InterruptionEvents. - * It maintains a persistent WebSocket connection. + * It maintains a persistent WebSocket connection with automatic retry on failure. + * Returns both the transport and a reconnect function for option updates. */ export function createWsTransport( options: WsTransportOptions, getState: () => WsTransportState, setState: (partial: Partial) => void, updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, -): TransformStream { +): WsTransportResult { const logger = log(); let ws: WebSocket | null = null; let writer: WritableStreamDefaultWriter | null = null; @@ -113,25 +122,45 @@ export function createWsTransport( async function ensureConnection(): Promise { if (ws && ws.readyState === WebSocket.OPEN) return; - const conn = await connectWebSocket(options); - ws = conn.ws; - writer = conn.writable.getWriter(); - - // Send session.create message - const sessionCreateMsg = JSON.stringify({ - type: MSG_SESSION_CREATE, - settings: { - sample_rate: options.sampleRate, - num_channels: 1, - threshold: options.threshold, - min_frames: options.minFrames, - encoding: 's16le', - }, - }); - await writer.write(new TextEncoder().encode(sessionCreateMsg)); + const maxRetries = options.maxRetries ?? 3; + let lastError: Error | null = null; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + const conn = await connectWebSocket(options); + ws = conn.ws; + writer = conn.writable.getWriter(); + + // Send session.create message + const sessionCreateMsg = JSON.stringify({ + type: MSG_SESSION_CREATE, + settings: { + sample_rate: options.sampleRate, + num_channels: 1, + threshold: options.threshold, + min_frames: options.minFrames, + encoding: 's16le', + }, + }); + await writer.write(new TextEncoder().encode(sessionCreateMsg)); + + // Start reading responses + readerTask = processResponses(conn.readable); + return; + } catch (err) { + lastError = err instanceof Error ? 
err : new Error(String(err)); + if (attempt < maxRetries) { + const delay = intervalForRetry(attempt); + logger.warn( + { attempt, delay, err: lastError.message }, + 'WebSocket connection failed, retrying', + ); + await new Promise((resolve) => setTimeout(resolve, delay)); + } + } + } - // Start reading responses - readerTask = processResponses(conn.readable); + throw lastError ?? new Error('Failed to connect to WebSocket after retries'); } async function processResponses(readable: ReadableStream): Promise { @@ -314,12 +343,24 @@ export function createWsTransport( const closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); await writer.write(new TextEncoder().encode(closeMsg)); writer.releaseLock(); + writer = null; } ws?.close(1000); + ws = null; await readerTask; + readerTask = null; } - return new TransformStream( + /** + * Reconnect the WebSocket with updated options. + * This is called when options are updated via updateOptions(). + */ + async function reconnect(): Promise { + await close(); + // Connection will be re-established on next sendAudioData call + } + + const transport = new TransformStream( { start(controller) { outputController = controller; @@ -349,4 +390,6 @@ export function createWsTransport( { highWaterMark: 2 }, { highWaterMark: 2 }, ); + + return { transport, reconnect }; } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 1ec298678..94890b9f1 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -730,8 +730,8 @@ export class AgentActivity implements RecognitionHooks { this.logger.info( { probability: ev.probability, - detectionDelay: ev.detectionDelay, - totalDuration: ev.totalDuration, + detectionDelayInS: ev.detectionDelayInS, + totalDurationInS: ev.totalDurationInS, }, 'adaptive interruption detected', ); From 1aac5f72ce01bca70b2a154f9cf59909cf78fc8b Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:32:11 +0100 Subject: [PATCH 16/25] revert voice activity stuff --- agents/src/voice/agent_activity.ts | 61 ---------- agents/src/voice/agent_session.ts | 24 +--- agents/src/voice/audio_recognition.ts | 163 +------------------------- 3 files changed, 3 insertions(+), 245 deletions(-) diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 94890b9f1..c5b2b999c 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -41,8 +41,6 @@ import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js import { splitWords } from '../tokenize/basic/word.js'; import { TTS, type TTSError } from '../tts/tts.js'; import { Future, Task, cancelAndWait, waitFor } from '../utils.js'; -import type { InterruptionEvent } from '../inference/interruption/interruption.js'; -import { InterruptionEventType } from '../inference/interruption/interruption.js'; import { VAD, type VADEvent } from '../vad.js'; import type { Agent, ModelSettings } from './agent.js'; import { StopResponse, asyncLocalStorage } from './agent.js'; @@ -114,24 +112,6 @@ export class AgentActivity implements RecognitionHooks { _mainTask?: Task; _userTurnCompletedTask?: Promise; - /** - * Notify that agent started speaking. - * This enables interruption detection in AudioRecognition. - * @internal - */ - notifyAgentSpeechStarted(): void { - this.audioRecognition?.onStartOfAgentSpeech(); - } - - /** - * Notify that agent stopped speaking. - * This disables interruption detection in AudioRecognition. 
- * @internal - */ - notifyAgentSpeechEnded(): void { - this.audioRecognition?.onEndOfAgentSpeech(); - } - constructor(agent: Agent, agentSession: AgentSession) { this.agent = agent; this.agentSession = agentSession; @@ -312,7 +292,6 @@ export class AgentActivity implements RecognitionHooks { // Disable stt node if stt is not provided stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined, vad: this.vad, - interruptionDetector: this.agentSession.interruptionDetector, turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, minEndpointingDelay: this.agentSession.options.minEndpointingDelay, @@ -721,46 +700,6 @@ export class AgentActivity implements RecognitionHooks { } } - onInterruption(ev: InterruptionEvent): void { - if (ev.type !== InterruptionEventType.INTERRUPTION) { - // Only handle actual interruptions, not overlap_speech_ended events - return; - } - - this.logger.info( - { - probability: ev.probability, - detectionDelayInS: ev.detectionDelayInS, - totalDurationInS: ev.totalDurationInS, - }, - 'adaptive interruption detected', - ); - - // Similar to onVADInferenceDone but triggered by the adaptive interruption detector - if (this.turnDetection === 'manual' || this.turnDetection === 'realtime_llm') { - return; - } - - if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) { - return; - } - - this.realtimeSession?.startUserActivity(); - - if ( - this._currentSpeech && - !this._currentSpeech.interrupted && - this._currentSpeech.allowInterruptions - ) { - this.logger.info( - { 'speech id': this._currentSpeech.id }, - 'speech interrupted by adaptive interruption detector', - ); - this.realtimeSession?.interrupt(); - this._currentSpeech.interrupt(); - } - } - onInterimTranscript(ev: SpeechEvent): void { if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) { // skip stt transcription if userTranscription is enabled on the realtime model diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index c9b194d00..29eae5a3f 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -15,7 +15,6 @@ import { type STTModelString, type TTSModelString, } from '../inference/index.js'; -import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js'; import { type JobContext, getJobContext } from '../job.js'; import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; @@ -108,7 +107,6 @@ export type AgentSessionOptions = { vad?: VAD; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; - interruptionDetector?: AdaptiveInterruptionDetector; userData?: UserData; voiceOptions?: Partial; connOptions?: SessionConnectOptions; @@ -170,8 +168,6 @@ export class AgentSession< /** @internal - Timestamp when the session started (milliseconds) */ _startedAt?: number; - interruptionDetector?: AdaptiveInterruptionDetector; - /** @internal - Current run state for testing */ _globalRunState?: RunResult; @@ -184,7 +180,6 @@ export class AgentSession< llm, tts, turnDetection, - interruptionDetector, userData, voiceOptions = defaultVoiceOptions, connOptions, @@ -220,7 +215,6 @@ export class AgentSession< } this.turnDetection = turnDetection; - this.interruptionDetector = interruptionDetector; this._userData = userData; // configurable IO @@ -687,8 +681,6 @@ export 
class AgentSession< return; } - const oldState = this._agentState; - if (state === 'speaking') { // Reset error counts when agent starts speaking this.llmErrorCounts = 0; @@ -704,25 +696,13 @@ export class AgentSession< // TODO(brian): PR4 - Set participant attributes if roomIO.room.localParticipant is available // (Ref: Python agent_session.py line 1161-1164) } - - // Notify AudioRecognition that agent started speaking (for interruption detection) - this.activity?.notifyAgentSpeechStarted(); - } else if (oldState === 'speaking') { - // Agent stopped speaking - if (this.agentSpeakingSpan !== undefined) { - // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available - this.agentSpeakingSpan.end(); - this.agentSpeakingSpan = undefined; - } - - // Notify AudioRecognition that agent stopped speaking (for interruption detection) - this.activity?.notifyAgentSpeechEnded(); } else if (this.agentSpeakingSpan !== undefined) { - // Non-speaking to non-speaking transition but span is still open + // TODO(brian): PR4 - Set ATTR_END_TIME attribute if available this.agentSpeakingSpan.end(); this.agentSpeakingSpan = undefined; } + const oldState = this._agentState; this._agentState = state; // Handle user away timer based on state changes diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 20f0ae6f8..25d430684 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -5,12 +5,6 @@ import { AudioFrame } from '@livekit/rtc-node'; import type { Context, Span } from '@opentelemetry/api'; import type { WritableStreamDefaultWriter } from 'node:stream/web'; import { ReadableStream } from 'node:stream/web'; -import type { AdaptiveInterruptionDetector } from '../inference/interruption/AdaptiveInterruptionDetector.js'; -import { - InterruptionStreamBase, - InterruptionStreamSentinel, -} from '../inference/interruption/InterruptionStream.js'; -import type { InterruptionEvent } from '../inference/interruption/interruption.js'; import { type ChatContext } from '../llm/chat_context.js'; import { log } from '../log.js'; import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js'; @@ -45,7 +39,6 @@ export interface RecognitionHooks { onFinalTranscript: (ev: SpeechEvent) => void; onEndOfTurn: (info: EndOfTurnInfo) => Promise; onPreemptiveGeneration: (info: PreemptiveGenerationInfo) => void; - onInterruption: (ev: InterruptionEvent) => void; retrieveChatCtx: () => ChatContext; } @@ -60,7 +53,6 @@ export interface AudioRecognitionOptions { recognitionHooks: RecognitionHooks; stt?: STTNode; vad?: VAD; - interruptionDetector?: AdaptiveInterruptionDetector; turnDetector?: _TurnDetector; turnDetectionMode?: Exclude; minEndpointingDelay: number; @@ -96,7 +88,6 @@ export class AudioRecognition { private vadInputStream: ReadableStream; private sttInputStream: ReadableStream; - private interruptionInputStream: ReadableStream; private silenceAudioTransform = new IdentityTransform(); private silenceAudioWriter: WritableStreamDefaultWriter; @@ -105,19 +96,11 @@ export class AudioRecognition { private commitUserTurnTask?: Task; private vadTask?: Task; private sttTask?: Task; - private interruptionTask?: Task; - - // interruption detection - private interruptionDetector?: AdaptiveInterruptionDetector; - private interruptionStream?: InterruptionStreamBase; - private interruptionEnabled = false; - private agentSpeaking = false; constructor(opts: AudioRecognitionOptions) { this.hooks = opts.recognitionHooks; 
this.stt = opts.stt; this.vad = opts.vad; - this.interruptionDetector = opts.interruptionDetector; this.turnDetector = opts.turnDetector; this.turnDetectionMode = opts.turnDetectionMode; this.minEndpointingDelay = opts.minEndpointingDelay; @@ -125,15 +108,10 @@ export class AudioRecognition { this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; - // Interruption detection is only enabled if both detector and VAD are provided - this.interruptionEnabled = this.interruptionDetector !== undefined && this.vad !== undefined; - this.deferredInputStream = new DeferredReadableStream(); - const [vadInputStream, rest] = this.deferredInputStream.stream.tee(); - const [sttInputStream, interruptionInputStream] = rest.tee(); + const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee(); this.vadInputStream = vadInputStream; this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable); - this.interruptionInputStream = interruptionInputStream; this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter(); } @@ -157,15 +135,6 @@ export class AudioRecognition { this.sttTask.result.catch((err) => { this.logger.error(`Error running STT task: ${err}`); }); - - if (this.interruptionEnabled && this.interruptionDetector) { - this.interruptionTask = Task.from(({ signal }) => - this.createInterruptionTask(this.interruptionDetector!, signal), - ); - this.interruptionTask.result.catch((err) => { - this.logger.error(`Error running interruption task: ${err}`); - }); - } } private async onSTTEvent(ev: SpeechEvent) { @@ -610,12 +579,6 @@ export class AudioRecognition { this.sampleRate = ev.frames[0].sampleRate; } - // If agent is speaking, user speech is overlap - trigger interruption detection - if (this.agentSpeaking) { - // TODO re-enable check for this.interruptionEnabled - this.onStartOfOverlapSpeech(ev.speechDuration, this.userTurnSpan); - } - this.bounceEOUTask?.cancel(); break; case VADEventType.INFERENCE_DONE: @@ -636,11 +599,6 @@ export class AudioRecognition { // when VAD fires END_OF_SPEECH, it already waited for the silence_duration this.speaking = false; - // If we were in overlap speech (agent speaking + user speaking), end it - if (this.agentSpeaking && this.interruptionEnabled) { - this.onEndOfOverlapSpeech(); - } - if ( this.vadBaseTurnDetection || (this.turnDetectionMode === 'stt' && this.userTurnCommitted) @@ -658,123 +616,6 @@ export class AudioRecognition { } } - private async createInterruptionTask( - interruptionDetector: AdaptiveInterruptionDetector, - signal: AbortSignal, - ) { - // Create the interruption stream from the detector - this.interruptionStream = interruptionDetector.createStream(); - - // Forward audio frames to the interruption stream - const reader = this.interruptionInputStream.getReader(); - - const forwardTask = (async () => { - try { - while (!signal.aborted) { - const { done, value: frame } = await reader.read(); - if (done) break; - await this.interruptionStream?.pushFrame(frame); - } - } catch (e) { - if (!signal.aborted) { - this.logger.error(e, 'Error forwarding audio to interruption stream'); - } - } finally { - reader.releaseLock(); - } - })(); - - // Read interruption events from the stream - const eventStream = this.interruptionStream.stream(); - const eventReader = eventStream.getReader(); - - const abortHandler = () => { - eventReader.releaseLock(); - this.interruptionStream?.close(); - signal.removeEventListener('abort', abortHandler); - }; - signal.addEventListener('abort', 
abortHandler); - - try { - while (!signal.aborted) { - const { done, value: ev } = await eventReader.read(); - if (done) break; - - this.logger.info({ type: ev.type, probability: ev.probability }, 'Interruption event'); - this.hooks.onInterruption(ev); - } - } catch (e) { - if (!signal.aborted) { - this.logger.error(e, 'Error in interruption task'); - } - } finally { - this.logger.debug('Interruption task closed'); - await forwardTask; - } - } - - /** - * Called when the agent starts speaking. - * Enables interruption detection by sending the agent-speech-started sentinel. - */ - onStartOfAgentSpeech(): void { - this.agentSpeaking = true; - - if (!this.interruptionEnabled || !this.interruptionStream) { - return; - } - - this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechStarted()); - } - - /** - * Called when the agent stops speaking. - * Disables interruption detection by sending the agent-speech-ended sentinel. - */ - onEndOfAgentSpeech(): void { - if (!this.interruptionEnabled || !this.interruptionStream) { - this.agentSpeaking = false; - return; - } - - this.interruptionStream.pushFrame(InterruptionStreamSentinel.speechEnded()); - - if (this.agentSpeaking) { - // No interruption was detected, end the overlap inference (idempotent) - this.onEndOfOverlapSpeech(); - } - - this.agentSpeaking = false; - } - - /** - * Called when user starts speaking while agent is speaking (overlap speech). - * This triggers the interruption detection inference. - */ - onStartOfOverlapSpeech(speechDuration: number, userSpeakingSpan?: Span): void { - if (!this.interruptionEnabled || !this.interruptionStream) { - return; - } - - if (this.agentSpeaking && userSpeakingSpan) { - this.interruptionStream.pushFrame( - InterruptionStreamSentinel.overlapSpeechStarted(speechDuration, userSpeakingSpan), - ); - } - } - - /** - * Called when user stops speaking during overlap. - * This ends the interruption detection inference for this overlap period. 
- */ - onEndOfOverlapSpeech(): void { - if (!this.interruptionEnabled || !this.interruptionStream) { - return; - } - - this.interruptionStream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded()); - } - setInputAudioStream(audioStream: ReadableStream) { this.deferredInputStream.setSource(audioStream); } @@ -847,8 +688,6 @@ export class AudioRecognition { await this.sttTask?.cancelAndWait(); await this.vadTask?.cancelAndWait(); await this.bounceEOUTask?.cancelAndWait(); - await this.interruptionTask?.cancelAndWait(); - await this.interruptionStream?.close(); } private _endUserTurnSpan({ From 94cd4c4a53435ea1da8d08d67d28cf0a0def32a0 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:43:45 +0100 Subject: [PATCH 17/25] reorganize --- .../AdaptiveInterruptionDetector.ts | 18 +--- .../interruption/InterruptionCacheEntry.ts | 44 +++++++++ .../interruption/InterruptionStream.ts | 65 +++++--------- agents/src/inference/interruption/defaults.ts | 2 +- agents/src/inference/interruption/errors.ts | 22 +++++ .../inference/interruption/http_transport.ts | 9 +- agents/src/inference/interruption/index.ts | 4 - agents/src/inference/interruption/types.ts | 86 ++++++++++++++++++ .../{ => interruption}/utils.test.ts | 0 .../{interruption.ts => utils.ts} | 90 +++---------------- .../inference/interruption/ws_transport.ts | 9 +- agents/src/inference/utils.ts | 15 ---- 12 files changed, 198 insertions(+), 166 deletions(-) create mode 100644 agents/src/inference/interruption/InterruptionCacheEntry.ts delete mode 100644 agents/src/inference/interruption/index.ts create mode 100644 agents/src/inference/interruption/types.ts rename agents/src/inference/{ => interruption}/utils.test.ts (100%) rename agents/src/inference/interruption/{interruption.ts => utils.ts} (55%) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index 0e137b15c..aa685dbe1 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -8,7 +8,8 @@ import { SAMPLE_RATE, interruptionOptionDefaults, } from './defaults.js'; -import { type InterruptionDetectionError, type InterruptionEvent } from './interruption.js'; +import type { InterruptionDetectionError } from './errors.js'; +import type { InterruptionEvent, InterruptionOptions } from './types.js'; type InterruptionCallbacks = { userInterruptionDetected: (event: InterruptionEvent) => void; @@ -17,21 +18,6 @@ type InterruptionCallbacks = { error: (error: InterruptionDetectionError) => void; }; -export interface InterruptionOptions { - sampleRate: number; - threshold: number; - minFrames: number; - maxAudioDurationInS: number; - audioPrefixDurationInS: number; - detectionIntervalInS: number; - inferenceTimeout: number; - minInterruptionDurationInS: number; - baseUrl: string; - apiKey: string; - apiSecret: string; - useProxy: boolean; -} - export type AdaptiveInterruptionDetectorOptions = Partial; export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { diff --git a/agents/src/inference/interruption/InterruptionCacheEntry.ts b/agents/src/inference/interruption/InterruptionCacheEntry.ts new file mode 100644 index 000000000..4f2f0a20b --- /dev/null +++ b/agents/src/inference/interruption/InterruptionCacheEntry.ts @@ -0,0 +1,44 @@ +import { estimateProbability } from './utils.js'; + +/** + * Typed cache entry for interruption inference results. 
+ * Mutable to support setOrUpdate pattern from Python's _BoundedCache. + */ +export class InterruptionCacheEntry { + createdAt: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + speechInput?: Int16Array; + probabilities?: number[]; + isInterruption?: boolean; + + constructor(params: { + createdAt: number; + speechInput?: Int16Array; + totalDurationInS?: number; + predictionDurationInS?: number; + detectionDelayInS?: number; + probabilities?: number[]; + isInterruption?: boolean; + }) { + this.createdAt = params.createdAt; + this.totalDurationInS = params.totalDurationInS ?? 0; + this.predictionDurationInS = params.predictionDurationInS ?? 0; + this.detectionDelayInS = params.detectionDelayInS ?? 0; + this.speechInput = params.speechInput; + this.probabilities = params.probabilities; + this.isInterruption = params.isInterruption; + } + + /** + * The conservative estimated probability of the interruption event. + */ + get probability(): number { + return this.probabilities ? estimateProbability(this.probabilities) : 0; + } + + static default(): InterruptionCacheEntry { + return new InterruptionCacheEntry({ createdAt: 0 }); + } +} diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 7d0bd8142..d2206b787 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -4,49 +4,36 @@ import { type ReadableStream, TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; import { traceTypes } from '../../telemetry/index.js'; -import type { - AdaptiveInterruptionDetector, - InterruptionOptions, -} from './AdaptiveInterruptionDetector.js'; +import type { AdaptiveInterruptionDetector } from './AdaptiveInterruptionDetector.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; +import type { InterruptionDetectionError } from './errors.js'; import { createHttpTransport } from './http_transport.js'; import { - BoundedCache, - InterruptionCacheEntry, - type InterruptionDetectionError, + type AgentSpeechEnded, + type AgentSpeechStarted, + type ApiConnectOptions, + type Flush, type InterruptionEvent, InterruptionEventType, -} from './interruption.js'; + type InterruptionOptions, + type InterruptionSentinel, + type OverlapSpeechEnded, + type OverlapSpeechStarted, +} from './types.js'; +import { BoundedCache } from './utils.js'; import { createWsTransport } from './ws_transport.js'; -export interface AgentSpeechStarted { - type: 'agent-speech-started'; -} - -export interface AgentSpeechEnded { - type: 'agent-speech-ended'; -} - -export interface OverlapSpeechStarted { - type: 'overlap-speech-started'; - speechDurationInS: number; - userSpeakingSpan: Span; -} - -export interface OverlapSpeechEnded { - type: 'overlap-speech-ended'; -} - -export interface Flush { - type: 'flush'; -} - -export type InterruptionSentinel = - | AgentSpeechStarted - | AgentSpeechEnded - | OverlapSpeechStarted - | OverlapSpeechEnded - | Flush; +// Re-export sentinel types for backwards compatibility +export type { + AgentSpeechEnded, + AgentSpeechStarted, + ApiConnectOptions, + Flush, + InterruptionSentinel, + OverlapSpeechEnded, + OverlapSpeechStarted, +}; export class InterruptionStreamSentinel { static speechStarted(): AgentSpeechStarted { 
@@ -73,12 +60,6 @@ export class InterruptionStreamSentinel { } } -export interface ApiConnectOptions { - maxRetries: number; - retryInterval: number; - timeout: number; -} - function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { span.setAttribute( traceTypes.ATTR_IS_INTERRUPTION, diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index 1a2beeb08..9dccf40c2 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -1,5 +1,5 @@ -import type { InterruptionOptions } from './AdaptiveInterruptionDetector.js'; import type { ApiConnectOptions } from './InterruptionStream.js'; +import type { InterruptionOptions } from './types.js'; export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames export const THRESHOLD = 0.65; diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts index e69de29bb..ba95a9674 100644 --- a/agents/src/inference/interruption/errors.ts +++ b/agents/src/inference/interruption/errors.ts @@ -0,0 +1,22 @@ +/** + * Error thrown during interruption detection. + */ +export class InterruptionDetectionError extends Error { + readonly type = 'InterruptionDetectionError'; + + readonly timestamp: number; + readonly label: string; + readonly recoverable: boolean; + + constructor(message: string, timestamp: number, label: string, recoverable: boolean) { + super(message); + this.name = 'InterruptionDetectionError'; + this.timestamp = timestamp; + this.label = label; + this.recoverable = recoverable; + } + + toString(): string { + return `${this.name}: ${this.message} (label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`; + } +} diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index b2419a514..96dfc03eb 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -2,13 +2,10 @@ import { ofetch } from 'ofetch'; import { TransformStream } from 'stream/web'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { intervalForRetry } from './defaults.js'; -import { - BoundedCache, - InterruptionCacheEntry, - type InterruptionEvent, - InterruptionEventType, -} from './interruption.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; export interface PostOptions { baseUrl: string; diff --git a/agents/src/inference/interruption/index.ts b/agents/src/inference/interruption/index.ts deleted file mode 100644 index 0d0bc4c4a..000000000 --- a/agents/src/inference/interruption/index.ts +++ /dev/null @@ -1,4 +0,0 @@ -export * from './AdaptiveInterruptionDetector.js'; -export * from './interruption.js'; -export { InterruptionStreamSentinel } from './InterruptionStream.js'; -export type { InterruptionSentinel } from './InterruptionStream.js'; diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts new file mode 100644 index 000000000..cf13d4d2d --- /dev/null +++ b/agents/src/inference/interruption/types.ts @@ -0,0 +1,86 @@ +import type { Span } from '@opentelemetry/api'; + +/** + * Event types for interruption detection. 
+ */ +export enum InterruptionEventType { + INTERRUPTION = 'interruption', + OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', +} + +/** + * Event emitted when an interruption is detected or overlap speech ends. + */ +export interface InterruptionEvent { + type: InterruptionEventType; + timestamp: number; + isInterruption: boolean; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + overlapSpeechStartedAt?: number; + speechInput?: Int16Array; + probabilities?: number[]; + probability: number; +} + +/** + * Configuration options for interruption detection. + */ +export interface InterruptionOptions { + sampleRate: number; + threshold: number; + minFrames: number; + maxAudioDurationInS: number; + audioPrefixDurationInS: number; + detectionIntervalInS: number; + inferenceTimeout: number; + minInterruptionDurationInS: number; + baseUrl: string; + apiKey: string; + apiSecret: string; + useProxy: boolean; +} + +/** + * API connection options for transport layers. + */ +export interface ApiConnectOptions { + maxRetries: number; + retryInterval: number; + timeout: number; +} + +// Sentinel types for stream control signals + +export interface AgentSpeechStarted { + type: 'agent-speech-started'; +} + +export interface AgentSpeechEnded { + type: 'agent-speech-ended'; +} + +export interface OverlapSpeechStarted { + type: 'overlap-speech-started'; + speechDurationInS: number; + userSpeakingSpan: Span; +} + +export interface OverlapSpeechEnded { + type: 'overlap-speech-ended'; +} + +export interface Flush { + type: 'flush'; +} + +/** + * Union type for all stream control signals. + */ +export type InterruptionSentinel = + | AgentSpeechStarted + | AgentSpeechEnded + | OverlapSpeechStarted + | OverlapSpeechEnded + | Flush; diff --git a/agents/src/inference/utils.test.ts b/agents/src/inference/interruption/utils.test.ts similarity index 100% rename from agents/src/inference/utils.test.ts rename to agents/src/inference/interruption/utils.test.ts diff --git a/agents/src/inference/interruption/interruption.ts b/agents/src/inference/interruption/utils.ts similarity index 55% rename from agents/src/inference/interruption/interruption.ts rename to agents/src/inference/interruption/utils.ts index 5c17ff534..161e08bb1 100644 --- a/agents/src/inference/interruption/interruption.ts +++ b/agents/src/inference/interruption/utils.ts @@ -1,4 +1,3 @@ -import { slidingWindowMinMax } from '../utils.js'; import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; /** @@ -106,44 +105,11 @@ export class BoundedCache { } } -export enum InterruptionEventType { - INTERRUPTION = 'interruption', - OVERLAP_SPEECH_ENDED = 'overlap_speech_ended', -} -export interface InterruptionEvent { - type: InterruptionEventType; - timestamp: number; - isInterruption: boolean; - totalDurationInS: number; - predictionDurationInS: number; - detectionDelayInS: number; - overlapSpeechStartedAt?: number; - speechInput?: Int16Array; - probabilities?: number[]; - probability: number; -} - -export class InterruptionDetectionError extends Error { - readonly type = 'InterruptionDetectionError'; - - readonly timestamp: number; - readonly label: string; - readonly recoverable: boolean; - - constructor(message: string, timestamp: number, label: string, recoverable: boolean) { - super(message); - this.name = 'InterruptionDetectionError'; - this.timestamp = timestamp; - this.label = label; - this.recoverable = recoverable; - } - - toString(): string { - return `${this.name}: ${this.message} 
(label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`; - } -} - -function estimateProbability( +/** + * Estimate probability using sliding window min-max algorithm. + * Returns a conservative estimate based on the minimum window size. + */ +export function estimateProbability( probabilities: number[], windowSizeInS: number = MIN_INTERRUPTION_DURATION_IN_S, ): number { @@ -155,45 +121,17 @@ function estimateProbability( return slidingWindowMinMax(probabilities, minWindow); } -/** - * Typed cache entry for interruption inference results. - * Mutable to support setOrUpdate pattern from Python's _BoundedCache. - */ -export class InterruptionCacheEntry { - createdAt: number; - totalDurationInS: number; - predictionDurationInS: number; - detectionDelayInS: number; - speechInput?: Int16Array; - probabilities?: number[]; - isInterruption?: boolean; - - constructor(params: { - createdAt: number; - speechInput?: Int16Array; - totalDurationInS?: number; - predictionDurationInS?: number; - detectionDelayInS?: number; - probabilities?: number[]; - isInterruption?: boolean; - }) { - this.createdAt = params.createdAt; - this.totalDurationInS = params.totalDurationInS ?? 0; - this.predictionDurationInS = params.predictionDurationInS ?? 0; - this.detectionDelayInS = params.detectionDelayInS ?? 0; - this.speechInput = params.speechInput; - this.probabilities = params.probabilities; - this.isInterruption = params.isInterruption; +export function slidingWindowMinMax(probabilities: number[], minWindow: number): number { + if (probabilities.length < minWindow) { + return -Infinity; } - /** - * The conservative estimated probability of the interruption event. - */ - get probability(): number { - return this.probabilities ? estimateProbability(this.probabilities) : 0; - } + let maxOfMins = -Infinity; - static default(): InterruptionCacheEntry { - return new InterruptionCacheEntry({ createdAt: 0 }); + for (let i = 0; i <= probabilities.length - minWindow; i++) { + const windowMin = Math.min(...probabilities.slice(i, i + minWindow)); + maxOfMins = Math.max(maxOfMins, windowMin); } + + return maxOfMins; } diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 3d76ff205..8f6409f02 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -3,13 +3,10 @@ import { TransformStream } from 'stream/web'; import WebSocket, { createWebSocketStream } from 'ws'; import { log } from '../../log.js'; import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './InterruptionCacheEntry.js'; import { intervalForRetry } from './defaults.js'; -import { - type BoundedCache, - InterruptionCacheEntry, - type InterruptionEvent, - InterruptionEventType, -} from './interruption.js'; +import { type InterruptionEvent, InterruptionEventType } from './types.js'; +import type { BoundedCache } from './utils.js'; // WebSocket message types const MSG_SESSION_CREATE = 'session.create'; diff --git a/agents/src/inference/utils.ts b/agents/src/inference/utils.ts index e898d4de1..b3b772ef6 100644 --- a/agents/src/inference/utils.ts +++ b/agents/src/inference/utils.ts @@ -64,18 +64,3 @@ export async function connectWs( socket.once('close', onClose); }); } - -export function slidingWindowMinMax(probabilities: number[], minWindow: number): number { - if (probabilities.length < minWindow) { - return -Infinity; - } - - let maxOfMins = -Infinity; - - for (let i = 0; i <= 
probabilities.length - minWindow; i++) { - const windowMin = Math.min(...probabilities.slice(i, i + minWindow)); - maxOfMins = Math.max(maxOfMins, windowMin); - } - - return maxOfMins; -} From 776cb300b70698d7f43bebef2d2ded01ba34723b Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:46:06 +0100 Subject: [PATCH 18/25] revert test changes --- .changeset/config.json | 1 + .changeset/shiny-eels-throw.md | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) delete mode 100644 .changeset/shiny-eels-throw.md diff --git a/.changeset/config.json b/.changeset/config.json index 6e26590ab..29b38eb85 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -8,6 +8,7 @@ ], "commit": false, "ignore": ["livekit-agents-examples"], + "fixed": [["@livekit/agents", "@livekit/agents-plugin-*", "@livekit/agents-plugins-test"]], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", diff --git a/.changeset/shiny-eels-throw.md b/.changeset/shiny-eels-throw.md deleted file mode 100644 index df3e21f67..000000000 --- a/.changeset/shiny-eels-throw.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -'@livekit/agents': patch ---- - -barge in From a580d7e020e4ca75a1e364865589bed30790c4aa Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 10:48:34 +0100 Subject: [PATCH 19/25] remove broken example --- examples/src/adaptive_interruption.ts | 109 -------------------------- 1 file changed, 109 deletions(-) delete mode 100644 examples/src/adaptive_interruption.ts diff --git a/examples/src/adaptive_interruption.ts b/examples/src/adaptive_interruption.ts deleted file mode 100644 index 6e6700f58..000000000 --- a/examples/src/adaptive_interruption.ts +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -/** - * This example demonstrates how to use the AdaptiveInterruptionDetector - * for detecting user interruptions during agent speech. - * - * The detector analyzes overlapping speech (when user speaks while agent is speaking) - * and determines whether the user intends to interrupt or is just providing backchannel - * feedback (like "uh-huh", "okay", etc). - * - * The interruption detection is integrated into AudioRecognition and works automatically - * when the detector is provided along with VAD. It: - * 1. Forwards audio frames to the detector when the agent is speaking - * 2. Triggers overlap detection when VAD detects user speech during agent speech - * 3. 
Emits interruption events that can be handled to stop/pause agent speech - */ -import { - AdaptiveInterruptionDetector, - type JobContext, - type JobProcess, - WorkerOptions, - cli, - defineAgent, - log, - voice, -} from '@livekit/agents'; -import * as silero from '@livekit/agents-plugin-silero'; -import { fileURLToPath } from 'node:url'; - -export default defineAgent({ - prewarm: async (proc: JobProcess) => { - proc.userData.vad = await silero.VAD.load(); - }, - entry: async (ctx: JobContext) => { - const logger = log(); - const vad = ctx.proc.userData.vad as silero.VAD; - - await ctx.connect(); - - // Create the adaptive interruption detector with custom options - const interruptionDetector = new AdaptiveInterruptionDetector({ - // Threshold for interruption classification (0-1) - // Higher = less sensitive, lower = more sensitive - threshold: 0.65, - // Minimum duration of overlap speech to consider as potential interruption - minInterruptionDuration: 0.05, - // Maximum audio duration to analyze (including prefix) - maxAudioDuration: 3.0, - // Audio context to include before overlap started - audioPrefixDuration: 0.5, - // How often to run inference during overlap - detectionInterval: 0.1, - }); - - // Listen for interruption events on the detector (optional - for logging/metrics) - interruptionDetector.on('interruptionDetected', () => { - logger.info('Interruption detected via detector event'); - }); - - interruptionDetector.on('overlapSpeechDetected', () => { - logger.info('Overlap speech ended without interruption (backchannel)'); - }); - - // Create the agent - const agent = new voice.Agent({ - instructions: `You are a helpful assistant that demonstrates interruption detection. - Speak naturally and respond to the user. When you are interrupted, - you will stop speaking and listen to the user.`, - }); - - // Create the session with interruption detection enabled - // The detector is passed to AgentSession which wires it through to AudioRecognition - const session = new voice.AgentSession({ - llm: 'openai/gpt-4.1-mini', - stt: 'deepgram/nova-3', - tts: 'cartesia/sonic-2:c45bc5ec-dc68-4feb-8829-6e6b2748095d', - vad, - // Pass the interruption detector - interruptionDetector, - voiceOptions: { - allowInterruptions: false, - }, - }); - - // Start the session - await session.start({ - agent, - room: ctx.room, - }); - - // // Example: Dynamically adjust threshold based on context - // // This could be useful to adapt to different conversation styles - // setTimeout(() => { - // logger.info('Adjusting interruption threshold for more sensitive detection'); - // interruptionDetector.updateOptions({ - // threshold: 0.5, // More sensitive to interruptions - // minInterruptionDuration: 0.03, // Detect shorter interruptions - // }); - // }, 30000); - - session.say( - 'Hello! I can detect when you want to interrupt me versus when you are just saying things like uh-huh or okay. 
Try talking while I am speaking to see how it works!', - ); - }, -}); - -cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); From dd0c98ac460ab4924453a0a85a99841521a91fe0 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:03:32 +0100 Subject: [PATCH 20/25] fix mutable transport options --- .../AdaptiveInterruptionDetector.ts | 2 +- .../interruption/InterruptionStream.ts | 32 +++++++++++++++---- .../inference/interruption/http_transport.ts | 8 ++++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index aa685dbe1..a473ec985 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -18,7 +18,7 @@ type InterruptionCallbacks = { error: (error: InterruptionDetectionError) => void; }; -export type AdaptiveInterruptionDetectorOptions = Partial; +export type AdaptiveInterruptionDetectorOptions = Omit, 'useProxy'>; export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter) { options: InterruptionOptions; diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index d2206b787..75d18729a 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -93,6 +93,12 @@ export class InterruptionStreamBase { // Store reconnect function for WebSocket transport private wsReconnect?: () => Promise; + // Mutable transport options that can be updated via updateOptions() + private transportOptions: { + threshold: number; + minFrames: number; + }; + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { this.inputStream = createStreamChannel< InterruptionSentinel | AudioFrame, @@ -103,6 +109,12 @@ export class InterruptionStreamBase { this.options = { ...model.options }; this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + // Initialize mutable transport options + this.transportOptions = { + threshold: this.options.threshold, + minFrames: this.options.minFrames, + }; + this.eventStream = this.setupTransform(); } @@ -115,10 +127,12 @@ export class InterruptionStreamBase { }): Promise { if (options.threshold !== undefined) { this.options.threshold = options.threshold; + this.transportOptions.threshold = options.threshold; } if (options.minInterruptionDurationInS !== undefined) { this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + this.transportOptions.minFrames = this.options.minFrames; } // Trigger WebSocket reconnection if using proxy (WebSocket transport) if (this.options.useProxy && this.wsReconnect) { @@ -259,24 +273,30 @@ export class InterruptionStreamBase { ); // Second transform: transport layer (HTTP or WebSocket based on useProxy) - const transportOptions = { + // Use a getter for threshold/minFrames so HTTP transport picks up updated values + const getTransportOptions = () => ({ baseUrl: this.options.baseUrl, apiKey: this.options.apiKey, apiSecret: this.options.apiSecret, sampleRate: this.options.sampleRate, - threshold: this.options.threshold, - minFrames: this.options.minFrames, + threshold: this.transportOptions.threshold, + minFrames: this.transportOptions.minFrames, timeout: this.options.inferenceTimeout, 
maxRetries: this.apiOptions.maxRetries, - }; + }); let transport: TransformStream; if (this.options.useProxy) { - const wsResult = createWsTransport(transportOptions, getState, setState, handleSpanUpdate); + const wsResult = createWsTransport( + getTransportOptions(), + getState, + setState, + handleSpanUpdate, + ); transport = wsResult.transport; this.wsReconnect = wsResult.reconnect; } else { - transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); + transport = createHttpTransport(getTransportOptions, getState, setState, handleSpanUpdate); } // Pipeline: input -> audioTransformer -> transport -> eventStream diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 96dfc03eb..41832878d 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -94,9 +94,12 @@ export interface HttpTransportState { * * This transport receives Int16Array audio slices and outputs InterruptionEvents. * Each audio slice triggers an HTTP POST request. + * + * @param getOptions - Getter function that returns current transport options. + * This allows options like threshold/minFrames to be updated dynamically. */ export function createHttpTransport( - options: HttpTransportOptions, + getOptions: () => HttpTransportOptions, getState: () => HttpTransportState, setState: (partial: Partial) => void, updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, @@ -115,6 +118,9 @@ export function createHttpTransport( const state = getState(); if (!state.overlapSpeechStartedAt) return; + // Get current options on each request to pick up any updates + const options = getOptions(); + try { const resp = await predictHTTP( chunk, From 00c16ffde82cf35baa0e958023ab4b3373ce7986 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:05:42 +0100 Subject: [PATCH 21/25] async fixes --- .../interruption/InterruptionStream.ts | 35 +++++++++---------- .../inference/interruption/http_transport.ts | 9 ++--- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 75d18729a..7be72ef5f 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -95,8 +95,14 @@ export class InterruptionStreamBase { // Mutable transport options that can be updated via updateOptions() private transportOptions: { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; threshold: number; minFrames: number; + timeout: number; + maxRetries: number; }; constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { @@ -111,8 +117,14 @@ export class InterruptionStreamBase { // Initialize mutable transport options this.transportOptions = { + baseUrl: this.options.baseUrl, + apiKey: this.options.apiKey, + apiSecret: this.options.apiSecret, + sampleRate: this.options.sampleRate, threshold: this.options.threshold, minFrames: this.options.minFrames, + timeout: this.options.inferenceTimeout, + maxRetries: this.apiOptions.maxRetries, }; this.eventStream = this.setupTransform(); @@ -273,30 +285,15 @@ export class InterruptionStreamBase { ); // Second transform: transport layer (HTTP or WebSocket based on useProxy) - // Use a getter for threshold/minFrames so HTTP transport picks up updated values - const getTransportOptions = () => ({ - baseUrl: 
this.options.baseUrl, - apiKey: this.options.apiKey, - apiSecret: this.options.apiSecret, - sampleRate: this.options.sampleRate, - threshold: this.transportOptions.threshold, - minFrames: this.transportOptions.minFrames, - timeout: this.options.inferenceTimeout, - maxRetries: this.apiOptions.maxRetries, - }); + const transportOptions = this.transportOptions; let transport: TransformStream; if (this.options.useProxy) { - const wsResult = createWsTransport( - getTransportOptions(), - getState, - setState, - handleSpanUpdate, - ); + const wsResult = createWsTransport(transportOptions, getState, setState, handleSpanUpdate); transport = wsResult.transport; this.wsReconnect = wsResult.reconnect; } else { - transport = createHttpTransport(getTransportOptions, getState, setState, handleSpanUpdate); + transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); } // Pipeline: input -> audioTransformer -> transport -> eventStream @@ -346,7 +343,7 @@ export class InterruptionStreamBase { async flush(): Promise { this.ensureStreamsNotEnded(); - this.inputStream.write(InterruptionStreamSentinel.flush()); + await this.inputStream.write(InterruptionStreamSentinel.flush()); } async endInput(): Promise { diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index 41832878d..c26107689 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -95,11 +95,11 @@ export interface HttpTransportState { * This transport receives Int16Array audio slices and outputs InterruptionEvents. * Each audio slice triggers an HTTP POST request. * - * @param getOptions - Getter function that returns current transport options. - * This allows options like threshold/minFrames to be updated dynamically. + * @param options - Transport options object. This is read on each request, so mutations + * to threshold/minFrames will be picked up dynamically. 
*/ export function createHttpTransport( - getOptions: () => HttpTransportOptions, + options: HttpTransportOptions, getState: () => HttpTransportState, setState: (partial: Partial) => void, updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, @@ -118,9 +118,6 @@ export function createHttpTransport( const state = getState(); if (!state.overlapSpeechStartedAt) return; - // Get current options on each request to pick up any updates - const options = getOptions(); - try { const resp = await predictHTTP( chunk, From 7dd5bbed15e93e24298d812c04c2fd33928bb5e6 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:08:12 +0100 Subject: [PATCH 22/25] terminate ws on timeout --- agents/src/inference/interruption/ws_transport.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 8f6409f02..93d7c568b 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -75,16 +75,17 @@ async function connectWebSocket(options: WsTransportOptions): Promise<{ const { readable, writable } = webSocketToStream(ws); await new Promise((resolve, reject) => { - const timeout = setTimeout( - () => reject(new Error('WebSocket connection timeout')), - options.timeout, - ); + const timeout = setTimeout(() => { + ws.terminate(); + reject(new Error('WebSocket connection timeout')); + }, options.timeout); ws.once('open', () => { clearTimeout(timeout); resolve(); }); ws.once('error', (err) => { clearTimeout(timeout); + ws.terminate(); reject(err); }); }); From 543742db029af52f135b891b14de78e80efa3fcd Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:11:51 +0100 Subject: [PATCH 23/25] more fixes --- agents/src/inference/interruption/ws_transport.ts | 8 +------- agents/src/stream/stream_channel.ts | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts index 93d7c568b..663a9b08e 100644 --- a/agents/src/inference/interruption/ws_transport.ts +++ b/agents/src/inference/interruption/ws_transport.ts @@ -183,13 +183,7 @@ export function createWsTransport( const message: WsMessage = JSON.parse(line); handleMessage(message); } catch { - // Try parsing the whole buffer as a single message - try { - const message: WsMessage = JSON.parse(line); - handleMessage(message); - } catch { - logger.warn({ line }, 'Failed to parse WebSocket message'); - } + logger.warn({ line }, 'Failed to parse WebSocket message'); } } } diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 546cf93ff..75fcfd6c7 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -21,6 +21,7 @@ export function createStreamChannel(): StreamChannel write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, abort: (error: E) => { + isClosed = true; return writer.abort(error); }, close: async () => { From 9f6132628153a05fdb9c395916e38c8f3625e9b3 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:19:52 +0100 Subject: [PATCH 24/25] add license headers --- .../src/inference/interruption/AdaptiveInterruptionDetector.ts | 3 +++ agents/src/inference/interruption/InterruptionCacheEntry.ts | 3 +++ agents/src/inference/interruption/InterruptionStream.ts | 3 +++ agents/src/inference/interruption/defaults.ts | 3 +++ 
agents/src/inference/interruption/errors.ts | 3 +++ agents/src/inference/interruption/http_transport.ts | 3 +++ agents/src/inference/interruption/types.ts | 3 +++ agents/src/inference/interruption/utils.test.ts | 2 +- agents/src/inference/interruption/utils.ts | 3 +++ 9 files changed, 25 insertions(+), 1 deletion(-) diff --git a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts index a473ec985..eb27a2482 100644 --- a/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts +++ b/agents/src/inference/interruption/AdaptiveInterruptionDetector.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import type { TypedEventEmitter } from '@livekit/typed-emitter'; import EventEmitter from 'events'; import { log } from '../../log.js'; diff --git a/agents/src/inference/interruption/InterruptionCacheEntry.ts b/agents/src/inference/interruption/InterruptionCacheEntry.ts index 4f2f0a20b..e6da964d8 100644 --- a/agents/src/inference/interruption/InterruptionCacheEntry.ts +++ b/agents/src/inference/interruption/InterruptionCacheEntry.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import { estimateProbability } from './utils.js'; /** diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 7be72ef5f..95565d446 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; import type { Span } from '@opentelemetry/api'; import { type ReadableStream, TransformStream } from 'stream/web'; diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts index 9dccf40c2..cd7988f6a 100644 --- a/agents/src/inference/interruption/defaults.ts +++ b/agents/src/inference/interruption/defaults.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import type { ApiConnectOptions } from './InterruptionStream.js'; import type { InterruptionOptions } from './types.js'; diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts index ba95a9674..a346b7d28 100644 --- a/agents/src/inference/interruption/errors.ts +++ b/agents/src/inference/interruption/errors.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 /** * Error thrown during interruption detection. */ diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts index c26107689..25f8b7c25 100644 --- a/agents/src/inference/interruption/http_transport.ts +++ b/agents/src/inference/interruption/http_transport.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 import { ofetch } from 'ofetch'; import { TransformStream } from 'stream/web'; import { log } from '../../log.js'; diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts index cf13d4d2d..f6f083f38 100644 --- a/agents/src/inference/interruption/types.ts +++ b/agents/src/inference/interruption/types.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import type { Span } from '@opentelemetry/api'; /** diff --git a/agents/src/inference/interruption/utils.test.ts b/agents/src/inference/interruption/utils.test.ts index bcd2fe9a8..762bc5ea3 100644 --- a/agents/src/inference/interruption/utils.test.ts +++ b/agents/src/inference/interruption/utils.test.ts @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from 'vitest'; diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts index 161e08bb1..0c5a4bf40 100644 --- a/agents/src/inference/interruption/utils.ts +++ b/agents/src/inference/interruption/utils.ts @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; /** From d32c2029593f7a7467761aaddafe43c4c8e81654 Mon Sep 17 00:00:00 2001 From: lukasIO Date: Thu, 22 Jan 2026 11:40:34 +0100 Subject: [PATCH 25/25] emit events on detector --- .../interruption/InterruptionStream.ts | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/agents/src/inference/interruption/InterruptionStream.ts b/agents/src/inference/interruption/InterruptionStream.ts index 95565d446..bdd9b178c 100644 --- a/agents/src/inference/interruption/InterruptionStream.ts +++ b/agents/src/inference/interruption/InterruptionStream.ts @@ -299,8 +299,23 @@ export class InterruptionStreamBase { transport = createHttpTransport(transportOptions, getState, setState, handleSpanUpdate); } + const eventEmitter = new TransformStream({ + transform: (chunk, controller) => { + if (chunk.type === InterruptionEventType.INTERRUPTION) { + this.model.emit('userInterruptionDetected', chunk); + } else if (chunk.type === InterruptionEventType.OVERLAP_SPEECH_ENDED) { + this.model.emit('overlapSpeechEnded', chunk); + } + controller.enqueue(chunk); + }, + }); + // Pipeline: input -> audioTransformer -> transport -> eventStream - return this.inputStream.stream().pipeThrough(audioTransformer).pipeThrough(transport); + return this.inputStream + .stream() + .pipeThrough(audioTransformer) + .pipeThrough(transport) + .pipeThrough(eventEmitter); } private ensureInputNotEnded() {
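
With this last patch the interruption events produced by the stream are mirrored onto the detector itself, so callers can react to interruptions without consuming the event stream directly. The sketch below shows how the pieces introduced across these patches could be wired together by hand. It is illustrative only: the deep import paths are assumptions (the interruption/ index barrel was removed in PATCH 17), it assumes the detector still exposes createStream() as in the pre-reorganize AudioRecognition wiring, the event names follow the emits added in PATCH 25, and the tracer name, speech duration, and frame source are placeholders — this is not the wiring AgentSession uses.

// Sketch only: assumed import paths and placeholder values, see note above.
import { trace } from '@opentelemetry/api';
import type { AudioFrame } from '@livekit/rtc-node';
import { AdaptiveInterruptionDetector } from './inference/interruption/AdaptiveInterruptionDetector.js';
import { InterruptionStreamSentinel } from './inference/interruption/InterruptionStream.js';

async function runInterruptionDetection(overlapFrames: AsyncIterable<AudioFrame>) {
  const detector = new AdaptiveInterruptionDetector({
    threshold: 0.65, // classification threshold (0-1); higher = less sensitive
    minInterruptionDurationInS: 0.05, // minimum overlap duration to count as an interruption
  });

  // Events re-emitted on the detector by InterruptionStreamBase.setupTransform() (PATCH 25).
  detector.on('userInterruptionDetected', (ev) => {
    console.log('interruption detected', ev.probability, ev.detectionDelayInS);
  });
  detector.on('overlapSpeechEnded', (ev) => {
    console.log('overlap ended without interruption', ev.totalDurationInS);
  });

  const stream = detector.createStream();

  // Agent begins speaking: arms detection, but no inference runs yet.
  await stream.pushFrame(InterruptionStreamSentinel.speechStarted());

  // VAD reports the user speaking over the agent: the overlap sentinel is what
  // actually starts interruption inference (in the agent this is driven from
  // AudioRecognition's VAD events together with the user-turn span).
  const span = trace.getTracer('example').startSpan('user_speaking');
  await stream.pushFrame(InterruptionStreamSentinel.overlapSpeechStarted(0.1, span));

  for await (const frame of overlapFrames) {
    await stream.pushFrame(frame); // user audio captured while both sides are speaking
  }

  // User stopped speaking during the overlap, then the agent finished its turn.
  await stream.pushFrame(InterruptionStreamSentinel.overlapSpeechEnded());
  await stream.pushFrame(InterruptionStreamSentinel.speechEnded());
  span.end();
  await stream.close();
}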