diff --git a/.changeset/config.json b/.changeset/config.json index af66336b2..29b38eb85 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -8,13 +8,7 @@ ], "commit": false, "ignore": ["livekit-agents-examples"], - "fixed": [ - [ - "@livekit/agents", - "@livekit/agents-plugin-*", - "@livekit/agents-plugins-test" - ] - ], + "fixed": [["@livekit/agents", "@livekit/agents-plugin-*", "@livekit/agents-plugins-test"]], "access": "public", "baseBranch": "main", "updateInternalDependencies": "patch", diff --git a/.changeset/metal-teeth-buy.md b/.changeset/metal-teeth-buy.md new file mode 100644 index 000000000..ef1520c3e --- /dev/null +++ b/.changeset/metal-teeth-buy.md @@ -0,0 +1,6 @@ +--- +'@livekit/agents': minor +--- + +- Add adaptive interruption handling +- Add remote session event handler diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5a577688..b4472c81b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,11 +46,11 @@ jobs: - name: Test agents if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name == 'push' run: pnpm test agents - - name: Test examples - if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') && secrets.OPENAI_API_KEY != '' - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: pnpm test:examples + # - name: Test examples + # if: (steps.filter.outputs.examples == 'true' || github.event_name == 'push') + # env: + # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # run: pnpm test:examples # TODO (AJS-83) Re-enable once plugins are refactored with abort controllers # - name: Test all plugins # if: steps.filter.outputs.agents-or-tests == 'true' || github.event_name != 'pull_request' diff --git a/agents/package.json b/agents/package.json index ebf94a7bb..c1bf3e5e7 100644 --- a/agents/package.json +++ b/agents/package.json @@ -46,9 +46,10 @@ "zod": "^3.25.76" }, "dependencies": { + "@bufbuild/protobuf": "^1.10.0", 
"@ffmpeg-installer/ffmpeg": "^1.1.0", "@livekit/mutex": "^1.1.1", - "@livekit/protocol": "^1.43.0", + "@livekit/protocol": "^1.45.1", "@livekit/typed-emitter": "^3.0.0", "@opentelemetry/api": "^1.9.0", "@opentelemetry/api-logs": "^0.54.0", @@ -69,6 +70,7 @@ "heap-js": "^2.6.0", "json-schema": "^0.4.0", "livekit-server-sdk": "^2.14.1", + "ofetch": "^1.5.1", "openai": "^6.8.1", "pidusage": "^4.0.1", "pino": "^8.19.0", diff --git a/agents/src/cli.ts b/agents/src/cli.ts index 1e53c16c0..058456e82 100644 --- a/agents/src/cli.ts +++ b/agents/src/cli.ts @@ -148,6 +148,7 @@ export const runApp = (opts: ServerOptions) => { opts.apiSecret = globalOptions.apiSecret || opts.apiSecret; opts.logLevel = commandOptions.logLevel; opts.workerToken = globalOptions.workerToken || opts.workerToken; + process.env.LIVEKIT_DEV_MODE = '1'; runServer({ opts, production: false, @@ -169,6 +170,7 @@ export const runApp = (opts: ServerOptions) => { opts.apiSecret = globalOptions.apiSecret || opts.apiSecret; opts.logLevel = commandOptions.logLevel; opts.workerToken = globalOptions.workerToken || opts.workerToken; + process.env.LIVEKIT_DEV_MODE = '1'; runServer({ opts, production: false, diff --git a/agents/src/constants.ts b/agents/src/constants.ts index 86ead5b4c..3da61af46 100644 --- a/agents/src/constants.ts +++ b/agents/src/constants.ts @@ -7,3 +7,17 @@ export const TOPIC_TRANSCRIPTION = 'lk.transcription'; export const ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID = 'lk.segment_id'; export const ATTRIBUTE_PUBLISH_ON_BEHALF = 'lk.publish_on_behalf'; export const TOPIC_CHAT = 'lk.chat'; + +export const ATTRIBUTE_AGENT_STATE = 'lk.agent.state'; +export const ATTRIBUTE_AGENT_NAME = 'lk.agent.name'; + +// TODO(eval): export const ATTRIBUTE_SIMULATOR = 'lk.simulator'; + +export const TOPIC_CLIENT_EVENTS = 'lk.agent.events'; +export const RPC_GET_SESSION_STATE = 'lk.agent.get_session_state'; +export const RPC_GET_CHAT_HISTORY = 'lk.agent.get_chat_history'; +export const RPC_GET_AGENT_INFO = 
'lk.agent.get_agent_info'; +export const RPC_SEND_MESSAGE = 'lk.agent.send_message'; +export const TOPIC_AGENT_REQUEST = 'lk.agent.request'; +export const TOPIC_AGENT_RESPONSE = 'lk.agent.response'; +export const TOPIC_SESSION_MESSAGES = 'lk.agent.session'; diff --git a/agents/src/inference/interruption/defaults.ts b/agents/src/inference/interruption/defaults.ts new file mode 100644 index 000000000..01d7b9290 --- /dev/null +++ b/agents/src/inference/interruption/defaults.ts @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { ApiConnectOptions } from './interruption_stream.js'; +import type { InterruptionOptions } from './types.js'; + +export const MIN_INTERRUPTION_DURATION_IN_S = 0.025 * 2; // 25ms per frame, 2 consecutive frames +export const THRESHOLD = 0.5; +export const MAX_AUDIO_DURATION_IN_S = 3.0; +export const AUDIO_PREFIX_DURATION_IN_S = 0.5; +export const DETECTION_INTERVAL_IN_S = 0.1; +export const REMOTE_INFERENCE_TIMEOUT_IN_S = 0.7; +export const SAMPLE_RATE = 16000; +export const FRAMES_PER_SECOND = 40; +export const FRAME_DURATION_IN_S = 0.025; // 25ms per frame + +export const apiConnectDefaults: ApiConnectOptions = { + maxRetries: 3, + retryInterval: 2_000, + timeout: 10_000, +} as const; + +/** + * Calculate the retry interval using exponential backoff with jitter. + * Matches the Python implementation's _interval_for_retry behavior. 
+ */ +export function intervalForRetry( + attempt: number, + baseInterval: number = apiConnectDefaults.retryInterval, +): number { + // Exponential backoff: baseInterval * 2^attempt with some jitter + const exponentialDelay = baseInterval * Math.pow(2, attempt); + // Add jitter (0-25% of the delay) + const jitter = exponentialDelay * Math.random() * 0.25; + return exponentialDelay + jitter; +} + +// baseUrl and useProxy are resolved dynamically in the constructor +// to respect LIVEKIT_REMOTE_EOT_URL environment variable +export const interruptionOptionDefaults: Omit<InterruptionOptions, 'baseUrl' | 'useProxy'> = { + sampleRate: SAMPLE_RATE, + threshold: THRESHOLD, + minFrames: Math.ceil(MIN_INTERRUPTION_DURATION_IN_S * FRAMES_PER_SECOND), + maxAudioDurationInS: MAX_AUDIO_DURATION_IN_S, + audioPrefixDurationInS: AUDIO_PREFIX_DURATION_IN_S, + detectionIntervalInS: DETECTION_INTERVAL_IN_S, + inferenceTimeout: REMOTE_INFERENCE_TIMEOUT_IN_S * 1_000, + apiKey: process.env.LIVEKIT_API_KEY || '', + apiSecret: process.env.LIVEKIT_API_SECRET || '', + minInterruptionDurationInS: MIN_INTERRUPTION_DURATION_IN_S, +} as const; diff --git a/agents/src/inference/interruption/errors.ts b/agents/src/inference/interruption/errors.ts new file mode 100644 index 000000000..5b5f6d370 --- /dev/null +++ b/agents/src/inference/interruption/errors.ts @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +/** + * Error thrown during interruption detection.
+ */ +export class InterruptionDetectionError extends Error { + readonly type = 'interruption_detection_error' as const; + + readonly timestamp: number; + readonly label: string; + readonly recoverable: boolean; + + constructor(message: string, timestamp: number, label: string, recoverable: boolean) { + super(message); + this.name = 'InterruptionDetectionError'; + this.timestamp = timestamp; + this.label = label; + this.recoverable = recoverable; + } + + toString(): string { + return `${this.name}: ${this.message} (label=${this.label}, timestamp=${this.timestamp}, recoverable=${this.recoverable})`; + } +} diff --git a/agents/src/inference/interruption/http_transport.ts b/agents/src/inference/interruption/http_transport.ts new file mode 100644 index 000000000..6d4cafbf5 --- /dev/null +++ b/agents/src/inference/interruption/http_transport.ts @@ -0,0 +1,206 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { FetchError, ofetch } from 'ofetch'; +import { TransformStream } from 'stream/web'; +import { z } from 'zod'; +import { APIConnectionError, APIError, APIStatusError, isAPIError } from '../../_exceptions.js'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import type { OverlappingSpeechEvent } from './types.js'; +import type { BoundedCache } from './utils.js'; + +export interface PostOptions { + baseUrl: string; + token: string; + signal?: AbortSignal; + timeout?: number; + maxRetries?: number; +} + +export interface PredictOptions { + threshold: number; + minFrames: number; +} + +export const predictEndpointResponseSchema = z.object({ + created_at: z.number(), + is_bargein: z.boolean(), + probabilities: z.array(z.number()), +}); + +export type PredictEndpointResponse = z.infer<typeof predictEndpointResponseSchema>; + +export interface PredictResponse { + createdAt: number; + isBargein: boolean; + probabilities: number[]; +
predictionDurationInS: number; +} + +export async function predictHTTP( + data: Int16Array, + predictOptions: PredictOptions, + options: PostOptions, +): Promise { + const createdAt = performance.now(); + const url = new URL(`/bargein`, options.baseUrl); + url.searchParams.append('threshold', predictOptions.threshold.toString()); + url.searchParams.append('min_frames', predictOptions.minFrames.toFixed()); + url.searchParams.append('created_at', createdAt.toFixed()); + + try { + const response = await ofetch(url.toString(), { + retry: 0, + headers: { + 'Content-Type': 'application/octet-stream', + Authorization: `Bearer ${options.token}`, + }, + signal: options.signal, + timeout: options.timeout, + method: 'POST', + body: data, + }); + const { created_at, is_bargein, probabilities } = predictEndpointResponseSchema.parse(response); + + return { + createdAt: created_at, + isBargein: is_bargein, + probabilities, + predictionDurationInS: (performance.now() - createdAt) / 1000, + }; + } catch (err) { + if (isAPIError(err)) throw err; + if (err instanceof FetchError) { + if (err.statusCode) { + throw new APIStatusError({ + message: `error during interruption prediction: ${err.message}`, + options: { statusCode: err.statusCode, body: err.data }, + }); + } + if ( + err.cause instanceof Error && + (err.cause.name === 'TimeoutError' || err.cause.name === 'AbortError') + ) { + throw new APIStatusError({ + message: `interruption inference timeout: ${err.message}`, + options: { statusCode: 408, retryable: false }, + }); + } + throw new APIConnectionError({ + message: `interruption inference connection error: ${err.message}`, + }); + } + throw new APIError(`error during interruption prediction: ${err}`); + } +} + +export interface HttpTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface HttpTransportState { + overlapSpeechStarted: boolean; + 
overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +/** + * Creates an HTTP transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * Each audio slice triggers an HTTP POST request. + * + * @param options - Transport options object. This is read on each request, so mutations + * to threshold/minFrames will be picked up dynamically. + */ +export function createHttpTransport( + options: HttpTransportOptions, + getState: () => HttpTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, + getAndResetNumRequests?: () => number, +): TransformStream { + const logger = log(); + + return new TransformStream( + { + async transform(chunk, controller) { + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + const state = getState(); + const overlapSpeechStartedAt = state.overlapSpeechStartedAt; + if (overlapSpeechStartedAt === undefined || !state.overlapSpeechStarted) return; + + try { + const resp = await predictHTTP( + chunk, + { threshold: options.threshold, minFrames: options.minFrames }, + { + baseUrl: options.baseUrl, + timeout: options.timeout, + maxRetries: options.maxRetries, + token: await createAccessToken(options.apiKey, options.apiSecret), + }, + ); + + const { createdAt, isBargein, probabilities, predictionDurationInS } = resp; + const entry = state.cache.setOrUpdate( + createdAt, + () => new InterruptionCacheEntry({ createdAt }), + { + probabilities, + isInterruption: isBargein, + speechInput: chunk, + totalDurationInS: (performance.now() - createdAt) / 1000, + detectionDelayInS: (Date.now() - overlapSpeechStartedAt) / 1000, + predictionDurationInS, + }, + ); + + if (state.overlapSpeechStarted && entry.isInterruption) { + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + const event: OverlappingSpeechEvent = { + type: 
'overlapping_speech', + detectedAt: Date.now(), + overlapStartedAt: overlapSpeechStartedAt, + isInterruption: entry.isInterruption, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + numRequests: getAndResetNumRequests?.() ?? 0, + }; + logger.debug( + { + detectionDelayInS: entry.detectionDelayInS, + totalDurationInS: entry.totalDurationInS, + }, + 'interruption detected', + ); + setState({ overlapSpeechStarted: false }); + controller.enqueue(event); + } + } catch (err) { + controller.error(err); + } + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); +} diff --git a/agents/src/inference/interruption/interruption_cache_entry.ts b/agents/src/inference/interruption/interruption_cache_entry.ts new file mode 100644 index 000000000..f318a04f5 --- /dev/null +++ b/agents/src/inference/interruption/interruption_cache_entry.ts @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { estimateProbability } from './utils.js'; + +/** + * Typed cache entry for interruption inference results. + * Mutable to support setOrUpdate pattern from Python's _BoundedCache. 
+ */ +export class InterruptionCacheEntry { + createdAt: number; + requestStartedAt?: number; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + speechInput?: Int16Array; + probabilities?: number[]; + isInterruption?: boolean; + + constructor(params: { + createdAt: number; + requestStartedAt?: number; + speechInput?: Int16Array; + totalDurationInS?: number; + predictionDurationInS?: number; + detectionDelayInS?: number; + probabilities?: number[]; + isInterruption?: boolean; + }) { + this.createdAt = params.createdAt; + this.requestStartedAt = params.requestStartedAt; + this.totalDurationInS = params.totalDurationInS ?? 0; + this.predictionDurationInS = params.predictionDurationInS ?? 0; + this.detectionDelayInS = params.detectionDelayInS ?? 0; + this.speechInput = params.speechInput; + this.probabilities = params.probabilities; + this.isInterruption = params.isInterruption; + } + + /** + * The conservative estimated probability of the interruption event. + */ + get probability(): number { + return this.probabilities ? estimateProbability(this.probabilities) : 0; + } + + static default(): InterruptionCacheEntry { + return new InterruptionCacheEntry({ createdAt: 0 }); + } +} diff --git a/agents/src/inference/interruption/interruption_detector.ts b/agents/src/inference/interruption/interruption_detector.ts new file mode 100644 index 000000000..d115918f6 --- /dev/null +++ b/agents/src/inference/interruption/interruption_detector.ts @@ -0,0 +1,204 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import type { TypedEventEmitter } from '@livekit/typed-emitter'; +import EventEmitter from 'events'; +import { log } from '../../log.js'; +import type { InterruptionMetrics } from '../../metrics/base.js'; +import { DEFAULT_INFERENCE_URL, STAGING_INFERENCE_URL, getDefaultInferenceUrl } from '../utils.js'; +import { FRAMES_PER_SECOND, SAMPLE_RATE, interruptionOptionDefaults } from './defaults.js'; +import { InterruptionDetectionError } from './errors.js'; +import { InterruptionStreamBase } from './interruption_stream.js'; +import type { InterruptionOptions, OverlappingSpeechEvent } from './types.js'; + +type InterruptionCallbacks = { + overlapping_speech: (event: OverlappingSpeechEvent) => void; + metrics_collected: (metrics: InterruptionMetrics) => void; + error: (error: InterruptionDetectionError) => void; +}; + +export type AdaptiveInterruptionDetectorOptions = Omit<Partial<InterruptionOptions>, 'useProxy'>; + +export class AdaptiveInterruptionDetector extends (EventEmitter as new () => TypedEventEmitter<InterruptionCallbacks>) { + options: InterruptionOptions; + private readonly _label: string; + private logger = log(); + // Use Set instead of WeakSet to allow iteration for propagating option updates + private streams: Set<InterruptionStreamBase> = new Set(); + + constructor(options: AdaptiveInterruptionDetectorOptions = {}) { + super(); + + const { + maxAudioDurationInS, + baseUrl, + apiKey, + apiSecret, + audioPrefixDurationInS, + threshold, + detectionIntervalInS, + inferenceTimeout, + minInterruptionDurationInS, + } = { ...interruptionOptionDefaults, ...options }; + + if (maxAudioDurationInS > 3.0) { + throw new RangeError('maxAudioDurationInS must be less than or equal to 3.0 seconds'); + } + + const lkBaseUrl = baseUrl ?? process.env.LIVEKIT_REMOTE_EOT_URL ?? getDefaultInferenceUrl(); + let lkApiKey = apiKey ?? ''; + let lkApiSecret = apiSecret ??
''; + let useProxy: boolean; + + // Use LiveKit credentials if using the inference service (production or staging) + const isInferenceUrl = + lkBaseUrl === DEFAULT_INFERENCE_URL || lkBaseUrl === STAGING_INFERENCE_URL; + if (isInferenceUrl) { + lkApiKey = + apiKey ?? process.env.LIVEKIT_INFERENCE_API_KEY ?? process.env.LIVEKIT_API_KEY ?? ''; + if (!lkApiKey) { + throw new TypeError( + 'apiKey is required, either as argument or set LIVEKIT_API_KEY environmental variable', + ); + } + + lkApiSecret = + apiSecret ?? + process.env.LIVEKIT_INFERENCE_API_SECRET ?? + process.env.LIVEKIT_API_SECRET ?? + ''; + if (!lkApiSecret) { + throw new TypeError( + 'apiSecret is required, either as argument or set LIVEKIT_API_SECRET environmental variable', + ); + } + useProxy = true; + } else { + useProxy = false; + } + const transport = useProxy ? 'websocket' : 'http'; + this.logger.debug( + { + baseUrl: lkBaseUrl, + useProxy, + transport, + }, + '=== Resolved interruption detector transport configuration', + ); + + this.options = { + sampleRate: SAMPLE_RATE, + threshold, + minFrames: Math.ceil(minInterruptionDurationInS * FRAMES_PER_SECOND), + maxAudioDurationInS, + audioPrefixDurationInS, + detectionIntervalInS, + inferenceTimeout, + baseUrl: lkBaseUrl, + apiKey: lkApiKey, + apiSecret: lkApiSecret, + useProxy, + minInterruptionDurationInS, + }; + + this._label = `${this.constructor.name}`; + + this.logger.debug( + { + baseUrl: this.options.baseUrl, + detectionIntervalInS: this.options.detectionIntervalInS, + audioPrefixDurationInS: this.options.audioPrefixDurationInS, + maxAudioDurationInS: this.options.maxAudioDurationInS, + minFrames: this.options.minFrames, + threshold: this.options.threshold, + inferenceTimeout: this.options.inferenceTimeout, + useProxy: this.options.useProxy, + transport, + }, + '=== Adaptive interruption detector initialized', + ); + } + + /** + * The model identifier for this detector. 
+ */ + get model(): string { + return 'adaptive interruption'; + } + + /** + * The provider identifier for this detector. + */ + get provider(): string { + return 'livekit'; + } + + /** + * The label for this detector instance. + */ + get label(): string { + return this._label; + } + + /** + * The sample rate used for audio processing. + */ + get sampleRate(): number { + return this.options.sampleRate; + } + + /** + * Emit an error event from the detector. + */ + emitError(error: InterruptionDetectionError): void { + this.emit('error', error); + } + + /** + * Creates a new InterruptionStreamBase for internal use. + * The stream can receive audio frames and sentinels via pushFrame(). + * Use this when you need direct access to the stream for pushing frames. + */ + createStream(): InterruptionStreamBase { + try { + const streamBase = new InterruptionStreamBase(this, {}); + this.streams.add(streamBase); + return streamBase; + } catch (e) { + const cause = e instanceof Error ? e : new Error(String(e)); + this.emitError(new InterruptionDetectionError(cause.message, Date.now(), this._label, false)); + throw e; + } + } + + /** + * Remove a stream from tracking (called when stream is closed). + */ + removeStream(stream: InterruptionStreamBase): void { + this.streams.delete(stream); + } + + /** + * Update options for the detector and propagate to all active streams. + * For WebSocket streams, this triggers a reconnection with new settings. 
+ */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise<void> { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + } + + // Propagate option updates to all active streams (matching Python behavior) + const updatePromises: Promise<void>[] = []; + for (const stream of this.streams) { + updatePromises.push(stream.updateOptions(options)); + } + await Promise.all(updatePromises); + } +} diff --git a/agents/src/inference/interruption/interruption_stream.ts b/agents/src/inference/interruption/interruption_stream.ts new file mode 100644 index 000000000..df6162aae --- /dev/null +++ b/agents/src/inference/interruption/interruption_stream.ts @@ -0,0 +1,467 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+// +// SPDX-License-Identifier: Apache-2.0 +import { AudioFrame, AudioResampler } from '@livekit/rtc-node'; +import type { Span } from '@opentelemetry/api'; +import { type ReadableStream, TransformStream } from 'stream/web'; +import { log } from '../../log.js'; +import type { InterruptionMetrics } from '../../metrics/base.js'; +import { type StreamChannel, createStreamChannel } from '../../stream/stream_channel.js'; +import { traceTypes } from '../../telemetry/index.js'; +import { FRAMES_PER_SECOND, apiConnectDefaults } from './defaults.js'; +import type { InterruptionDetectionError } from './errors.js'; +import { createHttpTransport } from './http_transport.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import type { AdaptiveInterruptionDetector } from './interruption_detector.js'; +import { + type AgentSpeechEnded, + type AgentSpeechStarted, + type ApiConnectOptions, + type Flush, + type InterruptionOptions, + type InterruptionSentinel, + type OverlapSpeechEnded, + type OverlapSpeechStarted, + type OverlappingSpeechEvent, +} from './types.js'; +import { BoundedCache } from './utils.js'; +import { createWsTransport } from './ws_transport.js'; + +// Re-export sentinel types for backwards compatibility +export type { + AgentSpeechEnded, + AgentSpeechStarted, + ApiConnectOptions, + Flush, + InterruptionSentinel, + OverlapSpeechEnded, + OverlapSpeechStarted, +}; + +export class InterruptionStreamSentinel { + static agentSpeechStarted(): AgentSpeechStarted { + return { type: 'agent-speech-started' }; + } + + static agentSpeechEnded(): AgentSpeechEnded { + return { type: 'agent-speech-ended' }; + } + + static overlapSpeechStarted( + speechDuration: number, + startedAt: number, + userSpeakingSpan?: Span, + ): OverlapSpeechStarted { + return { type: 'overlap-speech-started', speechDuration, startedAt, userSpeakingSpan }; + } + + static overlapSpeechEnded(endedAt: number): OverlapSpeechEnded { + return { type: 'overlap-speech-ended', 
endedAt }; + } + + static flush(): Flush { + return { type: 'flush' }; + } +} + +function updateUserSpeakingSpan(span: Span, entry: InterruptionCacheEntry) { + span.setAttribute( + traceTypes.ATTR_IS_INTERRUPTION, + (entry.isInterruption ?? false).toString().toLowerCase(), + ); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PROBABILITY, entry.probability); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_TOTAL_DURATION, entry.totalDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_PREDICTION_DURATION, entry.predictionDurationInS); + span.setAttribute(traceTypes.ATTR_INTERRUPTION_DETECTION_DELAY, entry.detectionDelayInS); +} + +export class InterruptionStreamBase { + private inputStream: StreamChannel; + + private eventStream: ReadableStream; + + private resampler?: AudioResampler; + + private numRequests = 0; + + private userSpeakingSpan: Span | undefined; + + private overlapSpeechStartedAt: number | undefined; + + private options: InterruptionOptions; + + private apiOptions: ApiConnectOptions; + + private model: AdaptiveInterruptionDetector; + + private logger = log(); + + // Store reconnect function for WebSocket transport + private wsReconnect?: () => Promise; + + // Mutable transport options that can be updated via updateOptions() + private transportOptions: { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries: number; + }; + + constructor(model: AdaptiveInterruptionDetector, apiOptions: Partial) { + this.inputStream = createStreamChannel< + InterruptionSentinel | AudioFrame, + InterruptionDetectionError + >(); + + this.model = model; + this.options = { ...model.options }; + this.apiOptions = { ...apiConnectDefaults, ...apiOptions }; + + // Initialize mutable transport options + this.transportOptions = { + baseUrl: this.options.baseUrl, + apiKey: this.options.apiKey, + apiSecret: this.options.apiSecret, + sampleRate: this.options.sampleRate, + 
threshold: this.options.threshold, + minFrames: this.options.minFrames, + timeout: this.options.inferenceTimeout, + maxRetries: this.apiOptions.maxRetries, + }; + + this.eventStream = this.setupTransform(); + } + + /** + * Update stream options. For WebSocket transport, this triggers a reconnection. + */ + async updateOptions(options: { + threshold?: number; + minInterruptionDurationInS?: number; + }): Promise { + if (options.threshold !== undefined) { + this.options.threshold = options.threshold; + this.transportOptions.threshold = options.threshold; + } + if (options.minInterruptionDurationInS !== undefined) { + this.options.minInterruptionDurationInS = options.minInterruptionDurationInS; + this.options.minFrames = Math.ceil(options.minInterruptionDurationInS * FRAMES_PER_SECOND); + this.transportOptions.minFrames = this.options.minFrames; + } + // Trigger WebSocket reconnection if using proxy (WebSocket transport) + if (this.options.useProxy && this.wsReconnect) { + await this.wsReconnect(); + } + } + + private setupTransform(): ReadableStream { + let agentSpeechStarted = false; + let startIdx = 0; + let accumulatedSamples = 0; + let overlapSpeechStarted = false; + let overlapCount = 0; + const cache = new BoundedCache(10); + const inferenceS16Data = new Int16Array( + Math.ceil(this.options.maxAudioDurationInS * this.options.sampleRate), + ).fill(0); + + // State accessors for transport + const getState = () => ({ + overlapSpeechStarted, + overlapSpeechStartedAt: this.overlapSpeechStartedAt, + cache, + overlapCount, + }); + const setState = (partial: { overlapSpeechStarted?: boolean }) => { + if (partial.overlapSpeechStarted !== undefined) { + overlapSpeechStarted = partial.overlapSpeechStarted; + } + }; + const handleSpanUpdate = (entry: InterruptionCacheEntry) => { + if (this.userSpeakingSpan) { + updateUserSpeakingSpan(this.userSpeakingSpan, entry); + this.userSpeakingSpan = undefined; + } + }; + + const onRequestSent = () => { + this.numRequests++; + }; + + 
const getAndResetNumRequests = (): number => { + const n = this.numRequests; + this.numRequests = 0; + return n; + }; + + // First transform: process input frames/sentinels and output audio slices or events + const audioTransformer = new TransformStream< + InterruptionSentinel | AudioFrame, + Int16Array | OverlappingSpeechEvent + >( + { + transform: (chunk, controller) => { + if (chunk instanceof AudioFrame) { + if (!agentSpeechStarted) { + return; + } + if (this.options.sampleRate !== chunk.sampleRate) { + controller.error('the sample rate of the input frames must be consistent'); + this.logger.error('the sample rate of the input frames must be consistent'); + return; + } + const result = writeToInferenceS16Data( + chunk, + startIdx, + inferenceS16Data, + this.options.maxAudioDurationInS, + ); + startIdx = result.startIdx; + accumulatedSamples += result.samplesWritten; + + if ( + accumulatedSamples >= + Math.floor(this.options.detectionIntervalInS * this.options.sampleRate) && + overlapSpeechStarted + ) { + const audioSlice = inferenceS16Data.slice(0, startIdx); + accumulatedSamples = 0; + controller.enqueue(audioSlice); + } + } else if (chunk.type === 'agent-speech-started') { + this.logger.debug('agent speech started'); + agentSpeechStarted = true; + overlapSpeechStarted = false; + this.overlapSpeechStartedAt = undefined; + accumulatedSamples = 0; + overlapCount = 0; + startIdx = 0; + this.numRequests = 0; + cache.clear(); + } else if (chunk.type === 'agent-speech-ended') { + this.logger.debug('agent speech ended'); + agentSpeechStarted = false; + overlapSpeechStarted = false; + this.overlapSpeechStartedAt = undefined; + accumulatedSamples = 0; + overlapCount = 0; + startIdx = 0; + this.numRequests = 0; + cache.clear(); + } else if (chunk.type === 'overlap-speech-started' && agentSpeechStarted) { + this.overlapSpeechStartedAt = chunk.startedAt; + this.userSpeakingSpan = chunk.userSpeakingSpan; + this.logger.debug('overlap speech started, starting interruption 
inference'); + overlapSpeechStarted = true; + accumulatedSamples = 0; + overlapCount += 1; + if (overlapCount <= 1) { + const keepSize = + Math.round((chunk.speechDuration / 1000) * this.options.sampleRate) + + Math.round(this.options.audioPrefixDurationInS * this.options.sampleRate); + const shiftCount = Math.max(0, startIdx - keepSize); + inferenceS16Data.copyWithin(0, shiftCount, startIdx); + startIdx -= shiftCount; + } + cache.clear(); + } else if (chunk.type === 'overlap-speech-ended') { + this.logger.debug('overlap speech ended'); + if (overlapSpeechStarted) { + this.userSpeakingSpan = undefined; + let latestEntry = cache.pop( + (entry) => entry.totalDurationInS !== undefined && entry.totalDurationInS > 0, + ); + if (!latestEntry) { + this.logger.debug('no request made for overlap speech'); + latestEntry = InterruptionCacheEntry.default(); + } + const e = latestEntry ?? InterruptionCacheEntry.default(); + const event: OverlappingSpeechEvent = { + type: 'overlapping_speech', + detectedAt: chunk.endedAt, + isInterruption: false, + overlapStartedAt: this.overlapSpeechStartedAt, + speechInput: e.speechInput, + probabilities: e.probabilities, + totalDurationInS: e.totalDurationInS, + detectionDelayInS: e.detectionDelayInS, + predictionDurationInS: e.predictionDurationInS, + probability: e.probability, + numRequests: getAndResetNumRequests(), + }; + controller.enqueue(event); + overlapSpeechStarted = false; + accumulatedSamples = 0; + } + this.overlapSpeechStartedAt = undefined; + } else if (chunk.type === 'flush') { + // no-op + } + }, + }, + { highWaterMark: 32 }, + { highWaterMark: 32 }, + ); + + // Second transform: transport layer (HTTP or WebSocket based on useProxy) + const transportOptions = this.transportOptions; + + let transport: TransformStream; + if (this.options.useProxy) { + const wsResult = createWsTransport( + transportOptions, + getState, + setState, + handleSpanUpdate, + onRequestSent, + getAndResetNumRequests, + ); + transport = 
wsResult.transport; + this.wsReconnect = wsResult.reconnect; + } else { + transport = createHttpTransport( + transportOptions, + getState, + setState, + handleSpanUpdate, + getAndResetNumRequests, + ); + } + + const eventEmitter = new TransformStream({ + transform: (chunk, controller) => { + this.model.emit('overlapping_speech', chunk); + + const metrics: InterruptionMetrics = { + type: 'interruption_metrics', + timestamp: chunk.detectedAt, + totalDuration: chunk.totalDurationInS * 1000, + predictionDuration: chunk.predictionDurationInS * 1000, + detectionDelay: chunk.detectionDelayInS * 1000, + numInterruptions: chunk.isInterruption ? 1 : 0, + numBackchannels: chunk.isInterruption ? 0 : 1, + numRequests: chunk.numRequests, + metadata: { + modelProvider: this.model.provider, + modelName: this.model.model, + }, + }; + this.model.emit('metrics_collected', metrics); + + controller.enqueue(chunk); + }, + }); + + // Pipeline: input -> audioTransformer -> transport -> eventEmitter -> eventStream + return this.inputStream + .stream() + .pipeThrough(audioTransformer) + .pipeThrough(transport) + .pipeThrough(eventEmitter); + } + + private ensureInputNotEnded() { + if (this.inputStream.closed) { + throw new Error('input stream is closed'); + } + } + + private ensureStreamsNotEnded() { + this.ensureInputNotEnded(); + } + + private getResamplerFor(inputSampleRate: number): AudioResampler { + if (!this.resampler) { + this.resampler = new AudioResampler(inputSampleRate, this.options.sampleRate); + } + return this.resampler; + } + + stream(): ReadableStream { + return this.eventStream; + } + + async pushFrame(frame: InterruptionSentinel | AudioFrame): Promise { + this.ensureStreamsNotEnded(); + if (!(frame instanceof AudioFrame)) { + return this.inputStream.write(frame); + } else if (this.options.sampleRate !== frame.sampleRate) { + const resampler = this.getResamplerFor(frame.sampleRate); + if (resampler.inputRate !== frame.sampleRate) { + throw new Error('the sample rate of the 
input frames must be consistent'); + } + for (const resampledFrame of resampler.push(frame)) { + await this.inputStream.write(resampledFrame); + } + } else { + await this.inputStream.write(frame); + } + } + + async flush(): Promise { + this.ensureStreamsNotEnded(); + await this.inputStream.write(InterruptionStreamSentinel.flush()); + } + + async endInput(): Promise { + await this.flush(); + await this.inputStream.close(); + } + + async close(): Promise { + if (!this.inputStream.closed) await this.inputStream.close(); + this.model.removeStream(this); + } +} + +/** + * Write the audio frame to the output data array and return the new start index + * and the number of samples written. + */ +function writeToInferenceS16Data( + frame: AudioFrame, + startIdx: number, + outData: Int16Array, + maxAudioDuration: number, +): { startIdx: number; samplesWritten: number } { + const maxWindowSize = Math.floor(maxAudioDuration * frame.sampleRate); + + if (frame.samplesPerChannel > outData.length) { + throw new Error('frame samples are greater than the max window size'); + } + + // Shift the data to the left if the window would overflow + const shift = startIdx + frame.samplesPerChannel - maxWindowSize; + if (shift > 0) { + outData.copyWithin(0, shift, startIdx); + startIdx -= shift; + } + + // Get the frame data as Int16Array + const frameData = new Int16Array( + frame.data.buffer, + frame.data.byteOffset, + frame.samplesPerChannel * frame.channels, + ); + + if (frame.channels > 1) { + // Mix down multiple channels to mono by averaging + for (let i = 0; i < frame.samplesPerChannel; i++) { + let sum = 0; + for (let ch = 0; ch < frame.channels; ch++) { + sum += frameData[i * frame.channels + ch] ?? 
0; + } + outData[startIdx + i] = Math.floor(sum / frame.channels); + } + } else { + // Single channel - copy directly + outData.set(frameData, startIdx); + } + + startIdx += frame.samplesPerChannel; + return { startIdx, samplesWritten: frame.samplesPerChannel }; +} diff --git a/agents/src/inference/interruption/types.ts b/agents/src/inference/interruption/types.ts new file mode 100644 index 000000000..d3aae7b95 --- /dev/null +++ b/agents/src/inference/interruption/types.ts @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { Span } from '@opentelemetry/api'; + +export interface OverlappingSpeechEvent { + type: 'overlapping_speech'; + detectedAt: number; + isInterruption: boolean; + totalDurationInS: number; + predictionDurationInS: number; + detectionDelayInS: number; + overlapStartedAt?: number; + speechInput?: Int16Array; + probabilities?: number[]; + probability: number; + numRequests: number; +} + +/** + * Configuration options for interruption detection. + */ +export interface InterruptionOptions { + sampleRate: number; + threshold: number; + minFrames: number; + maxAudioDurationInS: number; + audioPrefixDurationInS: number; + detectionIntervalInS: number; + inferenceTimeout: number; + minInterruptionDurationInS: number; + baseUrl: string; + apiKey: string; + apiSecret: string; + useProxy: boolean; +} + +/** + * API connection options for transport layers. + */ +export interface ApiConnectOptions { + maxRetries: number; + retryInterval: number; + timeout: number; +} + +// Sentinel types for stream control signals + +export interface AgentSpeechStarted { + type: 'agent-speech-started'; +} + +export interface AgentSpeechEnded { + type: 'agent-speech-ended'; +} + +export interface OverlapSpeechStarted { + type: 'overlap-speech-started'; + /** Duration of the speech segment in milliseconds (matches VADEvent.speechDuration units). 
*/ + speechDuration: number; + /** Absolute timestamp (ms) when overlap speech started, computed at call-site. */ + startedAt: number; + userSpeakingSpan?: Span; +} + +export interface OverlapSpeechEnded { + type: 'overlap-speech-ended'; + /** Absolute timestamp (ms) when overlap speech ended, used as the non-interruption event timestamp. */ + endedAt: number; +} + +export interface Flush { + type: 'flush'; +} + +/** + * Union type for all stream control signals. + */ +export type InterruptionSentinel = + | AgentSpeechStarted + | AgentSpeechEnded + | OverlapSpeechStarted + | OverlapSpeechEnded + | Flush; diff --git a/agents/src/inference/interruption/utils.test.ts b/agents/src/inference/interruption/utils.test.ts new file mode 100644 index 000000000..79b585fe6 --- /dev/null +++ b/agents/src/inference/interruption/utils.test.ts @@ -0,0 +1,132 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { BoundedCache } from './utils.js'; + +class Entry { + createdAt: number; + totalDurationInS: number | undefined = undefined; + predictionDurationInS: number | undefined = undefined; + note: string | undefined = undefined; + + constructor(createdAt: number, note?: string) { + this.createdAt = createdAt; + this.note = note; + } +} + +describe('BoundedCache', () => { + it('evicts oldest entry when maxLen is exceeded', () => { + const cache = new BoundedCache(2); + cache.set(1, new Entry(1)); + cache.set(2, new Entry(2)); + cache.set(3, new Entry(3)); + + expect(cache.size).toBe(2); + expect([...cache.keys()]).toEqual([2, 3]); + expect(cache.get(1)).toBeUndefined(); + expect(cache.get(2)!.createdAt).toBe(2); + expect(cache.get(3)!.createdAt).toBe(3); + }); + + it('setOrUpdate creates a value via factory when key is missing', () => { + const cache = new BoundedCache(10); + const factory = vi.fn(() => new Entry(100)); + + const value = cache.setOrUpdate(1, factory, { 
predictionDurationInS: 0.42 }); + + expect(factory).toHaveBeenCalledTimes(1); + expect(value.createdAt).toBe(100); + expect(value.predictionDurationInS).toBe(0.42); + expect(cache.get(1)?.predictionDurationInS).toBe(0.42); + }); + + it('setOrUpdate updates existing value and does not call factory', () => { + const cache = new BoundedCache(10); + cache.set(1, new Entry(1, 'before')); + const factory = vi.fn(() => new Entry(999)); + + const value = cache.setOrUpdate(1, factory, { note: 'after', totalDurationInS: 1.5 }); + + expect(factory).not.toHaveBeenCalled(); + expect(value.createdAt).toBe(1); + expect(value.note).toBe('after'); + expect(value.totalDurationInS).toBe(1.5); + }); + + it('updateValue returns undefined for missing key', () => { + const cache = new BoundedCache(10); + const result = cache.updateValue(404, { note: 'missing' }); + + expect(result).toBeUndefined(); + }); + + it('updateValue ignores undefined fields', () => { + const cache = new BoundedCache(10); + cache.set(1, new Entry(1, 'keep')); + + const result = cache.updateValue(1, { + note: undefined, + predictionDurationInS: 0.1, + }); + + expect(result?.createdAt).toBe(1); + expect(result?.note).toBe('keep'); + expect(result?.predictionDurationInS).toBe(0.1); + }); + + it('pop without predicate removes the oldest entry (python parity)', () => { + const cache = new BoundedCache(10); + cache.set(1, new Entry(1)); + cache.set(2, new Entry(2)); + cache.set(3, new Entry(3)); + + const popped = cache.pop(); + + expect(popped?.createdAt).toBe(1); + expect([...cache.keys()]).toEqual([2, 3]); + }); + + it('pop with predicate removes the most recent matching entry', () => { + const cache = new BoundedCache(10); + const e1 = new Entry(1); + e1.totalDurationInS = 0; + const e2 = new Entry(2); + e2.totalDurationInS = 1; + const e3 = new Entry(3); + e3.totalDurationInS = 2; + cache.set(1, e1); + cache.set(2, e2); + cache.set(3, e3); + + const popped = cache.pop((entry) => (entry.totalDurationInS ?? 
0) > 0); + + expect(popped?.createdAt).toBe(3); + expect(popped?.totalDurationInS).toBe(2); + expect([...cache.keys()]).toEqual([1, 2]); + }); + + it('pop with predicate returns undefined when no match exists', () => { + const cache = new BoundedCache(10); + const e1 = new Entry(1); + e1.totalDurationInS = 0; + cache.set(1, e1); + + const popped = cache.pop((entry) => (entry.totalDurationInS ?? 0) > 10); + + expect(popped).toBeUndefined(); + expect(cache.size).toBe(1); + }); + + it('clear removes all entries', () => { + const cache = new BoundedCache(10); + cache.set(1, new Entry(1)); + cache.set(2, new Entry(2)); + + cache.clear(); + + expect(cache.size).toBe(0); + expect([...cache.keys()]).toEqual([]); + }); +}); diff --git a/agents/src/inference/interruption/utils.ts b/agents/src/inference/interruption/utils.ts new file mode 100644 index 000000000..e614f3b6d --- /dev/null +++ b/agents/src/inference/interruption/utils.ts @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { FRAME_DURATION_IN_S, MIN_INTERRUPTION_DURATION_IN_S } from './defaults.js'; + +/** + * A bounded cache that automatically evicts the oldest entries when the cache exceeds max size. + * Uses FIFO eviction strategy. + */ +export class BoundedCache { + private cache: Map = new Map(); + private readonly maxLen: number; + + constructor(maxLen: number = 10) { + this.maxLen = maxLen; + } + + set(key: K, value: V): void { + this.cache.set(key, value); + if (this.cache.size > this.maxLen) { + // Remove the oldest entry (first inserted) + const firstKey = this.cache.keys().next().value as K; + this.cache.delete(firstKey); + } + } + + /** + * Update existing value fields if present and defined. + * Mirrors python BoundedDict.update_value behavior. 
+ */ + updateValue(key: K, fields: Partial): V | undefined { + const value = this.cache.get(key); + if (!value) return value; + + for (const [fieldName, fieldValue] of Object.entries(fields) as [keyof V, V[keyof V]][]) { + if (fieldValue === undefined) continue; + // Runtime field update parity with python's hasattr + setattr. + if (fieldName in (value as object)) { + (value as Record)[String(fieldName)] = fieldValue; + } + } + return value; + } + + /** + * Set a new value with factory when missing; otherwise update in place. + * Mirrors python BoundedDict.set_or_update behavior. + */ + setOrUpdate(key: K, factory: () => V, fields: Partial): V { + if (!this.cache.has(key)) { + this.set(key, factory()); + } + const result = this.updateValue(key, fields); + if (!result) { + throw new Error('setOrUpdate invariant failed: entry should exist after set'); + } + return result; + } + + get(key: K): V | undefined { + return this.cache.get(key); + } + + has(key: K): boolean { + return this.cache.has(key); + } + + delete(key: K): boolean { + return this.cache.delete(key); + } + + /** + * Pop an entry if it satisfies the predicate. 
+ * - No predicate: pop oldest (FIFO) + * - With predicate: search in reverse order and pop first match + */ + pop(predicate?: (value: V) => boolean): V | undefined { + if (predicate === undefined) { + const first = this.cache.entries().next().value as [K, V] | undefined; + if (!first) return undefined; + const [key, value] = first; + this.cache.delete(key); + return value; + } + + const keys = Array.from(this.cache.keys()); + for (let i = keys.length - 1; i >= 0; i--) { + const key = keys[i]!; + const value = this.cache.get(key)!; + if (predicate(value)) { + this.cache.delete(key); + return value; + } + } + return undefined; + } + + clear(): void { + this.cache.clear(); + } + + get size(): number { + return this.cache.size; + } + + values(): IterableIterator { + return this.cache.values(); + } + + keys(): IterableIterator { + return this.cache.keys(); + } + + entries(): IterableIterator<[K, V]> { + return this.cache.entries(); + } +} + +/** + * Estimate probability by finding the n-th maximum value in the probabilities array. + * The n-th position is determined by the window size (25ms per frame). + * Returns 0 if there are insufficient probabilities. + */ +export function estimateProbability( + probabilities: number[], + windowSizeInS: number = MIN_INTERRUPTION_DURATION_IN_S, +): number { + const nTh = Math.ceil(windowSizeInS / FRAME_DURATION_IN_S); + if (probabilities.length < nTh) { + return 0; + } + + // Find the n-th maximum value by sorting in descending order + // Create a copy to avoid mutating the original array + const sorted = [...probabilities].sort((a, b) => b - a); + return sorted[nTh - 1]!; +} diff --git a/agents/src/inference/interruption/ws_transport.ts b/agents/src/inference/interruption/ws_transport.ts new file mode 100644 index 000000000..ab708addd --- /dev/null +++ b/agents/src/inference/interruption/ws_transport.ts @@ -0,0 +1,406 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { TransformStream } from 'stream/web'; +import WebSocket from 'ws'; +import { z } from 'zod'; +import { APIConnectionError, APIStatusError, APITimeoutError } from '../../_exceptions.js'; +import { log } from '../../log.js'; +import { createAccessToken } from '../utils.js'; +import { InterruptionCacheEntry } from './interruption_cache_entry.js'; +import type { OverlappingSpeechEvent } from './types.js'; +import type { BoundedCache } from './utils.js'; + +// WebSocket message types +const MSG_SESSION_CREATE = 'session.create'; +const MSG_SESSION_CLOSE = 'session.close'; +const MSG_SESSION_CREATED = 'session.created'; +const MSG_SESSION_CLOSED = 'session.closed'; +const MSG_INTERRUPTION_DETECTED = 'bargein_detected'; +const MSG_INFERENCE_DONE = 'inference_done'; +const MSG_ERROR = 'error'; + +export interface WsTransportOptions { + baseUrl: string; + apiKey: string; + apiSecret: string; + sampleRate: number; + threshold: number; + minFrames: number; + timeout: number; + maxRetries?: number; +} + +export interface WsTransportState { + overlapSpeechStarted: boolean; + overlapSpeechStartedAt: number | undefined; + cache: BoundedCache; +} + +const wsMessageSchema = z.discriminatedUnion('type', [ + z.object({ + type: z.literal(MSG_SESSION_CREATED), + }), + z.object({ + type: z.literal(MSG_SESSION_CLOSED), + }), + z.object({ + type: z.literal(MSG_INTERRUPTION_DETECTED), + created_at: z.number(), + probabilities: z.array(z.number()).default([]), + prediction_duration: z.number().default(0), + }), + z.object({ + type: z.literal(MSG_INFERENCE_DONE), + created_at: z.number(), + probabilities: z.array(z.number()).default([]), + prediction_duration: z.number().default(0), + is_bargein: z.boolean().optional(), + }), + z.object({ + type: z.literal(MSG_ERROR), + message: z.string(), + code: z.number().optional(), + session_id: z.string().optional(), + }), +]); + +type WsMessage = z.infer; + +/** + * Creates a WebSocket connection 
and waits for it to open. + */ +async function connectWebSocket(options: WsTransportOptions): Promise { + const baseUrl = options.baseUrl.replace(/^http/, 'ws'); + const token = await createAccessToken(options.apiKey, options.apiSecret); + const url = `${baseUrl}/bargein`; + + const ws = new WebSocket(url, { + headers: { Authorization: `Bearer ${token}` }, + }); + + await new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + ws.terminate(); + reject( + new APITimeoutError({ + message: 'WebSocket connection timeout', + options: { retryable: false }, + }), + ); + }, options.timeout); + ws.once('open', () => { + clearTimeout(timeout); + resolve(); + }); + ws.once('unexpected-response', (_req, res) => { + clearTimeout(timeout); + ws.terminate(); + const statusCode = res.statusCode ?? -1; + reject( + new APIStatusError({ + message: `WebSocket connection rejected with status ${statusCode}`, + options: { statusCode, retryable: false }, + }), + ); + }); + ws.once('error', (err: Error) => { + clearTimeout(timeout); + ws.terminate(); + reject(new APIConnectionError({ message: `WebSocket connection error: ${err.message}` })); + }); + }); + + return ws; +} + +export interface WsTransportResult { + transport: TransformStream; + reconnect: () => Promise; +} + +/** + * Creates a WebSocket transport TransformStream for interruption detection. + * + * This transport receives Int16Array audio slices and outputs InterruptionEvents. + * It maintains a persistent WebSocket connection with automatic retry on failure. + * Returns both the transport and a reconnect function for option updates. 
+ */ +export function createWsTransport( + options: WsTransportOptions, + getState: () => WsTransportState, + setState: (partial: Partial) => void, + updateUserSpeakingSpan?: (entry: InterruptionCacheEntry) => void, + onRequestSent?: () => void, + getAndResetNumRequests?: () => number, +): WsTransportResult { + const logger = log(); + let ws: WebSocket | null = null; + let outputController: TransformStreamDefaultController | null = null; + + function setupMessageHandler(socket: WebSocket): void { + socket.on('message', (data: WebSocket.Data) => { + try { + const message = wsMessageSchema.parse(JSON.parse(data.toString())); + handleMessage(message); + } catch { + logger.warn({ data: data.toString() }, 'Failed to parse WebSocket message'); + } + }); + + socket.on('error', (err: Error) => { + outputController?.error( + new APIConnectionError({ message: `WebSocket error: ${err.message}` }), + ); + }); + + socket.on('close', (code: number, reason: Buffer) => { + logger.debug({ code, reason: reason.toString() }, 'WebSocket closed'); + }); + } + + async function ensureConnection(): Promise { + if (ws && ws.readyState === WebSocket.OPEN) return; + + ws = await connectWebSocket(options); + setupMessageHandler(ws); + + const sessionCreateMsg = JSON.stringify({ + type: MSG_SESSION_CREATE, + settings: { + sample_rate: options.sampleRate, + num_channels: 1, + threshold: options.threshold, + min_frames: options.minFrames, + encoding: 's16le', + }, + }); + ws.send(sessionCreateMsg); + } + + function handleMessage(message: WsMessage): void { + const state = getState(); + + switch (message.type) { + case MSG_SESSION_CREATED: + logger.debug('WebSocket session created'); + break; + + case MSG_INTERRUPTION_DETECTED: { + const createdAt = message.created_at; + const overlapSpeechStartedAt = state.overlapSpeechStartedAt; + if (state.overlapSpeechStarted && overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + + const totalDurationInS = + 
existing?.requestStartedAt !== undefined + ? (performance.now() - existing.requestStartedAt) / 1000 + : (performance.now() - createdAt) / 1000; + + const entry = state.cache.setOrUpdate( + createdAt, + () => new InterruptionCacheEntry({ createdAt }), + { + speechInput: existing?.speechInput, + requestStartedAt: existing?.requestStartedAt, + totalDurationInS, + probabilities: message.probabilities, + isInterruption: true, + predictionDurationInS: message.prediction_duration, + detectionDelayInS: (Date.now() - overlapSpeechStartedAt) / 1000, + }, + ); + + if (updateUserSpeakingSpan) { + updateUserSpeakingSpan(entry); + } + + logger.debug( + { + totalDuration: entry.totalDurationInS, + predictionDuration: entry.predictionDurationInS, + detectionDelay: entry.detectionDelayInS, + probability: entry.probability, + }, + 'interruption detected', + ); + + const event: OverlappingSpeechEvent = { + type: 'overlapping_speech', + detectedAt: Date.now(), + isInterruption: true, + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + overlapStartedAt: overlapSpeechStartedAt, + speechInput: entry.speechInput, + probabilities: entry.probabilities, + detectionDelayInS: entry.detectionDelayInS, + probability: entry.probability, + numRequests: getAndResetNumRequests?.() ?? 0, + }; + + outputController?.enqueue(event); + setState({ overlapSpeechStarted: false }); + } + break; + } + + case MSG_INFERENCE_DONE: { + const createdAt = message.created_at; + const overlapSpeechStartedAt = state.overlapSpeechStartedAt; + if (state.overlapSpeechStarted && overlapSpeechStartedAt !== undefined) { + const existing = state.cache.get(createdAt); + const totalDurationInS = + existing?.requestStartedAt !== undefined + ? 
(performance.now() - existing.requestStartedAt) / 1000 + : (performance.now() - createdAt) / 1000; + const entry = state.cache.setOrUpdate( + createdAt, + () => new InterruptionCacheEntry({ createdAt }), + { + speechInput: existing?.speechInput, + requestStartedAt: existing?.requestStartedAt, + totalDurationInS, + predictionDurationInS: message.prediction_duration, + probabilities: message.probabilities, + isInterruption: message.is_bargein ?? false, + detectionDelayInS: (Date.now() - overlapSpeechStartedAt) / 1000, + }, + ); + + logger.debug( + { + totalDurationInS: entry.totalDurationInS, + predictionDurationInS: entry.predictionDurationInS, + }, + 'interruption inference done', + ); + } + break; + } + + case MSG_SESSION_CLOSED: + logger.debug('WebSocket session closed'); + break; + + case MSG_ERROR: + outputController?.error( + new APIStatusError({ + message: `LiveKit Adaptive Interruption error: ${message.message}`, + options: { statusCode: message.code ?? -1 }, + }), + ); + break; + } + } + + function sendAudioData(audioSlice: Int16Array): void { + if (!ws || ws.readyState !== WebSocket.OPEN) { + throw new APIConnectionError({ message: 'WebSocket not connected' }); + } + + const state = getState(); + const createdAt = Math.floor(performance.now()); + + state.cache.set( + createdAt, + new InterruptionCacheEntry({ + createdAt, + requestStartedAt: performance.now(), + speechInput: audioSlice, + }), + ); + + const header = new ArrayBuffer(8); + const view = new DataView(header); + view.setUint32(0, createdAt >>> 0, true); + view.setUint32(4, Math.floor(createdAt / 0x100000000) >>> 0, true); + + const audioBytes = new Uint8Array( + audioSlice.buffer, + audioSlice.byteOffset, + audioSlice.byteLength, + ); + const combined = new Uint8Array(8 + audioBytes.length); + combined.set(new Uint8Array(header), 0); + combined.set(audioBytes, 8); + + ws.send(combined); + onRequestSent?.(); + } + + function close(): void { + if (ws?.readyState === WebSocket.OPEN) { + const 
closeMsg = JSON.stringify({ type: MSG_SESSION_CLOSE }); + try { + ws.send(closeMsg); + } catch (e: unknown) { + logger.error(e, 'failed to send close message'); + } + } + ws?.close(1000); // signal normal websocket closure + ws = null; + } + + /** + * Reconnect the WebSocket with updated options. + * This is called when options are updated via updateOptions(). + */ + async function reconnect(): Promise { + close(); + } + + const transport = new TransformStream< + Int16Array | OverlappingSpeechEvent, + OverlappingSpeechEvent + >( + { + async start(controller) { + outputController = controller; + await ensureConnection(); + }, + + transform(chunk, controller) { + if (!(chunk instanceof Int16Array)) { + controller.enqueue(chunk); + return; + } + + // Only forwards buffered audio while overlap speech is actively on. + const state = getState(); + if (!state.overlapSpeechStartedAt || !state.overlapSpeechStarted) return; + + if (options.timeout > 0) { + const now = performance.now(); + for (const [, entry] of state.cache.entries()) { + if (entry.totalDurationInS !== 0) continue; + if (now - entry.createdAt > options.timeout) { + controller.error( + new APIStatusError({ + message: `interruption inference timed out after ${((now - entry.createdAt) / 1000).toFixed(1)}s (ws)`, + options: { statusCode: 408, retryable: false }, + }), + ); + return; + } + break; + } + } + + try { + sendAudioData(chunk); + } catch (err) { + controller.error(err); + } + }, + + flush() { + close(); + }, + }, + { highWaterMark: 2 }, + { highWaterMark: 2 }, + ); + + return { transport, reconnect }; +} diff --git a/agents/src/inference/llm.ts b/agents/src/inference/llm.ts index 312a97a0c..b731672d4 100644 --- a/agents/src/inference/llm.ts +++ b/agents/src/inference/llm.ts @@ -4,12 +4,10 @@ import OpenAI from 'openai'; import { APIConnectionError, APIStatusError, APITimeoutError } from '../_exceptions.js'; import * as llm from '../llm/index.js'; -import { DEFAULT_API_CONNECT_OPTIONS } from 
'../types.js'; import type { APIConnectOptions } from '../types.js'; +import { DEFAULT_API_CONNECT_OPTIONS } from '../types.js'; import { type Expand, toError } from '../utils.js'; -import { type AnyString, createAccessToken } from './utils.js'; - -const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; +import { type AnyString, createAccessToken, getDefaultInferenceUrl } from './utils.js'; export type OpenAIModels = | 'openai/gpt-5.4' @@ -124,7 +122,7 @@ export class LLM extends llm.LLM { strictToolSchema = false, } = opts; - const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL; + const lkBaseURL = baseURL || getDefaultInferenceUrl(); const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY; if (!lkApiKey) { throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY'); @@ -160,6 +158,10 @@ export class LLM extends llm.LLM { return this.opts.model; } + get provider(): string { + return 'livekit'; + } + static fromModelString(modelString: string): LLM { return new LLM({ model: modelString }); } diff --git a/agents/src/inference/stt.ts b/agents/src/inference/stt.ts index e8b7b666d..1da16e179 100644 --- a/agents/src/inference/stt.ts +++ b/agents/src/inference/stt.ts @@ -23,7 +23,7 @@ import { type SttTranscriptEvent, sttServerEventSchema, } from './api_protos.js'; -import { type AnyString, connectWs, createAccessToken } from './utils.js'; +import { type AnyString, connectWs, createAccessToken, getDefaultInferenceUrl } from './utils.js'; export type DeepgramModels = | 'deepgram/flux-general' @@ -152,7 +152,6 @@ export type STTEncoding = 'pcm_s16le'; const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le'; const DEFAULT_SAMPLE_RATE = 16000; -const DEFAULT_BASE_URL = 'wss://agent-gateway.livekit.cloud/v1'; const DEFAULT_CANCEL_TIMEOUT = 5000; export interface InferenceSTTOptions { @@ -204,7 +203,7 @@ export class STT extends BaseSTT { connOptions, } = opts || {}; - const lkBaseURL = 
baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL; + const lkBaseURL = baseURL || getDefaultInferenceUrl(); const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY; if (!lkApiKey) { throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY'); @@ -253,6 +252,14 @@ export class STT extends BaseSTT { return 'inference.STT'; } + get model(): string { + return this.opts.model ?? 'auto'; + } + + get provider(): string { + return 'livekit'; + } + static fromModelString(modelString: string): STT { const [model, language] = parseSTTModelString(modelString); return new STT({ model, language }); diff --git a/agents/src/inference/tts.ts b/agents/src/inference/tts.ts index 8c32672cb..2e83c619f 100644 --- a/agents/src/inference/tts.ts +++ b/agents/src/inference/tts.ts @@ -20,7 +20,7 @@ import { ttsClientEventSchema, ttsServerEventSchema, } from './api_protos.js'; -import { type AnyString, connectWs, createAccessToken } from './utils.js'; +import { type AnyString, connectWs, createAccessToken, getDefaultInferenceUrl } from './utils.js'; export type CartesiaModels = | 'cartesia/sonic-3' @@ -144,7 +144,6 @@ type TTSEncoding = 'pcm_s16le'; const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le'; const DEFAULT_SAMPLE_RATE = 16000; -const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1'; const NUM_CHANNELS = 1; const DEFAULT_LANGUAGE = 'en'; @@ -201,7 +200,7 @@ export class TTS extends BaseTTS { connOptions, } = opts || {}; - const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL; + const lkBaseURL = baseURL || getDefaultInferenceUrl(); const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY; if (!lkApiKey) { throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY'); @@ -262,6 +261,14 @@ export class TTS extends BaseTTS { return 'inference.TTS'; } + get model(): string { + return this.opts.model ?? 
'unknown'; + } + + get provider(): string { + return 'livekit'; + } + static fromModelString(modelString: string): TTS { const [model, voice] = parseTTSModelString(modelString); return new TTS({ model, voice: voice || undefined }); diff --git a/agents/src/inference/utils.ts b/agents/src/inference/utils.ts index 801e89319..a80017c0b 100644 --- a/agents/src/inference/utils.ts +++ b/agents/src/inference/utils.ts @@ -7,6 +7,34 @@ import { APIConnectionError, APIStatusError } from '../_exceptions.js'; export type AnyString = string & NonNullable; +/** Default production inference URL */ +export const DEFAULT_INFERENCE_URL = 'https://agent-gateway.livekit.cloud/v1'; + +/** Staging inference URL */ +export const STAGING_INFERENCE_URL = 'https://agent-gateway.staging.livekit.cloud/v1'; + +/** + * Get the default inference URL based on the environment. + * + * Priority: + * 1. LIVEKIT_INFERENCE_URL if set + * 2. If LIVEKIT_URL contains '.staging.livekit.cloud', use staging gateway + * 3. Otherwise, use production gateway + */ +export function getDefaultInferenceUrl(): string { + const inferenceUrl = process.env.LIVEKIT_INFERENCE_URL; + if (inferenceUrl) { + return inferenceUrl; + } + + const livekitUrl = process.env.LIVEKIT_URL || ''; + if (livekitUrl.includes('.staging.livekit.cloud')) { + return STAGING_INFERENCE_URL; + } + + return DEFAULT_INFERENCE_URL; +} + export async function createAccessToken( apiKey: string, apiSecret: string, diff --git a/agents/src/job.ts b/agents/src/job.ts index e8ad3fac2..87e67195b 100644 --- a/agents/src/job.ts +++ b/agents/src/job.ts @@ -276,7 +276,7 @@ export class JobContext { jobId: this.job.id, roomId: this.job.room?.sid || '', room: this.job.room?.name || '', - options: targetSession.options, + options: targetSession.sessionOptions, events: targetSession._recordedEvents, enableRecording: targetSession._enableRecording, chatHistory: targetSession.history.copy(), diff --git a/agents/src/llm/chat_context.ts 
b/agents/src/llm/chat_context.ts index 5ac15c0c7..114b4ac75 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -81,6 +81,17 @@ export function createAudioContent(params: { }; } +export interface MetricsReport { + startedSpeakingAt?: number; + stoppedSpeakingAt?: number; + transcriptionDelay?: number; + endOfTurnDelay?: number; + onUserTurnCompletedDelay?: number; + llmNodeTtft?: number; + ttsNodeTtfb?: number; + e2eLatency?: number; +} + export class ChatMessage { readonly id: string; @@ -92,18 +103,24 @@ export class ChatMessage { interrupted: boolean; + transcriptConfidence?: number; + + extra: Record; + + metrics: MetricsReport; + hash?: Uint8Array; createdAt: number; - extra: Record; - constructor(params: { role: ChatRole; content: ChatContent[] | string; id?: string; interrupted?: boolean; createdAt?: number; + transcriptConfidence?: number; + metrics?: MetricsReport; extra?: Record; }) { const { @@ -112,6 +129,8 @@ export class ChatMessage { id = shortuuid('item_'), interrupted = false, createdAt = Date.now(), + transcriptConfidence, + metrics = {}, extra = {}, } = params; this.id = id; @@ -119,6 +138,8 @@ export class ChatMessage { this.content = Array.isArray(content) ? 
content : [content]; this.interrupted = interrupted; this.createdAt = createdAt; + this.transcriptConfidence = transcriptConfidence; + this.metrics = metrics; this.extra = extra; } @@ -128,6 +149,8 @@ export class ChatMessage { id?: string; interrupted?: boolean; createdAt?: number; + transcriptConfidence?: number; + metrics?: MetricsReport; extra?: Record; }) { return new ChatMessage(params); @@ -179,6 +202,16 @@ export class ChatMessage { result.createdAt = this.createdAt; } + if (this.transcriptConfidence !== undefined) { + result.transcriptConfidence = this.transcriptConfidence; + } + if (Object.keys(this.metrics).length > 0) { + result.metrics = { ...this.metrics }; + } + if (Object.keys(this.extra).length > 0) { + result.extra = this.extra as JSONValue; + } + return result; } } @@ -439,6 +472,8 @@ export class ChatContext { id?: string; interrupted?: boolean; createdAt?: number; + transcriptConfidence?: number; + metrics?: MetricsReport; extra?: Record; }): ChatMessage { const msg = new ChatMessage(params); @@ -623,6 +658,9 @@ export class ChatContext { id: item.id, interrupted: item.interrupted, createdAt: item.createdAt, + transcriptConfidence: item.transcriptConfidence, + metrics: item.metrics, + extra: item.extra, }); // Filter content based on options diff --git a/agents/src/llm/index.ts b/agents/src/llm/index.ts index 3eb5b2117..e68ee9380 100644 --- a/agents/src/llm/index.ts +++ b/agents/src/llm/index.ts @@ -30,6 +30,7 @@ export { type ChatItem, type ChatRole, type ImageContent, + type MetricsReport, } from './chat_context.js'; export type { ProviderFormat } from './provider_format/index.js'; diff --git a/agents/src/llm/llm.ts b/agents/src/llm/llm.ts index 624bea490..a71bd4714 100644 --- a/agents/src/llm/llm.ts +++ b/agents/src/llm/llm.ts @@ -65,6 +65,18 @@ export abstract class LLM extends (EventEmitter as new () => TypedEmitter { } return (usage?.completionTokens || 0) / (durationMs / 1000); })(), + metadata: { + modelProvider: this.#llm.provider, + 
modelName: this.#llm.model, + }, }; if (this.#llmRequestSpan) { diff --git a/agents/src/llm/realtime.ts b/agents/src/llm/realtime.ts index 5c132afd0..864e25d2d 100644 --- a/agents/src/llm/realtime.ts +++ b/agents/src/llm/realtime.ts @@ -73,6 +73,10 @@ export abstract class RealtimeModel { /** The model name/identifier used by this realtime model */ abstract get model(): string; + get provider(): string { + return 'unknown'; + } + abstract session(): RealtimeSession; abstract close(): Promise; diff --git a/agents/src/metrics/base.ts b/agents/src/metrics/base.ts index 7f6d6a0cc..1c9c317c1 100644 --- a/agents/src/metrics/base.ts +++ b/agents/src/metrics/base.ts @@ -2,13 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +export type MetricsMetadata = { + /** The provider name (e.g., 'openai', 'anthropic'). */ + modelProvider?: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + modelName?: string; +}; + export type AgentMetrics = | STTMetrics | LLMMetrics | TTSMetrics | VADMetrics | EOUMetrics - | RealtimeModelMetrics; + | RealtimeModelMetrics + | InterruptionMetrics; export type LLMMetrics = { type: 'llm_metrics'; @@ -26,6 +34,8 @@ export type LLMMetrics = { totalTokens: number; tokensPerSecond: number; speechId?: string; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type STTMetrics = { @@ -41,10 +51,16 @@ export type STTMetrics = { * The duration of the pushed audio in milliseconds. */ audioDurationMs: number; + /** Input audio tokens (for token-based billing). */ + inputTokens?: number; + /** Output text tokens (for token-based billing). */ + outputTokens?: number; /** * Whether the STT is streaming (e.g using websocket). */ streamed: boolean; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type TTSMetrics = { @@ -59,10 +75,17 @@ export type TTSMetrics = { /** Generated audio duration in milliseconds. 
*/ audioDurationMs: number; cancelled: boolean; + /** Number of characters synthesized (for character-based billing). */ charactersCount: number; + /** Input text tokens (for token-based billing, e.g., OpenAI TTS). */ + inputTokens?: number; + /** Output audio tokens (for token-based billing, e.g., OpenAI TTS). */ + outputTokens?: number; streamed: boolean; segmentId?: string; speechId?: string; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; }; export type VADMetrics = { @@ -133,6 +156,10 @@ export type RealtimeModelMetrics = { * The duration of the response from created to done in milliseconds. */ durationMs: number; + /** + * The duration of the session connection in milliseconds (for session-based billing like xAI). + */ + sessionDurationMs?: number; /** * Time to first audio token in milliseconds. -1 if no audio token was sent. */ @@ -165,4 +192,24 @@ export type RealtimeModelMetrics = { * Details about the output tokens used in the Response. */ outputTokenDetails: RealtimeModelMetricsOutputTokenDetails; + /** Metadata for model provider and name tracking. */ + metadata?: MetricsMetadata; +}; + +export type InterruptionMetrics = { + type: 'interruption_metrics'; + timestamp: number; + /** Latest RTT time taken to perform inference, in milliseconds. */ + totalDuration: number; + /** Latest time taken by the model side, in milliseconds. */ + predictionDuration: number; + /** Latest total time from onset of speech to final prediction, in milliseconds. */ + detectionDelay: number; + /** Number of interruptions detected (incremental). */ + numInterruptions: number; + /** Number of backchannels detected (incremental). */ + numBackchannels: number; + /** Number of requests sent to the model (incremental). 
*/ + numRequests: number; + metadata?: MetricsMetadata; }; diff --git a/agents/src/metrics/index.ts b/agents/src/metrics/index.ts index f400a9638..f3cce796c 100644 --- a/agents/src/metrics/index.ts +++ b/agents/src/metrics/index.ts @@ -5,11 +5,22 @@ export type { AgentMetrics, EOUMetrics, + InterruptionMetrics, LLMMetrics, + MetricsMetadata, RealtimeModelMetrics, STTMetrics, TTSMetrics, VADMetrics, } from './base.js'; +export { + filterZeroValues, + ModelUsageCollector, + type InterruptionModelUsage, + type LLMModelUsage, + type ModelUsage, + type STTModelUsage, + type TTSModelUsage, +} from './model_usage.js'; export { UsageCollector, type UsageSummary } from './usage_collector.js'; export { logMetrics } from './utils.js'; diff --git a/agents/src/metrics/model_usage.test.ts b/agents/src/metrics/model_usage.test.ts new file mode 100644 index 000000000..d2f983beb --- /dev/null +++ b/agents/src/metrics/model_usage.test.ts @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { beforeEach, describe, expect, it } from 'vitest'; +import type { LLMMetrics, RealtimeModelMetrics, STTMetrics, TTSMetrics } from './base.js'; +import { + type LLMModelUsage, + ModelUsageCollector, + type STTModelUsage, + type TTSModelUsage, + filterZeroValues, +} from './model_usage.js'; + +describe('model_usage', () => { + describe('filterZeroValues', () => { + it('should filter out zero values from LLMModelUsage', () => { + const usage: LLMModelUsage = { + type: 'llm_usage', + provider: 'openai', + model: 'gpt-4o', + inputTokens: 100, + inputCachedTokens: 0, + inputAudioTokens: 0, + inputCachedAudioTokens: 0, + inputTextTokens: 0, + inputCachedTextTokens: 0, + inputImageTokens: 0, + inputCachedImageTokens: 0, + outputTokens: 50, + outputAudioTokens: 0, + outputTextTokens: 0, + sessionDurationMs: 0, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('llm_usage'); + 
expect(filtered.provider).toBe('openai'); + expect(filtered.model).toBe('gpt-4o'); + expect(filtered.inputTokens).toBe(100); + expect(filtered.outputTokens).toBe(50); + // Zero values should be filtered out + expect(filtered.inputCachedTokens).toBeUndefined(); + expect(filtered.inputAudioTokens).toBeUndefined(); + expect(filtered.sessionDurationMs).toBeUndefined(); + }); + + it('should filter out zero values from TTSModelUsage', () => { + const usage: TTSModelUsage = { + type: 'tts_usage', + provider: 'elevenlabs', + model: 'eleven_turbo_v2', + inputTokens: 0, + outputTokens: 0, + charactersCount: 500, + audioDurationMs: 3000, + }; + + const filtered = filterZeroValues(usage); + + expect(filtered.type).toBe('tts_usage'); + expect(filtered.provider).toBe('elevenlabs'); + expect(filtered.charactersCount).toBe(500); + expect(filtered.audioDurationMs).toBe(3000); + expect(filtered.inputTokens).toBeUndefined(); + expect(filtered.outputTokens).toBeUndefined(); + }); + + it('should keep all values when none are zero', () => { + const usage: STTModelUsage = { + type: 'stt_usage', + provider: 'deepgram', + model: 'nova-2', + inputTokens: 10, + outputTokens: 20, + audioDurationMs: 5000, + }; + + const filtered = filterZeroValues(usage); + + expect(Object.keys(filtered)).toHaveLength(6); + expect(filtered).toEqual(usage); + }); + }); + + describe('ModelUsageCollector', () => { + let collector: ModelUsageCollector; + + beforeEach(() => { + collector = new ModelUsageCollector(); + }); + + describe('collect LLM metrics', () => { + it('should aggregate LLM metrics by provider and model', () => { + const metrics1: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 50, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const metrics2: LLMMetrics 
= { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 150, + ttftMs: 60, + cancelled: false, + completionTokens: 150, + promptTokens: 300, + promptCachedTokens: 75, + totalTokens: 450, + tokensPerSecond: 12, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o'); + expect(llmUsage.inputTokens).toBe(500); // 200 + 300 + expect(llmUsage.inputCachedTokens).toBe(125); // 50 + 75 + expect(llmUsage.outputTokens).toBe(250); // 100 + 150 + }); + + it('should separate metrics by different providers', () => { + const openaiMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const anthropicMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 120, + ttftMs: 55, + cancelled: false, + completionTokens: 80, + promptTokens: 150, + promptCachedTokens: 0, + totalTokens: 230, + tokensPerSecond: 8, + metadata: { + modelProvider: 'anthropic', + modelName: 'claude-3-5-sonnet', + }, + }; + + collector.collect(openaiMetrics); + collector.collect(anthropicMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(2); + + const openaiUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'openai', + ) as LLMModelUsage; + const anthropicUsage = usage.find( + (u) => u.type === 'llm_usage' && u.provider === 'anthropic', 
+ ) as LLMModelUsage; + + expect(openaiUsage.inputTokens).toBe(200); + expect(openaiUsage.outputTokens).toBe(100); + expect(anthropicUsage.inputTokens).toBe(150); + expect(anthropicUsage.outputTokens).toBe(80); + }); + }); + + describe('collect TTS metrics', () => { + it('should aggregate TTS metrics by provider and model', () => { + const metrics1: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + inputTokens: 10, + outputTokens: 20, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const metrics2: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 120, + durationMs: 600, + audioDurationMs: 4000, + cancelled: false, + charactersCount: 200, + inputTokens: 15, + outputTokens: 25, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const ttsUsage = usage[0] as TTSModelUsage; + expect(ttsUsage.type).toBe('tts_usage'); + expect(ttsUsage.provider).toBe('elevenlabs'); + expect(ttsUsage.model).toBe('eleven_turbo_v2'); + expect(ttsUsage.charactersCount).toBe(300); // 100 + 200 + expect(ttsUsage.audioDurationMs).toBe(7000); // 3000 + 4000 + expect(ttsUsage.inputTokens).toBe(25); // 10 + 15 + expect(ttsUsage.outputTokens).toBe(45); // 20 + 25 + }); + }); + + describe('collect STT metrics', () => { + it('should aggregate STT metrics by provider and model', () => { + const metrics1: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + inputTokens: 50, + outputTokens: 100, + streamed: true, + metadata: { + modelProvider: 
'deepgram', + modelName: 'nova-2', + }, + }; + + const metrics2: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 3000, + inputTokens: 30, + outputTokens: 60, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(metrics1); + collector.collect(metrics2); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const sttUsage = usage[0] as STTModelUsage; + expect(sttUsage.type).toBe('stt_usage'); + expect(sttUsage.provider).toBe('deepgram'); + expect(sttUsage.model).toBe('nova-2'); + expect(sttUsage.audioDurationMs).toBe(8000); // 5000 + 3000 + expect(sttUsage.inputTokens).toBe(80); // 50 + 30 + expect(sttUsage.outputTokens).toBe(160); // 100 + 60 + }); + }); + + describe('collect realtime model metrics', () => { + it('should aggregate realtime model metrics with detailed token breakdown', () => { + const metrics: RealtimeModelMetrics = { + type: 'realtime_model_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 1000, + ttftMs: 100, + cancelled: false, + inputTokens: 500, + outputTokens: 300, + totalTokens: 800, + tokensPerSecond: 10, + sessionDurationMs: 5000, + inputTokenDetails: { + audioTokens: 200, + textTokens: 250, + imageTokens: 50, + cachedTokens: 100, + cachedTokensDetails: { + audioTokens: 30, + textTokens: 50, + imageTokens: 20, + }, + }, + outputTokenDetails: { + textTokens: 200, + audioTokens: 100, + imageTokens: 0, + }, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o-realtime', + }, + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.type).toBe('llm_usage'); + expect(llmUsage.provider).toBe('openai'); + expect(llmUsage.model).toBe('gpt-4o-realtime'); + expect(llmUsage.inputTokens).toBe(500); + 
expect(llmUsage.inputCachedTokens).toBe(100); + expect(llmUsage.inputAudioTokens).toBe(200); + expect(llmUsage.inputCachedAudioTokens).toBe(30); + expect(llmUsage.inputTextTokens).toBe(250); + expect(llmUsage.inputCachedTextTokens).toBe(50); + expect(llmUsage.inputImageTokens).toBe(50); + expect(llmUsage.inputCachedImageTokens).toBe(20); + expect(llmUsage.outputTokens).toBe(300); + expect(llmUsage.outputTextTokens).toBe(200); + expect(llmUsage.outputAudioTokens).toBe(100); + expect(llmUsage.sessionDurationMs).toBe(5000); + }); + }); + + describe('mixed metrics collection', () => { + it('should collect and separate LLM, TTS, and STT metrics', () => { + const llmMetrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + const ttsMetrics: TTSMetrics = { + type: 'tts_metrics', + label: 'test', + requestId: 'req2', + timestamp: Date.now(), + ttfbMs: 100, + durationMs: 500, + audioDurationMs: 3000, + cancelled: false, + charactersCount: 100, + streamed: true, + metadata: { + modelProvider: 'elevenlabs', + modelName: 'eleven_turbo_v2', + }, + }; + + const sttMetrics: STTMetrics = { + type: 'stt_metrics', + label: 'test', + requestId: 'req3', + timestamp: Date.now(), + durationMs: 0, + audioDurationMs: 5000, + streamed: true, + metadata: { + modelProvider: 'deepgram', + modelName: 'nova-2', + }, + }; + + collector.collect(llmMetrics); + collector.collect(ttsMetrics); + collector.collect(sttMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(3); + + const llmUsage = usage.find((u) => u.type === 'llm_usage'); + const ttsUsage = usage.find((u) => u.type === 'tts_usage'); + const sttUsage = usage.find((u) => u.type === 'stt_usage'); + + 
expect(llmUsage).toBeDefined(); + expect(ttsUsage).toBeDefined(); + expect(sttUsage).toBeDefined(); + }); + }); + + describe('flatten returns copies', () => { + it('should return deep copies of usage objects', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + metadata: { + modelProvider: 'openai', + modelName: 'gpt-4o', + }, + }; + + collector.collect(metrics); + + const usage1 = collector.flatten(); + const usage2 = collector.flatten(); + + // Should be equal values + expect(usage1[0]).toEqual(usage2[0]); + + // But not the same object reference + expect(usage1[0]).not.toBe(usage2[0]); + + // Modifying one shouldn't affect the other + (usage1[0] as LLMModelUsage).inputTokens = 9999; + expect((usage2[0] as LLMModelUsage).inputTokens).toBe(200); + }); + }); + + describe('handles missing metadata', () => { + it('should use empty strings when metadata is missing', () => { + const metrics: LLMMetrics = { + type: 'llm_metrics', + label: 'test', + requestId: 'req1', + timestamp: Date.now(), + durationMs: 100, + ttftMs: 50, + cancelled: false, + completionTokens: 100, + promptTokens: 200, + promptCachedTokens: 0, + totalTokens: 300, + tokensPerSecond: 10, + // No metadata + }; + + collector.collect(metrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(1); + + const llmUsage = usage[0] as LLMModelUsage; + expect(llmUsage.provider).toBe(''); + expect(llmUsage.model).toBe(''); + }); + }); + + describe('ignores VAD and EOU metrics', () => { + it('should not collect VAD metrics', () => { + const vadMetrics = { + type: 'vad_metrics' as const, + label: 'test', + timestamp: Date.now(), + idleTimeMs: 100, + inferenceDurationTotalMs: 50, + inferenceCount: 10, + }; + + collector.collect(vadMetrics); + + const usage = 
collector.flatten(); + expect(usage).toHaveLength(0); + }); + + it('should not collect EOU metrics', () => { + const eouMetrics = { + type: 'eou_metrics' as const, + timestamp: Date.now(), + endOfUtteranceDelayMs: 100, + transcriptionDelayMs: 50, + onUserTurnCompletedDelayMs: 30, + lastSpeakingTimeMs: Date.now(), + }; + + collector.collect(eouMetrics); + + const usage = collector.flatten(); + expect(usage).toHaveLength(0); + }); + }); + }); +}); diff --git a/agents/src/metrics/model_usage.ts b/agents/src/metrics/model_usage.ts new file mode 100644 index 000000000..5e723fb51 --- /dev/null +++ b/agents/src/metrics/model_usage.ts @@ -0,0 +1,262 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { + AgentMetrics, + InterruptionMetrics, + LLMMetrics, + RealtimeModelMetrics, + STTMetrics, + TTSMetrics, +} from './base.js'; + +export type LLMModelUsage = { + type: 'llm_usage'; + /** The provider name (e.g., 'openai', 'anthropic'). */ + provider: string; + /** The model name (e.g., 'gpt-4o', 'claude-3-5-sonnet'). */ + model: string; + /** Total input tokens. */ + inputTokens: number; + /** Input tokens served from cache. */ + inputCachedTokens: number; + /** Input audio tokens (for multimodal models). */ + inputAudioTokens: number; + /** Cached input audio tokens. */ + inputCachedAudioTokens: number; + /** Input text tokens. */ + inputTextTokens: number; + /** Cached input text tokens. */ + inputCachedTextTokens: number; + /** Input image tokens (for multimodal models). */ + inputImageTokens: number; + /** Cached input image tokens. */ + inputCachedImageTokens: number; + /** Total output tokens. */ + outputTokens: number; + /** Output audio tokens (for multimodal models). */ + outputAudioTokens: number; + /** Output text tokens. */ + outputTextTokens: number; + /** Total session connection duration in milliseconds (for session-based billing like xAI). 
*/ + sessionDurationMs: number; +}; + +export type TTSModelUsage = { + type: 'tts_usage'; + /** The provider name (e.g., 'elevenlabs', 'cartesia'). */ + provider: string; + /** The model name (e.g., 'eleven_turbo_v2', 'sonic'). */ + model: string; + /** Input text tokens (for token-based TTS billing, e.g., OpenAI TTS). */ + inputTokens: number; + /** Output audio tokens (for token-based TTS billing, e.g., OpenAI TTS). */ + outputTokens: number; + /** Number of characters synthesized (for character-based TTS billing). */ + charactersCount: number; + /** + * Duration of generated audio in milliseconds. + */ + audioDurationMs: number; +}; + +export type STTModelUsage = { + type: 'stt_usage'; + /** The provider name (e.g., 'deepgram', 'assemblyai'). */ + provider: string; + /** The model name (e.g., 'nova-2', 'best'). */ + model: string; + /** Input audio tokens (for token-based STT billing). */ + inputTokens: number; + /** Output text tokens (for token-based STT billing). */ + outputTokens: number; + /** Duration of processed audio in milliseconds. */ + audioDurationMs: number; +}; + +export type InterruptionModelUsage = { + type: 'interruption_usage'; + /** The provider name (e.g., 'livekit'). */ + provider: string; + /** The model name (e.g., 'adaptive interruption'). */ + model: string; + /** Total number of requests sent. */ + totalRequests: number; +}; + +export type ModelUsage = LLMModelUsage | TTSModelUsage | STTModelUsage | InterruptionModelUsage; + +export function filterZeroValues(usage: T): Partial { + const result: Partial = {} as Partial; + for (const [key, value] of Object.entries(usage)) { + if (value !== 0 && value !== 0.0) { + (result as Record)[key] = value; + } + } + return result; +} + +export class ModelUsageCollector { + private llmUsage: Map = new Map(); + private ttsUsage: Map = new Map(); + private sttUsage: Map = new Map(); + + private interruptionUsage: Map = new Map(); + + /** Extract provider and model from metrics metadata. 
*/ + private extractProviderModel( + metrics: LLMMetrics | STTMetrics | TTSMetrics | RealtimeModelMetrics | InterruptionMetrics, + ): [string, string] { + let provider = ''; + let model = ''; + if (metrics.metadata) { + provider = metrics.metadata.modelProvider || ''; + model = metrics.metadata.modelName || ''; + } + return [provider, model]; + } + + /** Get or create an LLMModelUsage for the given provider/model combination. */ + private getLLMUsage(provider: string, model: string): LLMModelUsage { + const key = `${provider}:${model}`; + let usage = this.llmUsage.get(key); + if (!usage) { + usage = { + type: 'llm_usage', + provider, + model, + inputTokens: 0, + inputCachedTokens: 0, + inputAudioTokens: 0, + inputCachedAudioTokens: 0, + inputTextTokens: 0, + inputCachedTextTokens: 0, + inputImageTokens: 0, + inputCachedImageTokens: 0, + outputTokens: 0, + outputAudioTokens: 0, + outputTextTokens: 0, + sessionDurationMs: 0, + }; + this.llmUsage.set(key, usage); + } + return usage; + } + + /** Get or create a TTSModelUsage for the given provider/model combination. */ + private getTTSUsage(provider: string, model: string): TTSModelUsage { + const key = `${provider}:${model}`; + let usage = this.ttsUsage.get(key); + if (!usage) { + usage = { + type: 'tts_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + charactersCount: 0, + audioDurationMs: 0, + }; + this.ttsUsage.set(key, usage); + } + return usage; + } + + /** Get or create an STTModelUsage for the given provider/model combination. 
*/ + private getSTTUsage(provider: string, model: string): STTModelUsage { + const key = `${provider}:${model}`; + let usage = this.sttUsage.get(key); + if (!usage) { + usage = { + type: 'stt_usage', + provider, + model, + inputTokens: 0, + outputTokens: 0, + audioDurationMs: 0, + }; + this.sttUsage.set(key, usage); + } + return usage; + } + + private getInterruptionUsage(provider: string, model: string): InterruptionModelUsage { + const key = `${provider}:${model}`; + let usage = this.interruptionUsage.get(key); + if (!usage) { + usage = { + type: 'interruption_usage', + provider, + model, + totalRequests: 0, + }; + this.interruptionUsage.set(key, usage); + } + return usage; + } + + /** Collect metrics and aggregate usage by model/provider. */ + collect(metrics: AgentMetrics): void { + if (metrics.type === 'llm_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.promptTokens; + usage.inputCachedTokens += metrics.promptCachedTokens; + usage.outputTokens += metrics.completionTokens; + } else if (metrics.type === 'realtime_model_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getLLMUsage(provider, model); + usage.inputTokens += metrics.inputTokens; + usage.inputCachedTokens += metrics.inputTokenDetails.cachedTokens; + + usage.inputTextTokens += metrics.inputTokenDetails.textTokens; + usage.inputCachedTextTokens += metrics.inputTokenDetails.cachedTokensDetails?.textTokens ?? 0; + usage.inputImageTokens += metrics.inputTokenDetails.imageTokens; + usage.inputCachedImageTokens += + metrics.inputTokenDetails.cachedTokensDetails?.imageTokens ?? 0; + usage.inputAudioTokens += metrics.inputTokenDetails.audioTokens; + usage.inputCachedAudioTokens += + metrics.inputTokenDetails.cachedTokensDetails?.audioTokens ?? 
0; + + usage.outputTextTokens += metrics.outputTokenDetails.textTokens; + usage.outputAudioTokens += metrics.outputTokenDetails.audioTokens; + usage.outputTokens += metrics.outputTokens; + usage.sessionDurationMs += metrics.sessionDurationMs ?? 0; + } else if (metrics.type === 'tts_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const ttsUsage = this.getTTSUsage(provider, model); + ttsUsage.inputTokens += metrics.inputTokens ?? 0; + ttsUsage.outputTokens += metrics.outputTokens ?? 0; + ttsUsage.charactersCount += metrics.charactersCount; + ttsUsage.audioDurationMs += metrics.audioDurationMs; + } else if (metrics.type === 'stt_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const sttUsage = this.getSTTUsage(provider, model); + sttUsage.inputTokens += metrics.inputTokens ?? 0; + sttUsage.outputTokens += metrics.outputTokens ?? 0; + sttUsage.audioDurationMs += metrics.audioDurationMs; + } else if (metrics.type === 'interruption_metrics') { + const [provider, model] = this.extractProviderModel(metrics); + const usage = this.getInterruptionUsage(provider, model); + usage.totalRequests += metrics.numRequests; + } + // VAD and EOU metrics are not aggregated for usage tracking. + } + + flatten(): ModelUsage[] { + const result: ModelUsage[] = []; + for (const u of this.llmUsage.values()) { + result.push({ ...u }); + } + for (const u of this.ttsUsage.values()) { + result.push({ ...u }); + } + for (const u of this.sttUsage.values()) { + result.push({ ...u }); + } + for (const u of this.interruptionUsage.values()) { + result.push({ ...u }); + } + return result; + } +} diff --git a/agents/src/metrics/usage_collector.ts b/agents/src/metrics/usage_collector.ts index c7f0e6c3d..c815c8394 100644 --- a/agents/src/metrics/usage_collector.ts +++ b/agents/src/metrics/usage_collector.ts @@ -1,8 +1,13 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
// // SPDX-License-Identifier: Apache-2.0 +import { log } from '../log.js'; import type { AgentMetrics } from './base.js'; +/** + * @deprecated Use LLMModelUsage, TTSModelUsage, or STTModelUsage instead. + * These new types provide per-model/provider usage aggregation for more detailed tracking. + */ export interface UsageSummary { llmPromptTokens: number; llmPromptCachedTokens: number; @@ -11,10 +16,16 @@ export interface UsageSummary { sttAudioDurationMs: number; } +/** + * @deprecated Use ModelUsageCollector instead. + * ModelUsageCollector provides per-model/provider usage aggregation for more detailed tracking. + */ export class UsageCollector { private summary: UsageSummary; + private logger = log(); constructor() { + this.logger.warn('UsageCollector is deprecated. Use ModelUsageCollector instead.'); this.summary = { llmPromptTokens: 0, llmPromptCachedTokens: 0, diff --git a/agents/src/metrics/utils.ts b/agents/src/metrics/utils.ts index cf98f8d1d..ced021e63 100644 --- a/agents/src/metrics/utils.ts +++ b/agents/src/metrics/utils.ts @@ -60,5 +60,16 @@ export const logMetrics = (metrics: AgentMetrics) => { audioDurationMs: Math.round(metrics.audioDurationMs), }) .info('STT metrics'); + } else if (metrics.type === 'interruption_metrics') { + logger + .child({ + totalDurationMs: roundTwoDecimals(metrics.totalDuration), + predictionDurationMs: roundTwoDecimals(metrics.predictionDuration), + detectionDelayMs: roundTwoDecimals(metrics.detectionDelay), + numInterruptions: metrics.numInterruptions, + numBackchannels: metrics.numBackchannels, + numRequests: metrics.numRequests, + }) + .info('Interruption metrics'); } }; diff --git a/agents/src/stream/multi_input_stream.test.ts b/agents/src/stream/multi_input_stream.test.ts index cda78b62b..fb648ff32 100644 --- a/agents/src/stream/multi_input_stream.test.ts +++ b/agents/src/stream/multi_input_stream.test.ts @@ -2,7 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 import { ReadableStream } from 'node:stream/web'; 
-import { describe, expect, it } from 'vitest'; +import { beforeAll, describe, expect, it } from 'vitest'; +import { initializeLogger } from '../log.js'; import { delay } from '../utils.js'; import { MultiInputStream } from './multi_input_stream.js'; @@ -16,6 +17,10 @@ function streamFrom(values: T[]): ReadableStream { } describe('MultiInputStream', () => { + beforeAll(() => { + initializeLogger({ pretty: false }); + }); + // --------------------------------------------------------------------------- // Basic functionality // --------------------------------------------------------------------------- diff --git a/agents/src/stream/stream_channel.ts b/agents/src/stream/stream_channel.ts index 1fb68bab2..edaeaa856 100644 --- a/agents/src/stream/stream_channel.ts +++ b/agents/src/stream/stream_channel.ts @@ -4,14 +4,16 @@ import type { ReadableStream } from 'node:stream/web'; import { IdentityTransform } from './identity_transform.js'; -export interface StreamChannel { +export interface StreamChannel { write(chunk: T): Promise; close(): Promise; stream(): ReadableStream; + abort(error: E): Promise; readonly closed: boolean; + addStreamInput(stream: ReadableStream): void; } -export function createStreamChannel(): StreamChannel { +export function createStreamChannel(): StreamChannel { const transform = new IdentityTransform(); const writer = transform.writable.getWriter(); let isClosed = false; @@ -19,6 +21,36 @@ export function createStreamChannel(): StreamChannel { return { write: (chunk: T) => writer.write(chunk), stream: () => transform.readable, + abort: async (error: E) => { + if (isClosed) return; + isClosed = true; + try { + await writer.abort(error); + } catch (e) { + if (e instanceof Error && e.name === 'TypeError') return; + throw e; + } + }, + addStreamInput: (newInputStream) => { + if (isClosed) return; + const reader = newInputStream.getReader(); + (async () => { + try { + while (!isClosed) { + const { done, value } = await reader.read(); + if (done) 
break; + await writer.write(value); + } + } catch (err) { + if (!isClosed) { + isClosed = true; + await writer.abort(err as E); + } + } finally { + reader.releaseLock(); + } + })().catch(() => {}); + }, close: async () => { try { const result = await writer.close(); diff --git a/agents/src/stt/stt.ts b/agents/src/stt/stt.ts index b7a4a7ea1..72cdb4a26 100644 --- a/agents/src/stt/stt.ts +++ b/agents/src/stt/stt.ts @@ -67,6 +67,10 @@ export interface SpeechData { export interface RecognitionUsage { /** Duration of the audio that was recognized in seconds. */ audioDuration: number; + /** Input audio tokens (for token-based STT billing). */ + inputTokens?: number; + /** Output text tokens (for token-based STT billing). */ + outputTokens?: number; } /** SpeechEvent is a packet of speech-to-text data. */ @@ -129,6 +133,30 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter { const startTime = process.hrtime.bigint(); @@ -142,6 +170,10 @@ export abstract class STT extends (EventEmitter as new () => TypedEmitter durationMs: 0, label: this.#stt.label, audioDurationMs: Math.round(event.recognitionUsage!.audioDuration * 1000), + inputTokens: event.recognitionUsage!.inputTokens ?? 0, + outputTokens: event.recognitionUsage!.outputTokens ?? 
0, streamed: true, + metadata: { + modelProvider: this.#stt.provider, + modelName: this.#stt.model, + }, }; this.#stt.emit('metrics_collected', metrics); } diff --git a/agents/src/telemetry/otel_http_exporter.ts b/agents/src/telemetry/otel_http_exporter.ts index ae6b6590e..43f01faea 100644 --- a/agents/src/telemetry/otel_http_exporter.ts +++ b/agents/src/telemetry/otel_http_exporter.ts @@ -58,6 +58,16 @@ export class SimpleOTLPHttpLogExporter { private readonly config: SimpleOTLPHttpLogExporterConfig; private jwt: string | null = null; + private static readonly FORCE_DOUBLE_KEYS = new Set([ + 'transcriptConfidence', + 'transcriptionDelay', + 'endOfTurnDelay', + 'onUserTurnCompletedDelay', + 'llmNodeTtft', + 'ttsNodeTtfb', + 'e2eLatency', + ]); + constructor(config: SimpleOTLPHttpLogExporterConfig) { this.config = config; } @@ -72,6 +82,7 @@ export class SimpleOTLPHttpLogExporter { const endpoint = `https://${this.config.cloudHostname}/observability/logs/otlp/v0`; const payload = this.buildPayload(records); + const payloadJson = JSON.stringify(payload); const response = await fetch(endpoint, { method: 'POST', @@ -79,7 +90,7 @@ export class SimpleOTLPHttpLogExporter { Authorization: `Bearer ${this.jwt}`, 'Content-Type': 'application/json', }, - body: JSON.stringify(payload), + body: payloadJson, }); if (!response.ok) { @@ -160,11 +171,11 @@ export class SimpleOTLPHttpLogExporter { ): Array<{ key: string; value: unknown }> { return Object.entries(attrs).map(([key, value]) => ({ key, - value: this.convertValue(value), + value: this.convertValue(value, key), })); } - private convertValue(value: unknown): unknown { + private convertValue(value: unknown, path: string = ''): unknown { if (value === null || value === undefined) { return { stringValue: '' }; } @@ -172,20 +183,32 @@ export class SimpleOTLPHttpLogExporter { return { stringValue: value }; } if (typeof value === 'number') { + const leafKey = + path + .split('.') + .pop() + ?.replace(/\[\d+\]$/, '') ?? 
path; + if (SimpleOTLPHttpLogExporter.FORCE_DOUBLE_KEYS.has(leafKey)) { + return { doubleValue: value }; + } return Number.isInteger(value) ? { intValue: String(value) } : { doubleValue: value }; } if (typeof value === 'boolean') { return { boolValue: value }; } if (Array.isArray(value)) { - return { arrayValue: { values: value.map((v) => this.convertValue(v)) } }; + return { + arrayValue: { + values: value.map((v, i) => this.convertValue(v, `${path}[${i}]`)), + }, + }; } if (typeof value === 'object') { return { kvlistValue: { values: Object.entries(value as Record).map(([k, v]) => ({ key: k, - value: this.convertValue(v), + value: this.convertValue(v, path ? `${path}.${k}` : k), })), }, }; diff --git a/agents/src/telemetry/trace_types.ts b/agents/src/telemetry/trace_types.ts index 1663bd75d..cc4d89443 100644 --- a/agents/src/telemetry/trace_types.ts +++ b/agents/src/telemetry/trace_types.ts @@ -33,6 +33,7 @@ export const ATTR_PROVIDER_TOOLS = 'lk.provider_tools'; export const ATTR_TOOL_SETS = 'lk.tool_sets'; export const ATTR_RESPONSE_TEXT = 'lk.response.text'; export const ATTR_RESPONSE_FUNCTION_CALLS = 'lk.response.function_calls'; +/** Time to first token in seconds. */ export const ATTR_RESPONSE_TTFT = 'lk.response.ttft'; // function tool @@ -46,6 +47,7 @@ export const ATTR_FUNCTION_TOOL_OUTPUT = 'lk.function_tool.output'; export const ATTR_TTS_INPUT_TEXT = 'lk.input_text'; export const ATTR_TTS_STREAMING = 'lk.tts.streaming'; export const ATTR_TTS_LABEL = 'lk.tts.label'; +/** Time to first byte in seconds. 
*/ export const ATTR_RESPONSE_TTFB = 'lk.response.ttfb'; // eou detection @@ -58,18 +60,26 @@ export const ATTR_TRANSCRIPT_CONFIDENCE = 'lk.transcript_confidence'; export const ATTR_TRANSCRIPTION_DELAY = 'lk.transcription_delay'; export const ATTR_END_OF_TURN_DELAY = 'lk.end_of_turn_delay'; +// Adaptive Interruption attributes +export const ATTR_IS_INTERRUPTION = 'lk.is_interruption'; +export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability'; +export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration'; +export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration'; +export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay'; + // metrics export const ATTR_LLM_METRICS = 'lk.llm_metrics'; export const ATTR_TTS_METRICS = 'lk.tts_metrics'; export const ATTR_REALTIME_MODEL_METRICS = 'lk.realtime_model_metrics'; -// latency span attributes +/** End-to-end latency in seconds. */ export const ATTR_E2E_LATENCY = 'lk.e2e_latency'; // OpenTelemetry GenAI attributes // OpenTelemetry specification: https://opentelemetry.io/docs/specs/semconv/registry/attributes/gen-ai/ export const ATTR_GEN_AI_OPERATION_NAME = 'gen_ai.operation.name'; export const ATTR_GEN_AI_REQUEST_MODEL = 'gen_ai.request.model'; +/** The provider name (e.g., 'openai', 'anthropic'). 
*/ export const ATTR_GEN_AI_PROVIDER_NAME = 'gen_ai.provider.name'; export const ATTR_GEN_AI_USAGE_INPUT_TOKENS = 'gen_ai.usage.input_tokens'; export const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = 'gen_ai.usage.output_tokens'; @@ -97,10 +107,3 @@ export const ATTR_EXCEPTION_MESSAGE = 'exception.message'; // Platform-specific attributes export const ATTR_LANGFUSE_COMPLETION_START_TIME = 'langfuse.observation.completion_start_time'; - -// Adaptive Interruption attributes -export const ATTR_IS_INTERRUPTION = 'lk.is_interruption'; -export const ATTR_INTERRUPTION_PROBABILITY = 'lk.interruption.probability'; -export const ATTR_INTERRUPTION_TOTAL_DURATION = 'lk.interruption.total_duration'; -export const ATTR_INTERRUPTION_PREDICTION_DURATION = 'lk.interruption.prediction_duration'; -export const ATTR_INTERRUPTION_DETECTION_DELAY = 'lk.interruption.detection_delay'; diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index 28ef4c746..8ee52e586 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -22,8 +22,9 @@ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; import FormData from 'form-data'; import { AccessToken } from 'livekit-server-sdk'; import fs from 'node:fs/promises'; -import type { ChatContent, ChatItem } from '../llm/index.js'; +import type { ChatContent, ChatItem, ChatRole } from '../llm/index.js'; import { enableOtelLogging } from '../log.js'; +import { filterZeroValues } from '../metrics/model_usage.js'; import type { SessionReport } from '../voice/report.js'; import { type SimpleLogRecord, SimpleOTLPHttpLogExporter } from './otel_http_exporter.js'; import { flushPinoLogs, initPinoCloudExporter } from './pino_otel_transport.js'; @@ -285,24 +286,80 @@ export async function flushOtelLogs(): Promise { await flushPinoLogs(); } +/** Proto-compatible role enum values. 
*/ +type ProtoRole = 'DEVELOPER' | 'SYSTEM' | 'USER' | 'ASSISTANT'; + +const ROLE_MAP: Record = { + developer: 'DEVELOPER', + system: 'SYSTEM', + user: 'USER', + assistant: 'ASSISTANT', +}; + +interface ProtoMetricsReport { + startedSpeakingAt?: string; + stoppedSpeakingAt?: string; + transcriptionDelay?: number; + endOfTurnDelay?: number; + onUserTurnCompletedDelay?: number; + llmNodeTtft?: number; + ttsNodeTtfb?: number; + e2eLatency?: number; +} + +interface ProtoMessage { + id: string; + role: ProtoRole; + content: { text: ChatContent }[]; + createdAt: string; + interrupted?: boolean; + extra?: Record; + transcriptConfidence?: number; + metrics?: ProtoMetricsReport; +} + +interface ProtoFunctionCall { + id: string; + callId: string; + arguments: string | Record; + name: string; + createdAt: string; +} + +interface ProtoFunctionCallOutput { + id: string; + name: string; + callId: string; + output: string; + isError: boolean; + createdAt: string; +} + +interface ProtoAgentHandoff { + id: string; + newAgentId: string; + createdAt: string; + oldAgentId?: string; +} + +interface ProtoChatItem { + message?: ProtoMessage; + functionCall?: ProtoFunctionCall; + functionCallOutput?: ProtoFunctionCallOutput; + agentHandoff?: ProtoAgentHandoff; +} + /** * Convert ChatItem to proto-compatible dictionary format. * TODO: Use actual agent_session proto types once @livekit/protocol v1.43.1+ is published */ -function chatItemToProto(item: ChatItem): Record { - const itemDict: Record = {}; +function chatItemToProto(item: ChatItem): ProtoChatItem { + const itemDict: ProtoChatItem = {}; if (item.type === 'message') { - const roleMap: Record = { - developer: 'DEVELOPER', - system: 'SYSTEM', - user: 'USER', - assistant: 'ASSISTANT', - }; - - const msg: Record = { + const msg: ProtoMessage = { id: item.id, - role: roleMap[item.role] || item.role.toUpperCase(), + role: ROLE_MAP[item.role] ?? 
(item.role.toUpperCase() as ProtoRole), content: item.content.map((c: ChatContent) => ({ text: c })), createdAt: toRFC3339(item.createdAt), }; @@ -311,44 +368,43 @@ function chatItemToProto(item: ChatItem): Record { msg.interrupted = item.interrupted; } - // TODO(brian): Add extra and transcriptConfidence to ChatMessage - // if (item.extra && Object.keys(item.extra).length > 0) { - // msg.extra = item.extra; - // } - - // if (item.transcriptConfidence !== undefined && item.transcriptConfidence !== null) { - // msg.transcriptConfidence = item.transcriptConfidence; - // } - - // TODO(brian): Add metrics to ChatMessage - // const metrics = item.metrics || {}; - // if (Object.keys(metrics).length > 0) { - // msg.metrics = {}; - // if (metrics.started_speaking_at) { - // msg.metrics.startedSpeakingAt = toRFC3339(metrics.started_speaking_at); - // } - // if (metrics.stopped_speaking_at) { - // msg.metrics.stoppedSpeakingAt = toRFC3339(metrics.stopped_speaking_at); - // } - // if (metrics.transcription_delay !== undefined) { - // msg.metrics.transcriptionDelay = metrics.transcription_delay; - // } - // if (metrics.end_of_turn_delay !== undefined) { - // msg.metrics.endOfTurnDelay = metrics.end_of_turn_delay; - // } - // if (metrics.on_user_turn_completed_delay !== undefined) { - // msg.metrics.onUserTurnCompletedDelay = metrics.on_user_turn_completed_delay; - // } - // if (metrics.llm_node_ttft !== undefined) { - // msg.metrics.llmNodeTtft = metrics.llm_node_ttft; - // } - // if (metrics.tts_node_ttfb !== undefined) { - // msg.metrics.ttsNodeTtfb = metrics.tts_node_ttfb; - // } - // if (metrics.e2e_latency !== undefined) { - // msg.metrics.e2eLatency = metrics.e2e_latency; - // } - // } + if (item.extra && Object.keys(item.extra).length > 0) { + msg.extra = item.extra; + } + + if (item.transcriptConfidence !== undefined) { + msg.transcriptConfidence = item.transcriptConfidence; + } + + const metrics = item.metrics; + if (metrics && Object.keys(metrics).length > 0) { + 
const protoMetrics: ProtoMetricsReport = {}; + if (metrics.startedSpeakingAt !== undefined) { + protoMetrics.startedSpeakingAt = toRFC3339(metrics.startedSpeakingAt * 1000); + } + if (metrics.stoppedSpeakingAt !== undefined) { + protoMetrics.stoppedSpeakingAt = toRFC3339(metrics.stoppedSpeakingAt * 1000); + } + if (metrics.transcriptionDelay !== undefined) { + protoMetrics.transcriptionDelay = metrics.transcriptionDelay; + } + if (metrics.endOfTurnDelay !== undefined) { + protoMetrics.endOfTurnDelay = metrics.endOfTurnDelay; + } + if (metrics.onUserTurnCompletedDelay !== undefined) { + protoMetrics.onUserTurnCompletedDelay = metrics.onUserTurnCompletedDelay; + } + if (metrics.llmNodeTtft !== undefined) { + protoMetrics.llmNodeTtft = metrics.llmNodeTtft; + } + if (metrics.ttsNodeTtfb !== undefined) { + protoMetrics.ttsNodeTtfb = metrics.ttsNodeTtfb; + } + if (metrics.e2eLatency !== undefined) { + protoMetrics.e2eLatency = metrics.e2eLatency; + } + msg.metrics = protoMetrics; + } itemDict.message = msg; } else if (item.type === 'function_call') { @@ -369,7 +425,7 @@ function chatItemToProto(item: ChatItem): Record { createdAt: toRFC3339(item.createdAt), }; } else if (item.type === 'agent_handoff') { - const handoff: Record = { + const handoff: ProtoAgentHandoff = { id: item.id, newAgentId: item.newAgentId, createdAt: toRFC3339(item.createdAt), @@ -397,9 +453,7 @@ function chatItemToProto(item: ChatItem): Record { } /** - * Convert timestamp to RFC3339 format matching Python's _to_rfc3339. - * Note: TypeScript createdAt is in milliseconds (Date.now()), not seconds like Python. 
- * @internal + * Convert timestamp to RFC3339 format */ function toRFC3339(valueMs: number | Date): string { // valueMs is already in milliseconds (from Date.now()) @@ -445,6 +499,8 @@ export async function uploadSessionReport(options: { 'logger.name': 'chat_history', }; + const usage = report.modelUsage?.map(filterZeroValues) || null; + logRecords.push({ body: 'session report', timestampMs: report.startedAt || report.timestamp || 0, @@ -453,6 +509,7 @@ export async function uploadSessionReport(options: { 'session.options': report.options || {}, 'session.report_timestamp': report.timestamp, agent_name: agentName, + usage, }, }); diff --git a/agents/src/tts/tts.ts b/agents/src/tts/tts.ts index 8b8dcfda0..ab0477144 100644 --- a/agents/src/tts/tts.ts +++ b/agents/src/tts/tts.ts @@ -96,6 +96,30 @@ export abstract class TTS extends (EventEmitter as new () => TypedEmitter; #ttsRequestSpan?: Span; + #inputTokens = 0; + #outputTokens = 0; constructor(tts: TTS, connOptions: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS) { this.#tts = tts; @@ -284,6 +310,18 @@ export abstract class SynthesizeStream } } + /** + * Set token usage for token-based TTS billing (e.g., OpenAI TTS). + * Plugins should call this method to report token usage. 
+ */ + protected setTokenUsage({ + inputTokens = 0, + outputTokens = 0, + }: { inputTokens?: number; outputTokens?: number } = {}): void { + this.#inputTokens = inputTokens; + this.#outputTokens = outputTokens; + } + protected async monitorMetrics() { const startTime = process.hrtime.bigint(); let audioDurationMs = 0; @@ -305,12 +343,22 @@ export abstract class SynthesizeStream audioDurationMs: roundedAudioDurationMs, cancelled: this.abortController.signal.aborted, label: this.#tts.label, - streamed: false, + inputTokens: this.#inputTokens, + outputTokens: this.#outputTokens, + streamed: true, + metadata: { + modelProvider: this.#tts.provider, + modelName: this.#tts.model, + }, }; if (this.#ttsRequestSpan) { this.#ttsRequestSpan.setAttribute(traceTypes.ATTR_TTS_METRICS, JSON.stringify(metrics)); } this.#tts.emit('metrics_collected', metrics); + + // Reset token usage after emitting metrics for the next segment + this.#inputTokens = 0; + this.#outputTokens = 0; } }; @@ -434,6 +482,8 @@ export abstract class ChunkedStream implements AsyncIterableIterator { const hostname = url.hostname; return hostname.endsWith('.livekit.cloud') || hostname.endsWith('.livekit.run'); }; + +/** + * Whether the agent is running in development mode (launched via `dev` or `connect`). + */ +export const isDevMode = (): boolean => { + return process.env.LIVEKIT_DEV_MODE === '1'; +}; + +/** + * Whether the agent is hosted on LiveKit Cloud. 
+ */ +export const isHosted = (): boolean => { + return process.env.LIVEKIT_REMOTE_EOT_URL !== undefined; +}; diff --git a/agents/src/voice/agent.test.ts b/agents/src/voice/agent.test.ts index fd5f39183..8dd83fee3 100644 --- a/agents/src/voice/agent.test.ts +++ b/agents/src/voice/agent.test.ts @@ -7,7 +7,11 @@ import { tool } from '../llm/index.js'; import { initializeLogger } from '../log.js'; import { Task } from '../utils.js'; import { Agent, AgentTask, _setActivityTaskInfo } from './agent.js'; -import { agentActivityStorage } from './agent_activity.js'; +import { AgentActivity, agentActivityStorage } from './agent_activity.js'; +import { defaultEndpointingOptions } from './turn_config/endpointing.js'; +import { defaultInterruptionOptions } from './turn_config/interruption.js'; + +vi.mock('ofetch', () => ({ ofetch: vi.fn() })); initializeLogger({ pretty: false, level: 'error' }); @@ -215,4 +219,207 @@ describe('Agent', () => { await expect(wrapper.result).resolves.toBe('ok'); expect(closeOldActivity).toHaveBeenCalledTimes(1); }); + + describe('Agent constructor option migration', () => { + it('should set allowInterruptions to false via deprecated constructor field', () => { + const agent = new Agent({ instructions: 'test', allowInterruptions: false }); + expect(agent.turnHandling?.interruption?.enabled).toBe(false); + }); + + it('should not set derived properties when no compatibility fields are provided', () => { + const agent = new Agent({ instructions: 'test' }); + expect(agent.turnHandling).toBeUndefined(); + }); + + it('should expose minConsecutiveSpeechDelay', () => { + const agent = new Agent({ instructions: 'test', minConsecutiveSpeechDelay: 1.5 }); + expect(agent.minConsecutiveSpeechDelay).toBe(1.5); + }); + + it('should ignore deprecated constructor fields when turnHandling is provided', () => { + const agent = new Agent({ + instructions: 'test', + turnHandling: { + endpointing: { minDelay: 999 }, + interruption: {}, + turnDetection: 'vad', + }, + 
allowInterruptions: false, + }); + expect(agent.turnHandling?.endpointing?.minDelay).toBe(999); + expect(agent.turnHandling?.endpointing?.maxDelay).toBeUndefined(); + expect(agent.turnHandling?.interruption?.enabled).toBeUndefined(); + expect(agent.turnHandling?.turnDetection).toBe('vad'); + }); + + it('should let turnHandling override deprecated constructor fields on conflicts', () => { + const agent = new Agent({ + instructions: 'test', + turnHandling: { + endpointing: { minDelay: 999, maxDelay: 4000 }, + interruption: { enabled: true }, + turnDetection: 'vad', + }, + allowInterruptions: false, + turnDetection: 'stt', + }); + expect(agent.turnHandling?.endpointing?.minDelay).toBe(999); + expect(agent.turnHandling?.endpointing?.maxDelay).toBe(4000); + expect(agent.turnHandling?.interruption?.enabled).toBe(true); + expect(agent.turnHandling?.turnDetection).toBe('vad'); + }); + + it('should set interruptionDetection from turnHandling.interruption.mode', () => { + const agent = new Agent({ + instructions: 'test', + turnHandling: { + interruption: { mode: 'adaptive' }, + endpointing: {}, + turnDetection: undefined, + }, + }); + expect(agent.turnHandling?.interruption?.mode).toBe('adaptive'); + }); + + it('should let AgentActivity prefer agent-level overrides over session defaults', () => { + const agent = new Agent({ + instructions: 'test', + turnHandling: { + endpointing: { minDelay: 111, maxDelay: 222 }, + interruption: { enabled: false }, + turnDetection: 'manual', + }, + }); + const session = { + options: { + turnHandling: { + endpointing: defaultEndpointingOptions, + interruption: defaultInterruptionOptions, + }, + }, + turnDetection: 'stt', + useTtsAlignedTranscript: true, + vad: undefined, + stt: undefined, + llm: undefined, + tts: undefined, + interruptionDetection: undefined, + } as any; + + const activity = new AgentActivity(agent as any, session); + + expect(activity.allowInterruptions).toBe(false); + expect(activity.turnDetection).toBe('manual'); + 
expect(activity.turnHandling.endpointing?.minDelay).toBe(111); + expect(activity.turnHandling.endpointing?.maxDelay).toBe(222); + }); + + it('should disable adaptive interruption detection in default mode when prerequisites are missing', () => { + const previousRemoteEotUrl = process.env.LIVEKIT_REMOTE_EOT_URL; + process.env.LIVEKIT_REMOTE_EOT_URL = 'http://localhost:9999'; + + try { + const agent = new Agent({ instructions: 'test' }); + const session = { + options: { + turnHandling: { + endpointing: defaultEndpointingOptions, + interruption: defaultInterruptionOptions, + }, + }, + sessionOptions: { + turnHandling: { + endpointing: defaultEndpointingOptions, + interruption: defaultInterruptionOptions, + }, + }, + turnDetection: 'manual', + useTtsAlignedTranscript: true, + vad: {}, + stt: { + capabilities: { + alignedTranscript: true, + streaming: true, + }, + }, + llm: undefined, + tts: undefined, + interruptionDetection: undefined, + } as any; + + const activity = new AgentActivity(agent as any, session); + expect((activity as any).interruptionDetector).toBeUndefined(); + } finally { + if (previousRemoteEotUrl === undefined) { + delete process.env.LIVEKIT_REMOTE_EOT_URL; + } else { + process.env.LIVEKIT_REMOTE_EOT_URL = previousRemoteEotUrl; + } + } + }); + + it('should warn when session explicitly requests adaptive detection even if agent overrides it', () => { + const activity = Object.create(AgentActivity.prototype) as any; + activity.agent = { + turnHandling: { interruption: { mode: 'vad' } }, + turnDetection: undefined, + }; + activity.agentSession = { + interruptionDetection: 'adaptive', + turnDetection: 'manual', + }; + activity.logger = { warn: vi.fn() }; + + expect(activity.resolveInterruptionDetector()).toBeUndefined(); + expect(activity.logger.warn).toHaveBeenCalledWith( + "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled", + ); + }); + + it('should disable adaptive interruption detection 
when interruptions are disabled', () => { + const previousRemoteEotUrl = process.env.LIVEKIT_REMOTE_EOT_URL; + process.env.LIVEKIT_REMOTE_EOT_URL = 'http://localhost:9999'; + + try { + const activity = Object.create(AgentActivity.prototype) as any; + activity.agent = { + turnHandling: { + interruption: { enabled: false }, + }, + turnDetection: undefined, + stt: undefined, + vad: undefined, + llm: undefined, + }; + activity.agentSession = { + interruptionDetection: undefined, + turnDetection: 'stt', + sessionOptions: { + turnHandling: { + interruption: defaultInterruptionOptions, + endpointing: defaultEndpointingOptions, + }, + }, + stt: { + capabilities: { + alignedTranscript: true, + streaming: true, + }, + }, + vad: {}, + llm: undefined, + }; + activity.logger = { warn: vi.fn() }; + + expect(activity.resolveInterruptionDetector()).toBeUndefined(); + expect(activity.logger.warn).not.toHaveBeenCalled(); + } finally { + if (previousRemoteEotUrl === undefined) { + delete process.env.LIVEKIT_REMOTE_EOT_URL; + } else { + process.env.LIVEKIT_REMOTE_EOT_URL = previousRemoteEotUrl; + } + } + }); + }); }); diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index db937ecf7..3f83aee32 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -35,6 +35,8 @@ import { type AgentActivity, agentActivityStorage } from './agent_activity.js'; import type { AgentSession, TurnDetectionMode } from './agent_session.js'; import type { TimedString } from './io.js'; import type { SpeechHandle } from './speech_handle.js'; +import type { TurnHandlingOptions } from './turn_config/turn_handling.js'; +import { migrateTurnHandling } from './turn_config/utils.js'; export const functionCallStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>(); export const speechHandleStorage = new AsyncLocalStorage(); @@ -110,23 +112,28 @@ export interface AgentOptions { instructions: string; chatCtx?: ChatContext; tools?: ToolContext; - turnDetection?: 
TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; - allowInterruptions?: boolean; + turnHandling?: TurnHandlingOptions; minConsecutiveSpeechDelay?: number; useTtsAlignedTranscript?: boolean; + /** @deprecated use turnHandling.turnDetection instead */ + turnDetection?: TurnDetectionMode; + /** @deprecated use turnHandling.interruption.enabled instead */ + allowInterruptions?: boolean; } export class Agent { private _id: string; - private turnDetection?: TurnDetectionMode; private _stt?: STT; private _vad?: VAD; private _llm?: LLM | RealtimeModel; private _tts?: TTS; + private _turnHandling?: Partial; + + private _minConsecutiveSpeechDelay?: number; private _useTtsAlignedTranscript?: boolean; /** @internal */ @@ -151,12 +158,14 @@ export class Agent { vad, llm, tts, + allowInterruptions, + turnHandling, + minConsecutiveSpeechDelay, useTtsAlignedTranscript, }: AgentOptions) { if (id) { this._id = id; } else { - // Convert class name to snake_case const className = this.constructor.name; if (className === 'Agent') { this._id = 'default_agent'; @@ -176,7 +185,14 @@ export class Agent { }) : ChatContext.empty(); - this.turnDetection = turnDetection; + const resolvedTurnHandling = migrateTurnHandling({ + turnDetection, + allowInterruptions, + turnHandling, + }); + this._turnHandling = + Object.keys(resolvedTurnHandling).length > 0 ? 
resolvedTurnHandling : undefined; + this._vad = vad; if (typeof stt === 'string') { @@ -197,6 +213,7 @@ export class Agent { this._tts = tts; } + this._minConsecutiveSpeechDelay = minConsecutiveSpeechDelay; this._useTtsAlignedTranscript = useTtsAlignedTranscript; this._agentActivity = undefined; @@ -242,6 +259,14 @@ export class Agent { return this.getActivityOrThrow().agentSession as AgentSession; } + get turnHandling(): Partial | undefined { + return this._turnHandling; + } + + get minConsecutiveSpeechDelay(): number | undefined { + return this._minConsecutiveSpeechDelay; + } + async onEnter(): Promise {} async onExit(): Promise {} @@ -341,7 +366,8 @@ export class Agent { // Set startTimeOffset to provide linear timestamps across reconnections const audioInputStartedAt = - activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available + activity.inputStartedAt ?? // Use input started at proxied from AudioRecognition if available + activity.agentSession._recorderIO?.recordingStartedAt ?? // Fallback to recording start time if available activity.agentSession._startedAt ?? 
// Fallback to session start time Date.now(); // Fallback to current time diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 7392b536d..8b0fa6b26 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -8,7 +8,10 @@ import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api' import { Heap } from 'heap-js'; import { AsyncLocalStorage } from 'node:async_hooks'; import { ReadableStream, TransformStream } from 'node:stream/web'; -import { type ChatContext, ChatMessage } from '../llm/chat_context.js'; +import type { InterruptionDetectionError } from '../inference/interruption/errors.js'; +import { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; +import type { OverlappingSpeechEvent } from '../inference/interruption/types.js'; +import { type ChatContext, ChatMessage, type MetricsReport } from '../llm/chat_context.js'; import { type ChatItem, type FunctionCall, @@ -30,6 +33,7 @@ import { isSameToolChoice, isSameToolContext } from '../llm/tool_context.js'; import { log } from '../log.js'; import type { EOUMetrics, + InterruptionMetrics, LLMMetrics, RealtimeModelMetrics, STTMetrics, @@ -41,7 +45,7 @@ import { STT, type STTError, type SpeechEvent } from '../stt/stt.js'; import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js'; import { splitWords } from '../tokenize/basic/word.js'; import { TTS, type TTSError } from '../tts/tts.js'; -import { Future, Task, cancelAndWait, waitFor } from '../utils.js'; +import { Future, Task, cancelAndWait, isDevMode, isHosted, waitFor } from '../utils.js'; import { VAD, type VADEvent } from '../vad.js'; import type { Agent, ModelSettings } from './agent.js'; import { @@ -57,7 +61,6 @@ import { type EndOfTurnInfo, type PreemptiveGenerationInfo, type RecognitionHooks, - type _TurnDetector, } from './audio_recognition.js'; import { AgentSessionEventTypes, @@ -101,6 +104,7 @@ interface 
PreemptiveGeneration { createdAt: number; } +// TODO add false interruption handling and barge in handling for https://github.com/livekit/agents/pull/3109/changes export class AgentActivity implements RecognitionHooks { agent: Agent; agentSession: AgentSession; @@ -111,7 +115,7 @@ export class AgentActivity implements RecognitionHooks { private audioRecognition?: AudioRecognition; private realtimeSession?: RealtimeSession; private realtimeSpans?: Map; // Maps response_id to OTEL span for metrics recording - private turnDetectionMode?: Exclude; + private turnDetectionMode?: TurnDetectionMode; private logger = log(); private _schedulingPaused = true; private _drainBlockedTasks: Task[] = []; @@ -126,6 +130,51 @@ export class AgentActivity implements RecognitionHooks { // default to null as None, which maps to the default provider tool choice value private toolChoice: ToolChoice | null = null; private _preemptiveGeneration?: PreemptiveGeneration; + private interruptionDetector?: AdaptiveInterruptionDetector; + private isInterruptionDetectionEnabled: boolean; + private isInterruptionByAudioActivityEnabled: boolean; + private isDefaultInterruptionByAudioActivityEnabled: boolean; + + private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent): void => + this.onGenerationCreated(ev); + + private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent): void => + this.onInputSpeechStarted(ev); + + private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent): void => + this.onInputSpeechStopped(ev); + + private readonly onRealtimeInputAudioTranscriptionCompleted = ( + ev: InputTranscriptionCompleted, + ): void => this.onInputAudioTranscriptionCompleted(ev); + + private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError): void => + this.onError(ev); + + private readonly onInterruptionOverlappingSpeech = (ev: OverlappingSpeechEvent): void => { + 
this.agentSession.emit(AgentSessionEventTypes.OverlappingSpeech, ev); + }; + + private readonly onInterruptionMetricsCollected = (ev: InterruptionMetrics): void => { + this.agentSession._usageCollector.collect(ev); + this.agentSession.emit( + AgentSessionEventTypes.MetricsCollected, + createMetricsCollectedEvent({ metrics: ev }), + ); + }; + + private readonly onInterruptionError = (ev: InterruptionDetectionError): void => { + const errorEvent = createErrorEvent(ev, this.interruptionDetector); + this.agentSession.emit(AgentSessionEventTypes.Error, errorEvent); + + if (!ev.recoverable) { + this.agentSession._onError(ev); + this.fallbackToVadInterruption(); + return; + } + + this.agentSession._onError(ev); + }; /** @internal */ _mainTask?: Task; @@ -133,16 +182,6 @@ export class AgentActivity implements RecognitionHooks { _onExitTask?: Task; _userTurnCompletedTask?: Task; - private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) => - this.onGenerationCreated(ev); - private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) => - this.onInputSpeechStarted(ev); - private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) => - this.onInputSpeechStopped(ev); - private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) => - this.onInputAudioTranscriptionCompleted(ev); - private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) => - this.onError(ev); constructor(agent: Agent, agentSession: AgentSession) { this.agent = agent; this.agentSession = agentSession; @@ -235,6 +274,16 @@ export class AgentActivity implements RecognitionHooks { 'for more responsive interruption handling.', ); } + + this.interruptionDetector = this.resolveInterruptionDetector(); + this.isInterruptionDetectionEnabled = !!this.interruptionDetector; + + // this allows taking over audio interruption temporarily until interruption is detected + // by default it is true unless
turnDetection is manual or realtime_llm + this.isInterruptionByAudioActivityEnabled = + this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm'; + + this.isDefaultInterruptionByAudioActivityEnabled = this.isInterruptionByAudioActivityEnabled; } async start(): Promise { @@ -348,8 +397,13 @@ export class AgentActivity implements RecognitionHooks { vad: this.vad, turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, turnDetectionMode: this.turnDetectionMode, - minEndpointingDelay: this.agentSession.options.minEndpointingDelay, - maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, + interruptionDetection: this.interruptionDetector, + minEndpointingDelay: + this.agent.turnHandling?.endpointing?.minDelay ?? + this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, + maxEndpointingDelay: + this.agent.turnHandling?.endpointing?.maxDelay ?? + this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -422,8 +476,10 @@ export class AgentActivity implements RecognitionHooks { } get allowInterruptions(): boolean { - // TODO(AJS-51): Allow options to be defined in Agent class - return this.agentSession.options.allowInterruptions; + return ( + this.agent.turnHandling?.interruption?.enabled ?? + this.agentSession.sessionOptions.turnHandling.interruption.enabled + ); } get useTtsAlignedTranscript(): boolean { @@ -432,14 +488,36 @@ export class AgentActivity implements RecognitionHooks { } get turnDetection(): TurnDetectionMode | undefined { - // TODO(brian): prioritize using agent.turn_detection - return this.agentSession.turnDetection; + return this.agent.turnHandling?.turnDetection ?? this.agentSession.turnDetection; + } + + get turnHandling() { + return this.agent.turnHandling ?? 
this.agentSession.sessionOptions.turnHandling; } + // get minEndpointingDelay(): number { + // return ( + // this.agent.turnHandling?.endpointing?.minDelay ?? + // this.agentSession.sessionOptions.turnHandling.endpointing.minDelay + // ); + // } + + // get maxEndpointingDelay(): number { + // return ( + // this.agent.turnHandling?.endpointing?.maxDelay ?? + // this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay + // ); + // } + get toolCtx(): ToolContext { return this.agent.toolCtx; } + /** @internal */ + get inputStartedAt() { + return this.audioRecognition?.inputStartedAt; + } + async updateChatCtx(chatCtx: ChatContext): Promise { chatCtx = chatCtx.copy({ toolCtx: this.toolCtx }); @@ -471,7 +549,13 @@ export class AgentActivity implements RecognitionHooks { } } - updateOptions({ toolChoice }: { toolChoice?: ToolChoice | null }): void { + updateOptions({ + toolChoice, + turnDetection, + }: { + toolChoice?: ToolChoice | null; + turnDetection?: TurnDetectionMode; + }): void { if (toolChoice !== undefined) { this.toolChoice = toolChoice; } @@ -479,6 +563,22 @@ export class AgentActivity implements RecognitionHooks { if (this.realtimeSession) { this.realtimeSession.updateOptions({ toolChoice: this.toolChoice }); } + + if (turnDetection !== undefined) { + this.turnDetectionMode = turnDetection; + this.isDefaultInterruptionByAudioActivityEnabled = + this.turnDetectionMode !== 'manual' && this.turnDetectionMode !== 'realtime_llm'; + + // sync live flag immediately when not speaking so the change takes effect right away + if (this.agentSession.agentState !== 'speaking') { + this.isInterruptionByAudioActivityEnabled = + this.isDefaultInterruptionByAudioActivityEnabled; + } + } + + if (this.audioRecognition) { + this.audioRecognition.updateOptions({ turnDetection: this.turnDetectionMode }); + } } attachAudioInput(audioStream: ReadableStream): void { @@ -629,6 +729,8 @@ export class AgentActivity implements RecognitionHooks { } } + 
this.agentSession._usageCollector.collect(ev); + this.agentSession.emit( AgentSessionEventTypes.MetricsCollected, createMetricsCollectedEvent({ metrics: ev }), @@ -660,6 +762,13 @@ export class AgentActivity implements RecognitionHooks { if (!this.vad) { this.agentSession._updateUserState('speaking'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfOverlapSpeech( + 0, + Date.now(), + this.agentSession._userSpeakingSpan, + ); + } } // this.interrupt() is going to raise when allow_interruptions is False, @@ -678,6 +787,9 @@ export class AgentActivity implements RecognitionHooks { this.logger.info(ev, 'onInputSpeechStopped'); if (!this.vad) { + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfOverlapSpeech(Date.now(), this.agentSession._userSpeakingSpan); + } this.agentSession._updateUserState('listening'); } @@ -751,17 +863,40 @@ export class AgentActivity implements RecognitionHooks { onStartOfSpeech(ev: VADEvent): void { let speechStartTime = Date.now(); if (ev) { - speechStartTime = speechStartTime - ev.speechDuration; + // Subtract both speechDuration and inferenceDuration to correct for VAD model latency. + speechStartTime = speechStartTime - ev.speechDuration - ev.inferenceDuration; + } + this.agentSession._updateUserState('speaking', { + lastSpeakingTime: speechStartTime, + otelContext: otelContext.active(), + }); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + // Pass speechStartTime as the absolute startedAt timestamp. 
+ this.audioRecognition.onStartOfOverlapSpeech( + ev.speechDuration, + speechStartTime, + this.agentSession._userSpeakingSpan, + ); } - this.agentSession._updateUserState('speaking', speechStartTime); } onEndOfSpeech(ev: VADEvent): void { let speechEndTime = Date.now(); if (ev) { - speechEndTime = speechEndTime - ev.silenceDuration; + // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency. + speechEndTime = speechEndTime - ev.silenceDuration - ev.inferenceDuration; + } + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + // Pass speechEndTime as the absolute endedAt timestamp. + this.audioRecognition.onEndOfOverlapSpeech( + speechEndTime, + this.agentSession._userSpeakingSpan, + ); } - this.agentSession._updateUserState('listening', speechEndTime); + this.agentSession._updateUserState('listening', { + lastSpeakingTime: speechEndTime, + otelContext: otelContext.active(), + }); } onVADInferenceDone(ev: VADEvent): void { @@ -770,12 +905,18 @@ export class AgentActivity implements RecognitionHooks { return; } - if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) { + if ( + ev.speechDuration >= this.agentSession.sessionOptions.turnHandling.interruption?.minDuration + ) { this.interruptByAudioActivity(); } } private interruptByAudioActivity(): void { + if (!this.isInterruptionByAudioActivityEnabled) { + return; + } + if (this.agentSession._aecWarmupRemaining > 0) { // Disable interruption from audio activity while AEC warmup is active. 
return; @@ -790,7 +931,11 @@ export class AgentActivity implements RecognitionHooks { // - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0 // - Apply check to all STT results: empty string, undefined, or any length // - This ensures consistent behavior across all interruption scenarios - if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) { + if ( + this.stt && + this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 && + this.audioRecognition + ) { const text = this.audioRecognition.currentTranscript; // TODO(shubhra): better word splitting for multi-language @@ -800,7 +945,7 @@ export class AgentActivity implements RecognitionHooks { // Only allow interruption if word count meets or exceeds minInterruptionWords // This applies to all cases: empty strings, partial speech, and full speech - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) { return; } } @@ -821,6 +966,14 @@ export class AgentActivity implements RecognitionHooks { } } + onInterruption(ev: OverlappingSpeechEvent) { + this.restoreInterruptionByAudioActivity(); + this.interruptByAudioActivity(); + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(ev.overlapStartedAt || ev.detectedAt); + } + } + onInterimTranscript(ev: SpeechEvent): void { if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) { // skip stt transcription if userTranscription is enabled on the realtime model @@ -875,7 +1028,7 @@ export class AgentActivity implements RecognitionHooks { onPreemptiveGeneration(info: PreemptiveGenerationInfo): void { if ( - !this.agentSession.options.preemptiveGeneration || + !this.agentSession.sessionOptions.preemptiveGeneration || this.schedulingPaused || (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) || !(this.llm instanceof 
LLM) @@ -896,6 +1049,7 @@ export class AgentActivity implements RecognitionHooks { const userMessage = ChatMessage.create({ role: 'user', content: info.newTranscript, + transcriptConfidence: info.transcriptConfidence, }); const chatCtx = this.agent.chatCtx.copy(); const speechHandle = this.generateReply({ @@ -991,16 +1145,17 @@ export class AgentActivity implements RecognitionHooks { this._currentSpeech && this._currentSpeech.allowInterruptions && !this._currentSpeech.interrupted && - this.agentSession.options.minInterruptionWords > 0 + this.agentSession.sessionOptions.turnHandling.interruption?.minWords > 0 ) { const wordCount = splitWords(info.newTranscript, true).length; - if (wordCount < this.agentSession.options.minInterruptionWords) { + if (wordCount < this.agentSession.sessionOptions.turnHandling.interruption?.minWords) { // avoid interruption if the new_transcript contains fewer words than minInterruptionWords this.cancelPreemptiveGeneration(); this.logger.info( { wordCount, - minInterruptionWords: this.agentSession.options.minInterruptionWords, + minInterruptionWords: + this.agentSession.sessionOptions.turnHandling.interruption.minWords, }, 'skipping user input, word count below minimum interruption threshold', ); @@ -1325,6 +1480,7 @@ export class AgentActivity implements RecognitionHooks { let userMessage: ChatMessage | undefined = ChatMessage.create({ role: 'user', content: info.newTranscript, + transcriptConfidence: info.transcriptConfidence, }); // create a temporary mutable chat context to pass to onUserTurnCompleted @@ -1351,6 +1507,24 @@ export class AgentActivity implements RecognitionHooks { return; } + const userMetricsReport: MetricsReport = {}; + if (info.startedSpeakingAt !== undefined) { + userMetricsReport.startedSpeakingAt = info.startedSpeakingAt / 1000; // ms -> seconds + } + if (info.stoppedSpeakingAt !== undefined) { + userMetricsReport.stoppedSpeakingAt = info.stoppedSpeakingAt / 1000; // ms -> seconds + } + if 
(info.transcriptionDelay !== undefined) { + userMetricsReport.transcriptionDelay = info.transcriptionDelay / 1000; // ms -> seconds + } + if (info.endOfUtteranceDelay !== undefined) { + userMetricsReport.endOfTurnDelay = info.endOfUtteranceDelay / 1000; // ms -> seconds + } + userMetricsReport.onUserTurnCompletedDelay = callbackDuration / 1000; // ms -> seconds + if (userMessage) { + userMessage.metrics = userMetricsReport; + } + let speechHandle: SpeechHandle | undefined; if (this._preemptiveGeneration !== undefined) { const preemptive = this._preemptiveGeneration; @@ -1363,6 +1537,14 @@ export class AgentActivity implements RecognitionHooks { isSameToolChoice(preemptive.toolChoice, this.toolChoice) ) { speechHandle = preemptive.speechHandle; + // The preemptive userMessage was created without metrics. + // Copy the metrics and transcriptConfidence from the new userMessage + // to the preemptive message BEFORE scheduling (so the pipeline inserts + // the message with metrics already set). + if (preemptive.userMessage && userMessage) { + preemptive.userMessage.metrics = userMetricsReport; + preemptive.userMessage.transcriptConfidence = userMessage.transcriptConfidence; + } this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL); this.logger.debug( { @@ -1456,11 +1638,19 @@ export class AgentActivity implements RecognitionHooks { tasks.push(textForwardTask); } + let replyStartedSpeakingAt: number | undefined; + let replyTtsGenData: _TTSGenerationData | null = null; + const onFirstFrame = (startedSpeakingAt?: number) => { + replyStartedSpeakingAt = startedSpeakingAt ?? 
Date.now(); this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(); + this.isInterruptionByAudioActivityEnabled = false; + } }; if (!audioOutput) { @@ -1478,8 +1668,11 @@ export class AgentActivity implements RecognitionHooks { audioSource, modelSettings, replyAbortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); + replyTtsGenData = ttsGenData; const [forwardTask, _audioOut] = performAudioForwarding( ttsGenData.audioStream, @@ -1519,10 +1712,21 @@ export class AgentActivity implements RecognitionHooks { } if (addToChatCtx) { + const replyStoppedSpeakingAt = Date.now(); + const replyAssistantMetrics: MetricsReport = {}; + if (replyTtsGenData?.ttfb !== undefined) { + replyAssistantMetrics.ttsNodeTtfb = replyTtsGenData.ttfb; + } + if (replyStartedSpeakingAt !== undefined) { + replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds + replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds + } + const message = ChatMessage.create({ role: 'assistant', content: textOut?.text || '', interrupted: speechHandle.interrupted, + metrics: replyAssistantMetrics, }); this.agent._chatCtx.insert(message); this.agentSession._conversationItemAdded(message); @@ -1530,6 +1734,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + this.restoreInterruptionByAudioActivity(); } } @@ -1543,6 +1751,7 @@ export class AgentActivity implements RecognitionHooks { newMessage, toolsMessages, span, + _previousUserMetrics, }: { speechHandle: SpeechHandle; chatCtx: ChatContext; @@ 
-1553,6 +1762,7 @@ export class AgentActivity implements RecognitionHooks { newMessage?: ChatMessage; toolsMessages?: ChatItem[]; span: Span; + _previousUserMetrics?: MetricsReport; }): Promise => { speechHandle._agentTurnContext = otelContext.active(); @@ -1605,6 +1815,8 @@ export class AgentActivity implements RecognitionHooks { toolCtx, modelSettings, replyAbortController, + this.llm?.model, + this.llm?.provider, ); tasks.push(llmTask); @@ -1621,6 +1833,8 @@ export class AgentActivity implements RecognitionHooks { ttsTextInput, modelSettings, replyAbortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); } else { @@ -1630,10 +1844,12 @@ export class AgentActivity implements RecognitionHooks { await speechHandle.waitIfNotInterrupted([speechHandle._waitForScheduled()]); + let userMetrics: MetricsReport | undefined = _previousUserMetrics; // Add new message to actual chat context if the speech is scheduled if (newMessage && speechHandle.scheduled) { this.agent._chatCtx.insert(newMessage); this.agentSession._conversationItemAdded(newMessage); + userMetrics = newMessage.metrics; } if (speechHandle.interrupted) { @@ -1679,11 +1895,17 @@ export class AgentActivity implements RecognitionHooks { textOut = _textOut; } + let agentStartedSpeakingAt: number | undefined; const onFirstFrame = (startedSpeakingAt?: number) => { + agentStartedSpeakingAt = startedSpeakingAt ?? 
Date.now(); this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(); + this.isInterruptionByAudioActivityEnabled = false; + } }; let audioOut: _AudioOut | null = null; @@ -1740,6 +1962,29 @@ export class AgentActivity implements RecognitionHooks { await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]); } + const agentStoppedSpeakingAt = Date.now(); + const assistantMetrics: MetricsReport = {}; + + if (llmGenData.ttft !== undefined) { + assistantMetrics.llmNodeTtft = llmGenData.ttft; // already in seconds + } + if (ttsGenData?.ttfb !== undefined) { + assistantMetrics.ttsNodeTtfb = ttsGenData.ttfb; // already in seconds + } + if (agentStartedSpeakingAt !== undefined) { + assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds + assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds + + if (userMetrics?.stoppedSpeakingAt !== undefined) { + const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt; + assistantMetrics.e2eLatency = e2eLatency; + span.setAttribute(traceTypes.ATTR_E2E_LATENCY, e2eLatency); + } + } + + span.setAttribute(traceTypes.ATTR_SPEECH_INTERRUPTED, speechHandle.interrupted); + let hasSpeechMessage = false; + // add the tools messages that triggers this reply to the chat context if (toolsMessages) { for (const msg of toolsMessages) { @@ -1792,45 +2037,54 @@ export class AgentActivity implements RecognitionHooks { } if (forwardedText) { + hasSpeechMessage = true; const message = ChatMessage.create({ role: 'assistant', content: forwardedText, id: llmGenData.id, interrupted: true, createdAt: replyStartedAt, + metrics: assistantMetrics, }); chatCtx.insert(message); this.agent._chatCtx.insert(message); speechHandle._itemAdded([message]); 
this.agentSession._conversationItemAdded(message); + span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, forwardedText); } if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + this.restoreInterruptionByAudioActivity(); + } } this.logger.info( { speech_id: speechHandle.id, message: forwardedText }, 'playout completed with interrupt', ); - // TODO(shubhra) add chat message to speech handle speechHandle._markGenerationDone(); await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT); return; } if (textOut && textOut.text) { + hasSpeechMessage = true; const message = ChatMessage.create({ role: 'assistant', id: llmGenData.id, interrupted: false, createdAt: replyStartedAt, content: textOut.text, + metrics: assistantMetrics, }); chatCtx.insert(message); this.agent._chatCtx.insert(message); speechHandle._itemAdded([message]); this.agentSession._conversationItemAdded(message); + span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, textOut.text); this.logger.info( { speech_id: speechHandle.id, message: textOut.text }, 'playout completed without interruption', @@ -1841,6 +2095,12 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + this.restoreInterruptionByAudioActivity(); + } + } } // mark the playout done before waiting for the tool execution @@ -1850,7 +2110,7 @@ export class AgentActivity implements RecognitionHooks { if (toolOutput.output.length === 0) return; // important: no agent output should be used after this point - const { maxToolSteps } = this.agentSession.options; + const { 
maxToolSteps } = this.agentSession.sessionOptions; if (speechHandle.numSteps >= maxToolSteps) { this.logger.warn( { speech_id: speechHandle.id, max_tool_steps: maxToolSteps }, @@ -1900,6 +2160,7 @@ export class AgentActivity implements RecognitionHooks { instructions, undefined, toolMessages, + hasSpeechMessage ? undefined : userMetrics, ), ownedSpeechHandle: speechHandle, name: 'AgentActivity.pipelineReply', @@ -1933,6 +2194,7 @@ export class AgentActivity implements RecognitionHooks { instructions?: string, newMessage?: ChatMessage, toolsMessages?: ChatItem[], + _previousUserMetrics?: MetricsReport, ): Promise => tracer.startActiveSpan( async (span) => @@ -1946,6 +2208,7 @@ export class AgentActivity implements RecognitionHooks { newMessage, toolsMessages, span, + _previousUserMetrics, }), { name: 'agent_turn', @@ -2096,6 +2359,8 @@ export class AgentActivity implements RecognitionHooks { ttsTextInput, modelSettings, abortController, + this.tts?.model, + this.tts?.provider, ); tasks.push(ttsTask); realtimeAudioResult = ttsGenData.audioStream; @@ -2312,7 +2577,7 @@ export class AgentActivity implements RecognitionHooks { } // important: no agent ouput should be used after this point - const { maxToolSteps } = this.agentSession.options; + const { maxToolSteps } = this.agentSession.sessionOptions; if (speechHandle.numSteps >= maxToolSteps) { this.logger.warn( { speech_id: speechHandle.id, max_tool_steps: maxToolSteps }, @@ -2612,6 +2877,11 @@ export class AgentActivity implements RecognitionHooks { if (this._mainTask) { await this._mainTask.cancelAndWait(); } + if (this.interruptionDetector) { + this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech); + this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected); + this.interruptionDetector.off('error', this.onInterruptionError); + } this.agent._agentActivity = undefined; } finally { @@ -2619,6 +2889,95 @@ export class AgentActivity implements 
RecognitionHooks { } } + private resolveInterruptionDetector(): AdaptiveInterruptionDetector | undefined { + const agentInterruptionDetection = this.agent.turnHandling?.interruption?.mode; + const sessionInterruptionDetection = this.agentSession.interruptionDetection; + if ( + !( + this.stt && + this.stt.capabilities.alignedTranscript && + this.stt.capabilities.streaming && + this.vad && + this.turnDetection !== 'manual' && + this.turnDetection !== 'realtime_llm' && + !(this.llm instanceof RealtimeModel) + ) + ) { + if ( + agentInterruptionDetection === 'adaptive' || + sessionInterruptionDetection === 'adaptive' + ) { + this.logger.warn( + "interruptionDetection is provided, but it's not compatible with the current configuration and will be disabled", + ); + } + return undefined; + } + + if (!this.allowInterruptions) { + return undefined; + } + + if (agentInterruptionDetection === 'vad') { + return undefined; + } + + if (sessionInterruptionDetection === 'vad') { + return undefined; + } + + if ( + agentInterruptionDetection === undefined && + sessionInterruptionDetection === undefined && + !isHosted() && + !isDevMode() + ) { + this.logger.info('adaptive interruption is disabled by default in production mode'); + return undefined; + } + + try { + const detector = new AdaptiveInterruptionDetector(); + + detector.on('overlapping_speech', this.onInterruptionOverlappingSpeech); + detector.on('metrics_collected', this.onInterruptionMetricsCollected); + detector.on('error', this.onInterruptionError); + + return detector; + } catch (error: unknown) { + this.logger.warn({ error }, 'could not instantiate AdaptiveInterruptionDetector'); + } + return undefined; + } + + private restoreInterruptionByAudioActivity(): void { + this.isInterruptionByAudioActivityEnabled = this.isDefaultInterruptionByAudioActivityEnabled; + } + + private fallbackToVadInterruption(): void { + if (!this.isInterruptionDetectionEnabled) return; + + this.isInterruptionDetectionEnabled = false; + 
this.restoreInterruptionByAudioActivity(); + + if (this.interruptionDetector) { + this.interruptionDetector.off('overlapping_speech', this.onInterruptionOverlappingSpeech); + this.interruptionDetector.off('metrics_collected', this.onInterruptionMetricsCollected); + this.interruptionDetector.off('error', this.onInterruptionError); + this.interruptionDetector = undefined; + } + + if (this.audioRecognition) { + this.audioRecognition.disableInterruptionDetection().catch((err) => { + this.logger.warn({ err }, 'error while disabling interruption detection'); + }); + } + + this.logger.warn( + 'adaptive interruption disabled due to unrecoverable error, falling back to VAD-based interruption', + ); + } + private async _closeSessionResources(): Promise { // Unregister event handlers to prevent duplicate metrics if (this.llm instanceof LLM) { diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 74e1247bf..c88eda47e 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -17,12 +17,15 @@ import { type STTModelString, type TTSModelString, } from '../inference/index.js'; +import type { InterruptionDetectionError } from '../inference/interruption/errors.js'; +import type { OverlappingSpeechEvent } from '../inference/interruption/types.js'; import { type JobContext, getJobContext } from '../job.js'; import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js'; import type { LLMError } from '../llm/llm.js'; import { log } from '../log.js'; +import { type ModelUsage, ModelUsageCollector, filterZeroValues } from '../metrics/model_usage.js'; import type { STT } from '../stt/index.js'; import type { STTError } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; @@ -61,39 +64,63 @@ import { } from 
'./events.js'; import { AgentInput, AgentOutput } from './io.js'; import { RecorderIO } from './recorder_io/index.js'; -import { RoomIO, type RoomInputOptions, type RoomOutputOptions } from './room_io/index.js'; +import { RoomSessionTransport, SessionHost } from './remote_session.js'; +import { + DEFAULT_TEXT_INPUT_CALLBACK, + RoomIO, + type RoomInputOptions, + type RoomOutputOptions, +} from './room_io/index.js'; import type { UnknownUserData } from './run_context.js'; import type { SpeechHandle } from './speech_handle.js'; import { RunResult } from './testing/run_result.js'; +import type { InterruptionOptions } from './turn_config/interruption.js'; +import type { + InternalTurnHandlingOptions, + TurnHandlingOptions, +} from './turn_config/turn_handling.js'; +import { migrateLegacyOptions } from './turn_config/utils.js'; import { setParticipantSpanAttributes } from './utils.js'; -export interface VoiceOptions { - allowInterruptions: boolean; - discardAudioIfUninterruptible: boolean; - minInterruptionDuration: number; - minInterruptionWords: number; - minEndpointingDelay: number; - maxEndpointingDelay: number; - maxToolSteps: number; - preemptiveGeneration: boolean; - userAwayTimeout?: number | null; - aecWarmupDuration: number | null; +export interface AgentSessionUsage { + /** List of usage summaries, one per model/provider combination. 
*/ + modelUsage: Array>; +} + +export interface InternalSessionOptions extends AgentSessionOptions { + turnHandling: InternalTurnHandlingOptions; useTtsAlignedTranscript: boolean; + maxToolSteps: number; + userAwayTimeout: number | null; } -const defaultVoiceOptions: VoiceOptions = { - allowInterruptions: true, - discardAudioIfUninterruptible: true, - minInterruptionDuration: 500, - minInterruptionWords: 0, - minEndpointingDelay: 500, - maxEndpointingDelay: 6000, +export const defaultAgentSessionOptions = { maxToolSteps: 3, - preemptiveGeneration: false, + preemptiveGeneration: true, userAwayTimeout: 15.0, aecWarmupDuration: 3000, + turnHandling: {}, useTtsAlignedTranscript: true, -} as const; +} as const satisfies AgentSessionOptions; + +/** @deprecated {@link VoiceOptions} has been flattened onto to {@link AgentSessionOptions} */ +export type VoiceOptions = { + maxToolSteps: number; + preemptiveGeneration: boolean; + userAwayTimeout?: number | null; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.mode instead. */ + allowInterruptions?: boolean; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.discardAudioIfUninterruptible instead. */ + discardAudioIfUninterruptible?: boolean; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.minDuration instead. */ + minInterruptionDuration?: number; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.interruption.minWords instead. */ + minInterruptionWords?: number; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.endpointing.minDelay instead. */ + minEndpointingDelay?: number; + /** @deprecated Use {@link AgentSessionOptions.turnHandling}.endpointing.maxDelay instead. 
*/ + maxEndpointingDelay?: number; +}; export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector; @@ -107,17 +134,53 @@ export type AgentSessionCallbacks = { [AgentSessionEventTypes.SpeechCreated]: (ev: SpeechCreatedEvent) => void; [AgentSessionEventTypes.Error]: (ev: ErrorEvent) => void; [AgentSessionEventTypes.Close]: (ev: CloseEvent) => void; + [AgentSessionEventTypes.OverlappingSpeech]: (ev: OverlappingSpeechEvent) => void; }; export type AgentSessionOptions = { - turnDetection?: TurnDetectionMode; stt?: STT | STTModelString; vad?: VAD; llm?: LLM | RealtimeModel | LLMModels; tts?: TTS | TTSModelString; userData?: UserData; - voiceOptions?: Partial; connOptions?: SessionConnectOptions; + + /** @deprecated use turnHandling.turnDetection instead */ + turnDetection?: TurnDetectionMode; + /** @deprecated use top-level SessionOptions fields instead */ + voiceOptions?: Partial; + + maxToolSteps?: number; + /** + * Whether to speculatively begin LLM and TTS requests before an end-of-turn is detected. + * When `true`, the agent sends inference calls as soon as a user transcript is received rather + * than waiting for a definitive turn boundary. This can reduce response latency by overlapping + * model inference with user audio, but may incur extra compute if the user interrupts or + * revises mid-utterance. + * @defaultValue true + */ + preemptiveGeneration?: boolean; + + /** + * If set, set the user state as "away" after this amount of time after user and agent are + * silent. Set to `null` to disable. + * @defaultValue 15.0 + */ + userAwayTimeout?: number | null; + + /** + * Duration in milliseconds for AEC (Acoustic Echo Cancellation) warmup, during which + * interruptions from audio activity are suppressed. Set to `null` to disable. + * @defaultValue 3000 + */ + aecWarmupDuration?: number | null; + + /** + * Configuration for turn handling. 
+ */ + turnHandling?: Partial; + + useTtsAlignedTranscript?: boolean; }; type ActivityTransitionOptions = { @@ -136,22 +199,23 @@ export class AgentSession< tts?: TTS; turnDetection?: TurnDetectionMode; + /** @deprecated use {@link sessionOptions } instead */ readonly options: VoiceOptions; + readonly sessionOptions: InternalSessionOptions; + + private readonly activityLock = new Mutex(); + private agent?: Agent; private activity?: AgentActivity; private nextActivity?: AgentActivity; private updateActivityTask?: Task; private started = false; - private userState: UserState = 'listening'; - private readonly activityLock = new Mutex(); - - /** @internal */ - _roomIO?: RoomIO; - private logger = log(); + private sessionHost?: SessionHost; private _chatCtx: ChatContext; private _userData: UserData | undefined; + private _userState: UserState = 'listening'; private _agentState: AgentState = 'initializing'; private _input: AgentInput; @@ -170,9 +234,16 @@ export class AgentSession< private ttsErrorCounts = 0; private sessionSpan?: Span; - private userSpeakingSpan?: Span; private agentSpeakingSpan?: Span; + private _interruptionDetection?: InterruptionOptions['mode']; + + /** @internal */ + _usageCollector: ModelUsageCollector = new ModelUsageCollector(); + + /** @internal */ + _roomIO?: RoomIO; + /** @internal */ _aecWarmupRemaining = 0; @@ -194,20 +265,18 @@ export class AgentSession< /** @internal - Current run state for testing */ _globalRunState?: RunResult; - constructor(opts: AgentSessionOptions) { + /** @internal */ + _userSpeakingSpan?: Span; + + private logger = log(); + + constructor(options: AgentSessionOptions) { super(); - const { - vad, - stt, - llm, - tts, - turnDetection, - userData, - voiceOptions = defaultVoiceOptions, - connOptions, - } = opts; + const { agentSessionOptions: opts, legacyVoiceOptions } = + migrateLegacyOptions(options); + const { vad, stt, llm, tts, userData, connOptions, ...resolvedSessionOptions } = opts; // Merge user-provided 
connOptions with defaults this._connOptions = { sttConnOptions: { ...DEFAULT_API_CONNECT_OPTIONS, ...connOptions?.sttConnOptions }, @@ -238,7 +307,8 @@ export class AgentSession< this.tts = tts; } - this.turnDetection = turnDetection; + this.turnDetection = resolvedSessionOptions.turnHandling.turnDetection; + this._interruptionDetection = resolvedSessionOptions.turnHandling.interruption?.mode; this._userData = userData; // configurable IO @@ -247,8 +317,9 @@ export class AgentSession< // This is the "global" chat context, it holds the entire conversation history this._chatCtx = ChatContext.empty(); - this.options = { ...defaultVoiceOptions, ...voiceOptions }; - this._aecWarmupRemaining = this.options.aecWarmupDuration ?? 0; + this.sessionOptions = resolvedSessionOptions; + this.options = legacyVoiceOptions; + this._aecWarmupRemaining = this.sessionOptions.aecWarmupDuration ?? 0; this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this); this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed); @@ -288,8 +359,20 @@ export class AgentSession< return this._connOptions; } + get interruptionDetection() { + return this._interruptionDetection; + } + + /** + * Returns usage summaries for this session, one per model/provider combination. + */ + get usage(): AgentSessionUsage { + // Skip zero fields for more concise usage display (matches python behavior). 
+ return { modelUsage: this._usageCollector.flatten().map(filterZeroValues) }; + } + get useTtsAlignedTranscript(): boolean { - return this.options.useTtsAlignedTranscript; + return this.sessionOptions.useTtsAlignedTranscript; } set userData(value: UserData) { @@ -342,7 +425,17 @@ export class AgentSession< inputOptions, outputOptions, }); + this._roomIO.start(); + + const transport = new RoomSessionTransport(room, this._roomIO); + this.sessionHost = new SessionHost(transport); + this.sessionHost.registerSession(this); + if (inputOptions?.textEnabled !== false) { + this.sessionHost.registerTextInput( + inputOptions?.textInputCallback ?? DEFAULT_TEXT_INPUT_CALLBACK, + ); + } } let ctx: JobContext | undefined = undefined; @@ -385,6 +478,10 @@ export class AgentSession< await Promise.allSettled(tasks); + if (this.sessionHost) { + await this.sessionHost.start(); + } + // Log used IO configuration this.logger.debug( `using audio io: ${this.input.audio ? '`' + this.input.audio.constructor.name + '`' : '(none)'} -> \`AgentSession\` -> ${this.output.audio ? 
'`' + this.output.audio.constructor.name + '`' : '(none)'}`, @@ -416,6 +513,8 @@ export class AgentSession< return; } + this._usageCollector = new ModelUsageCollector(); + let ctx: JobContext | undefined = undefined; try { ctx = getJobContext(); @@ -748,6 +847,10 @@ export class AgentSession< return this._agentState; } + get userState(): UserState { + return this._userState; + } + get currentAgent(): Agent { if (!this.agent) { throw new Error('AgentSession is not running'); @@ -788,7 +891,9 @@ export class AgentSession< } /** @internal */ - _onError(error: RealtimeModelError | STTError | TTSError | LLMError): void { + _onError( + error: RealtimeModelError | STTError | TTSError | LLMError | InterruptionDetectionError, + ): void { if (this.closingTask || error.recoverable) { return; } @@ -804,9 +909,12 @@ export class AgentSession< if (this.ttsErrorCounts <= this._connOptions.maxUnrecoverableErrors) { return; } + } else if (error.type === 'interruption_detection_error') { + this.logger.error(error.toString()); + return; } - this.logger.error(error, 'AgentSession is closing due to unrecoverable error'); + this.logger.error(error, 'AgentSession is closing due to an unrecoverable error'); this.closingTask = (async () => { await this.closeImpl(CloseReason.ERROR, error); @@ -833,7 +941,6 @@ export class AgentSession< } if (state === 'speaking') { - // Reset error counts when agent starts speaking this.llmErrorCounts = 0; this.ttsErrorCounts = 0; @@ -867,7 +974,7 @@ export class AgentSession< this._agentState = state; // Handle user away timer based on state changes - if (state === 'listening' && this.userState === 'listening') { + if (state === 'listening' && this._userState === 'listening') { this._setUserAwayTimer(); } else { this._cancelUserAwayTimer(); @@ -880,29 +987,32 @@ export class AgentSession< } /** @internal */ - _updateUserState(state: UserState, lastSpeakingTime?: number) { - if (this.userState === state) { + _updateUserState( + state: UserState, + options?: 
{ lastSpeakingTime?: number; otelContext?: Context }, + ) { + if (this._userState === state) { return; } - if (state === 'speaking' && this.userSpeakingSpan === undefined) { - this.userSpeakingSpan = tracer.startSpan({ + if (state === 'speaking' && this._userSpeakingSpan === undefined) { + this._userSpeakingSpan = tracer.startSpan({ name: 'user_speaking', - context: this.rootSpanContext, - startTime: lastSpeakingTime, + context: options?.otelContext ?? this.rootSpanContext, + startTime: options?.lastSpeakingTime, }); const linked = this._roomIO?.linkedParticipant; if (linked) { - setParticipantSpanAttributes(this.userSpeakingSpan, linked); + setParticipantSpanAttributes(this._userSpeakingSpan, linked); } - } else if (this.userSpeakingSpan !== undefined) { - this.userSpeakingSpan.end(lastSpeakingTime); - this.userSpeakingSpan = undefined; + } else if (this._userSpeakingSpan !== undefined) { + this._userSpeakingSpan.end(options?.lastSpeakingTime); + this._userSpeakingSpan = undefined; } - const oldState = this.userState; - this.userState = state; + const oldState = this._userState; + this._userState = state; // Handle user away timer based on state changes if (state === 'listening' && this._agentState === 'listening') { @@ -935,7 +1045,10 @@ export class AgentSession< private _setUserAwayTimer(): void { this._cancelUserAwayTimer(); - if (this.options.userAwayTimeout === null || this.options.userAwayTimeout === undefined) { + if ( + this.sessionOptions.userAwayTimeout === null || + this.sessionOptions.userAwayTimeout === undefined + ) { return; } @@ -946,7 +1059,7 @@ export class AgentSession< this.userAwayTimer = setTimeout(() => { this.logger.debug('User away timeout triggered'); this._updateUserState('away'); - }, this.options.userAwayTimeout * 1000); + }, this.sessionOptions.userAwayTimeout * 1000); } private _cancelUserAwayTimer(): void { @@ -970,7 +1083,7 @@ export class AgentSession< } private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void { - if 
(this.userState === 'away' && ev.isFinal) { + if (this._userState === 'away' && ev.isFinal) { this.logger.debug('User returned from away state due to speech input'); this._updateUserState('listening'); } @@ -978,7 +1091,13 @@ export class AgentSession< private async closeImpl( reason: ShutdownReason, - error: RealtimeModelError | LLMError | TTSError | STTError | null = null, + error: + | RealtimeModelError + | LLMError + | TTSError + | STTError + | InterruptionDetectionError + | null = null, drain: boolean = false, ): Promise { if (this.rootSpanContext) { @@ -992,7 +1111,13 @@ export class AgentSession< private async closeImplInner( reason: ShutdownReason, - error: RealtimeModelError | LLMError | TTSError | STTError | null = null, + error: + | RealtimeModelError + | LLMError + | TTSError + | STTError + | InterruptionDetectionError + | null = null, drain: boolean = false, ): Promise { if (!this.started) { @@ -1037,6 +1162,9 @@ export class AgentSession< this.output.audio = null; this.output.transcription = null; + await this.sessionHost?.close(); + this.sessionHost = undefined; + await this._roomIO?.close(); this._roomIO = undefined; @@ -1048,9 +1176,9 @@ export class AgentSession< this.sessionSpan = undefined; } - if (this.userSpeakingSpan) { - this.userSpeakingSpan.end(); - this.userSpeakingSpan = undefined; + if (this._userSpeakingSpan) { + this._userSpeakingSpan.end(); + this._userSpeakingSpan = undefined; } if (this.agentSpeakingSpan) { @@ -1062,7 +1190,7 @@ export class AgentSession< this.emit(AgentSessionEventTypes.Close, createCloseEvent(reason, error)); - this.userState = 'listening'; + this._userState = 'listening'; this._agentState = 'initializing'; this.rootSpanContext = undefined; this.llmErrorCounts = 0; diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 7c76eac37..781af70de 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -12,15 +12,25 @@ import { } from 
'@opentelemetry/api'; import type { WritableStreamDefaultWriter } from 'node:stream/web'; import { ReadableStream } from 'node:stream/web'; +import { isAPIError } from '../_exceptions.js'; +import { apiConnectDefaults, intervalForRetry } from '../inference/interruption/defaults.js'; +import { InterruptionDetectionError } from '../inference/interruption/errors.js'; +import type { AdaptiveInterruptionDetector } from '../inference/interruption/interruption_detector.js'; +import { InterruptionStreamSentinel } from '../inference/interruption/interruption_stream.js'; +import { + type InterruptionSentinel, + type OverlappingSpeechEvent, +} from '../inference/interruption/types.js'; import type { LanguageCode } from '../language.js'; import { type ChatContext } from '../llm/chat_context.js'; import { log } from '../log.js'; import { DeferredReadableStream, isStreamReaderReleaseError } from '../stream/deferred_stream.js'; import { IdentityTransform } from '../stream/identity_transform.js'; import { mergeReadableStreams } from '../stream/merge_readable_streams.js'; +import { type StreamChannel, createStreamChannel } from '../stream/stream_channel.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; import { traceTypes, tracer } from '../telemetry/index.js'; -import { Task, delay } from '../utils.js'; +import { Task, delay, waitForAbort } from '../utils.js'; import { type VAD, type VADEvent, VADEventType } from '../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; import type { STTNode } from './io.js'; @@ -47,6 +57,7 @@ export interface PreemptiveGenerationInfo { } export interface RecognitionHooks { + onInterruption: (ev: OverlappingSpeechEvent) => void; onStartOfSpeech: (ev: VADEvent) => void; onVADInferenceDone: (ev: VADEvent) => void; onEndOfSpeech: (ev: VADEvent) => void; @@ -59,9 +70,13 @@ export interface RecognitionHooks { } export interface _TurnDetector { + /** The model name used by this turn detector. 
*/ + readonly model: string; + /** The provider name for this turn detector. */ + readonly provider: string; unlikelyThreshold: (language?: LanguageCode) => Promise; supportsLanguage: (language?: LanguageCode) => Promise; - predictEndOfTurn(chatCtx: ChatContext): Promise; + predictEndOfTurn(chatCtx: ChatContext, timeout?: number): Promise; } export interface AudioRecognitionOptions { @@ -74,7 +89,8 @@ export interface AudioRecognitionOptions { /** Turn detector for end-of-turn prediction. */ turnDetector?: _TurnDetector; /** Turn detection mode. */ - turnDetectionMode?: Exclude; + turnDetectionMode?: TurnDetectionMode; + interruptionDetection?: AdaptiveInterruptionDetector; /** Minimum endpointing delay in milliseconds. */ minEndpointingDelay: number; /** Maximum endpointing delay in milliseconds. */ @@ -99,12 +115,13 @@ export interface ParticipantLike { kind: ParticipantKind; } +// TODO add ability to update stt/vad/interruption-detection export class AudioRecognition { private hooks: RecognitionHooks; private stt?: STTNode; private vad?: VAD; private turnDetector?: _TurnDetector; - private turnDetectionMode?: Exclude; + private turnDetectionMode?: TurnDetectionMode; private minEndpointingDelay: number; private maxEndpointingDelay: number; private lastLanguage?: LanguageCode; @@ -138,6 +155,16 @@ export class AudioRecognition { private commitUserTurnTask?: Task; private vadTask?: Task; private sttTask?: Task; + private interruptionTask?: Task; + + // interruption detection + private interruptionDetection?: AdaptiveInterruptionDetector; + private _inputStartedAt?: number; + private ignoreUserTranscriptUntil?: number; + private transcriptBuffer: SpeechEvent[]; + private isInterruptionEnabled: boolean; + private isAgentSpeaking: boolean; + private interruptionStreamChannel?: StreamChannel; constructor(opts: AudioRecognitionOptions) { this.hooks = opts.recognitionHooks; @@ -154,9 +181,29 @@ export class AudioRecognition { this.getLinkedParticipant = 
opts.getLinkedParticipant; this.deferredInputStream = new DeferredReadableStream(); - const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee(); - this.vadInputStream = vadInputStream; - this.sttInputStream = mergeReadableStreams(sttInputStream, this.silenceAudioTransform.readable); + this.interruptionDetection = opts.interruptionDetection; + this.transcriptBuffer = []; + this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad); + this.isAgentSpeaking = false; + + if (opts.interruptionDetection) { + const [vadInputStream, teedInput] = this.deferredInputStream.stream.tee(); + const [inputStream, sttInputStream] = teedInput.tee(); + this.vadInputStream = vadInputStream; + this.sttInputStream = mergeReadableStreams( + sttInputStream, + this.silenceAudioTransform.readable, + ); + this.interruptionStreamChannel = createStreamChannel(); + this.interruptionStreamChannel.addStreamInput(inputStream); + } else { + const [vadInputStream, sttInputStream] = this.deferredInputStream.stream.tee(); + this.vadInputStream = vadInputStream; + this.sttInputStream = mergeReadableStreams( + sttInputStream, + this.silenceAudioTransform.readable, + ); + } this.silenceAudioWriter = this.silenceAudioTransform.writable.getWriter(); } @@ -170,6 +217,16 @@ export class AudioRecognition { return this.audioTranscript; } + /** @internal */ + get inputStartedAt() { + return this._inputStartedAt; + } + + /** @internal */ + updateOptions(options: { turnDetection: TurnDetectionMode | undefined }): void { + this.turnDetectionMode = options.turnDetection; + } + async start() { this.vadTask = Task.from(({ signal }) => this.createVadTask(this.vad, signal)); this.vadTask.result.catch((err) => { @@ -180,6 +237,220 @@ export class AudioRecognition { this.sttTask.result.catch((err) => { this.logger.error(`Error running STT task: ${err}`); }); + + this.interruptionTask = Task.from(({ signal }) => + this.createInterruptionTask(this.interruptionDetection, signal), + ); + 
this.interruptionTask.result.catch((err) => { + this.logger.error(`Error running interruption task: ${err}`); + }); + } + + async stop() { + await this.sttTask?.cancelAndWait(); + await this.vadTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); + } + + async disableInterruptionDetection(): Promise { + this.isInterruptionEnabled = false; + this.interruptionDetection = undefined; + await this.interruptionTask?.cancelAndWait(); + this.interruptionTask = undefined; + await this.interruptionStreamChannel?.close(); + this.interruptionStreamChannel = undefined; + } + + async onStartOfAgentSpeech() { + this.isAgentSpeaking = true; + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.agentSpeechStarted()); + } + + async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { + if (!this.isInterruptionEnabled) { + this.isAgentSpeaking = false; + return; + } + + const inputOpen = await this.trySendInterruptionSentinel( + InterruptionStreamSentinel.agentSpeechEnded(), + ); + if (!inputOpen) { + this.isAgentSpeaking = false; + return; + } + + if (this.isAgentSpeaking) { + if (this.ignoreUserTranscriptUntil === undefined) { + this.onEndOfOverlapSpeech(Date.now()); + } + this.ignoreUserTranscriptUntil = this.ignoreUserTranscriptUntil + ? Math.min(ignoreUserTranscriptUntil, this.ignoreUserTranscriptUntil) + : ignoreUserTranscriptUntil; + + // flush held transcripts if possible + await this.flushHeldTranscripts(); + } + this.isAgentSpeaking = false; + } + + /** Start interruption inference when agent is speaking and overlap speech starts. */ + async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { + if (this.isAgentSpeaking) { + this.trySendInterruptionSentinel( + InterruptionStreamSentinel.overlapSpeechStarted( + speechDuration, + startedAt, + userSpeakingSpan, + ), + ); + } + } + + /** End interruption inference when overlap speech ends. 
*/ + async onEndOfOverlapSpeech(endedAt: number, userSpeakingSpan?: Span) { + if (!this.isInterruptionEnabled) { + return; + } + if (userSpeakingSpan && userSpeakingSpan.isRecording()) { + userSpeakingSpan.setAttribute(traceTypes.ATTR_IS_INTERRUPTION, 'false'); + } + + return this.trySendInterruptionSentinel(InterruptionStreamSentinel.overlapSpeechEnded(endedAt)); + } + + /** + * Flush held transcripts whose *end time* is after the ignoreUserTranscriptUntil timestamp. + * If the event has no timestamps, we assume it is the same as the next valid event. + */ + private async flushHeldTranscripts() { + if ( + !this.isInterruptionEnabled || + this.ignoreUserTranscriptUntil === undefined || + this.transcriptBuffer.length === 0 + ) { + return; + } + + if (!this._inputStartedAt) { + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + return; + } + + let emitFromIndex: number | null = null; + let shouldFlush = false; + + for (let i = 0; i < this.transcriptBuffer.length; i++) { + const ev = this.transcriptBuffer[i]; + if (!ev || !ev.alternatives || ev.alternatives.length === 0) { + emitFromIndex = Math.min(emitFromIndex ?? i, i); + continue; + } + const firstAlternative = ev.alternatives[0]; + if ( + firstAlternative.startTime === firstAlternative.endTime && + firstAlternative.startTime === 0 + ) { + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + return; + } + + if (this.#alternativeEndsBeforeIgnoreWindow(firstAlternative)) { + emitFromIndex = null; + } else { + emitFromIndex = Math.min(emitFromIndex ?? i, i); + shouldFlush = true; + break; + } + } + + const eventsToEmit = + emitFromIndex !== null && shouldFlush ? 
this.transcriptBuffer.slice(emitFromIndex) : []; + + this.transcriptBuffer = []; + this.ignoreUserTranscriptUntil = undefined; + + for (const event of eventsToEmit) { + this.logger.trace( + { + event: event.type, + }, + 're-emitting held user transcript', + ); + this.onSTTEvent(event); + } + } + + #alternativeEndsBeforeIgnoreWindow( + alternative: NonNullable[number], + ): boolean { + if ( + this.ignoreUserTranscriptUntil === undefined || + !this._inputStartedAt || + alternative.startTime <= 0 + ) { + return false; + } + + // `SpeechData.startTime` is in seconds relative to audio start, while `inputStartedAt` and + // `ignoreUserTranscriptUntil` are epoch milliseconds. + return alternative.startTime * 1000 + this._inputStartedAt < this.ignoreUserTranscriptUntil; + } + + private shouldHoldSttEvent(ev: SpeechEvent): boolean { + if (!this.isInterruptionEnabled) { + return false; + } + if (this.isAgentSpeaking) { + return true; + } + + // reset when the user starts speaking after the agent speech + if (ev.type === SpeechEventType.START_OF_SPEECH) { + this.ignoreUserTranscriptUntil = undefined; + this.transcriptBuffer = []; + return false; + } + + if (this.ignoreUserTranscriptUntil === undefined) { + return false; + } + // sentinel events are always held until we have something concrete to release them + if (!ev.alternatives || ev.alternatives.length === 0) { + return true; + } + + const alternative = ev.alternatives[0]; + + if ( + alternative.startTime !== alternative.endTime && + this.#alternativeEndsBeforeIgnoreWindow(alternative) + ) { + return true; + } + return false; + } + + private async trySendInterruptionSentinel( + frame: AudioFrame | InterruptionSentinel, + ): Promise { + if ( + this.isInterruptionEnabled && + this.interruptionStreamChannel && + !this.interruptionStreamChannel.closed + ) { + try { + await this.interruptionStreamChannel.write(frame); + return true; + } catch (e: unknown) { + this.logger.warn( + `could not forward interruption sentinel: ${e 
instanceof Error ? e.message : String(e)}`, + ); + } + } + return false; } private ensureUserTurnSpan(startTime?: number): Span { @@ -235,6 +506,25 @@ export class AudioRecognition { return; } + // handle interruption detection + // - hold the event until the ignore_user_transcript_until expires + // - release only relevant events + // - allow RECOGNITION_USAGE to pass through immediately + + if (ev.type !== SpeechEventType.RECOGNITION_USAGE && this.isInterruptionEnabled) { + if (this.shouldHoldSttEvent(ev)) { + this.logger.trace( + { event: ev.type, ignoreUserTranscriptUntil: this.ignoreUserTranscriptUntil }, + 'holding STT event until ignore_user_transcript_until expires', + ); + this.transcriptBuffer.push(ev); + return; + } else { + await this.flushHeldTranscripts(); + // no return here to allow the new event to be processed normally + } + } + switch (ev.type) { case SpeechEventType.FINAL_TRANSCRIPT: const transcript = ev.alternatives?.[0]?.text; @@ -418,6 +708,12 @@ export class AudioRecognition { } } + private onOverlapSpeechEvent(ev: OverlappingSpeechEvent) { + if (ev.isInterruption) { + this.hooks.onInterruption(ev); + } + } + private runEOUDetection(chatCtx: ChatContext) { this.logger.debug( { @@ -676,7 +972,9 @@ export class AudioRecognition { this.lastSpeakingTime = Date.now(); if (this.speechStartTime === undefined) { - this.speechStartTime = Date.now(); + // Backdate speechStartTime to the actual start of accumulated speech. + // ev.rawAccumulatedSpeech is in ms (VADEvent durations are all ms in TS). 
+ this.speechStartTime = Date.now() - ev.rawAccumulatedSpeech; } } break; @@ -708,6 +1006,136 @@ export class AudioRecognition { } } + private async createInterruptionTask( + interruptionDetection: AdaptiveInterruptionDetector | undefined, + signal: AbortSignal, + ) { + if (!interruptionDetection || !this.interruptionStreamChannel) return; + + let numRetries = 0; + const maxRetries = apiConnectDefaults.maxRetries; + + while (!signal.aborted) { + const stream = interruptionDetection.createStream(); + const eventReader = stream.stream().getReader(); + + const cleanup = async () => { + try { + signal.removeEventListener('abort', cleanup); + eventReader.releaseLock(); + await stream.close(); + } catch (e) { + this.logger.debug('createInterruptionTask: error during cleanup:', e); + } + }; + + signal.addEventListener('abort', cleanup, { once: true }); + + let forwardTask: Promise | undefined; + + try { + // Unlike Python where _agent_speech_started lives on `self` and survives retries, + // JS creates a fresh InterruptionStreamBase per retry with agentSpeechStarted = false. + // Re-inject the sentinel so the new stream knows the agent is mid-speech. 
+ if (numRetries > 0 && this.isAgentSpeaking) { + await stream.pushFrame(InterruptionStreamSentinel.agentSpeechStarted()); + } + + forwardTask = (async () => { + const inputReader = this.interruptionStreamChannel!.stream().getReader(); + const abortPromise = waitForAbort(signal); + + try { + while (!signal.aborted) { + const res = await Promise.race([inputReader.read(), abortPromise]); + if (!res) break; + + const { value, done } = res; + if (done) break; + + if (value instanceof AudioFrame) { + const frameDurationMs = (value.samplesPerChannel / value.sampleRate) * 1000; + this._inputStartedAt ??= Date.now() - frameDurationMs; + } else { + this._inputStartedAt ??= Date.now(); + } + + await stream.pushFrame(value); + } + } finally { + inputReader.releaseLock(); + } + })(); + + const abortPromise = waitForAbort(signal); + + while (!signal.aborted) { + const res = await Promise.race([eventReader.read(), abortPromise]); + if (!res) break; + const { done, value: ev } = res; + if (done) break; + this.onOverlapSpeechEvent(ev); + } + break; + } catch (e) { + if (signal.aborted) break; + + if (isAPIError(e)) { + if (maxRetries === 0 || !e.retryable) { + interruptionDetection.emitError( + new InterruptionDetectionError( + e.message, + Date.now(), + interruptionDetection.label, + false, + ), + ); + break; + } else if (numRetries >= maxRetries) { + interruptionDetection.emitError( + new InterruptionDetectionError( + `failed to detect interruption after ${numRetries} attempts`, + Date.now(), + interruptionDetection.label, + false, + ), + ); + break; + } else { + const retryInterval = intervalForRetry(numRetries); + interruptionDetection.emitError( + new InterruptionDetectionError( + e.message, + Date.now(), + interruptionDetection.label, + true, + ), + ); + this.logger.warn( + { model: interruptionDetection.label, attempt: numRetries }, + `failed to detect interruption, retrying in ${retryInterval}ms`, + ); + numRetries++; + await delay(retryInterval, { signal }); + } + } else 
{ + const msg = e instanceof Error ? e.message : String(e); + interruptionDetection.emitError( + new InterruptionDetectionError(msg, Date.now(), interruptionDetection.label, false), + ); + this.logger.error(e, 'Error in interruption task'); + break; + } + } finally { + await cleanup(); + await forwardTask?.catch((e) => { + this.logger.debug({ err: e }, 'interruption task exited with error'); + }); + } + } + this.logger.debug('Interruption task closed'); + } + setInputAudioStream(audioStream: ReadableStream) { this.deferredInputStream.setSource(audioStream); } @@ -784,6 +1212,8 @@ export class AudioRecognition { await this.sttTask?.cancelAndWait(); await this.vadTask?.cancelAndWait(); await this.bounceEOUTask?.cancelAndWait(); + await this.interruptionTask?.cancelAndWait(); + await this.interruptionStreamChannel?.close(); } private _endUserTurnSpan({ @@ -810,6 +1240,14 @@ export class AudioRecognition { } private get vadBaseTurnDetection() { - return ['vad', undefined].includes(this.turnDetectionMode); + if (typeof this.turnDetectionMode === 'object') { + return false; + } + + if (this.turnDetectionMode === undefined || this.turnDetectionMode === 'vad') { + return true; + } + + return false; } } diff --git a/agents/src/voice/audio_recognition_span.test.ts b/agents/src/voice/audio_recognition_span.test.ts index 9e9eb521c..cfe92a821 100644 --- a/agents/src/voice/audio_recognition_span.test.ts +++ b/agents/src/voice/audio_recognition_span.test.ts @@ -2,14 +2,27 @@ // // SPDX-License-Identifier: Apache-2.0 import { ParticipantKind } from '@livekit/rtc-node'; -import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api'; +import { + InMemorySpanExporter, + type ReadableSpan, + SimpleSpanProcessor, +} from '@opentelemetry/sdk-trace-base'; import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import { ReadableStream } from 'node:stream/web'; import 
{ describe, expect, it, vi } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; import { initializeLogger } from '../log.js'; import { type SpeechEvent, SpeechEventType } from '../stt/stt.js'; -import { setTracerProvider } from '../telemetry/index.js'; +import { setTracerProvider, tracer } from '../telemetry/index.js'; import { VAD, type VADEvent, VADEventType, type VADStream } from '../vad.js'; -import { AudioRecognition, type _TurnDetector } from './audio_recognition.js'; +import { AgentSession } from './agent_session.js'; +import { + AudioRecognition, + type RecognitionHooks, + type _TurnDetector, +} from './audio_recognition.js'; +import type { STTNode } from './io.js'; function setupInMemoryTracing() { const exporter = new InMemorySpanExporter(); @@ -20,10 +33,25 @@ function setupInMemoryTracing() { return { exporter }; } -function spanByName(spans: any[], name: string) { +function spanByName(spans: ReadableSpan[], name: string) { return spans.find((s) => s.name === name); } +function createFakeSession(rootSpanContext = ROOT_CONTEXT): AgentSession { + return { + _agentState: 'listening', + _roomIO: { + linkedParticipant: { sid: 'p3', identity: 'charlie', kind: ParticipantKind.AGENT }, + }, + _setUserAwayTimer: vi.fn(), + _cancelUserAwayTimer: vi.fn(), + _userSpeakingSpan: undefined, + _userState: 'listening', + emit: vi.fn(), + rootSpanContext, + } as unknown as AgentSession; +} + class FakeVADStream extends (Object as unknown as { new (): VADStream }) { // We intentionally avoid extending the real VADStream (it is not exported as a value in JS output // in some bundling contexts). Instead we emulate the async iterator shape used by AudioRecognition. 
@@ -61,6 +89,8 @@ class FakeVAD extends VAD { } const alwaysTrueTurnDetector: _TurnDetector = { + model: 'test-turn-detector', + provider: 'test-provider', supportsLanguage: async () => true, unlikelyThreshold: async () => undefined, predictEndOfTurn: async () => 1.0, @@ -72,23 +102,15 @@ describe('AudioRecognition user_turn span parity', () => { it('creates user_turn and parents eou_detection under it (stt mode)', async () => { const { exporter } = setupInMemoryTracing(); - const hooks = { + const hooks: RecognitionHooks = { + onInterruption: vi.fn(), onStartOfSpeech: vi.fn(), onVADInferenceDone: vi.fn(), onEndOfSpeech: vi.fn(), onInterimTranscript: vi.fn(), onFinalTranscript: vi.fn(), onPreemptiveGeneration: vi.fn(), - retrieveChatCtx: () => - ({ - copy() { - return this; - }, - addMessage() {}, - toJSON() { - return { items: [] }; - }, - }) as any, + retrieveChatCtx: () => ChatContext.empty(), onEndOfTurn: vi.fn(async () => true), }; @@ -109,8 +131,8 @@ describe('AudioRecognition user_turn span parity', () => { { type: SpeechEventType.END_OF_SPEECH }, ]; - const sttNode = async () => - new ReadableStream({ + const sttNode: STTNode = async () => + new ReadableStream({ start(controller) { for (const ev of sttEvents) controller.enqueue(ev); controller.close(); @@ -118,8 +140,8 @@ describe('AudioRecognition user_turn span parity', () => { }); const ar = new AudioRecognition({ - recognitionHooks: hooks as any, - stt: sttNode as any, + recognitionHooks: hooks, + stt: sttNode, vad: undefined, turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'stt', @@ -140,6 +162,9 @@ describe('AudioRecognition user_turn span parity', () => { const eou = spanByName(spans, 'eou_detection'); expect(userTurn, 'user_turn span missing').toBeTruthy(); expect(eou, 'eou_detection span missing').toBeTruthy(); + if (!userTurn || !eou) { + throw new Error('expected user_turn and eou_detection spans'); + } expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId); @@ -158,23 +183,15 @@ 
describe('AudioRecognition user_turn span parity', () => { it('creates user_turn from VAD startTime (vad mode) and keeps same parenting', async () => { const { exporter } = setupInMemoryTracing(); - const hooks = { + const hooks: RecognitionHooks = { + onInterruption: vi.fn(), onStartOfSpeech: vi.fn(), onVADInferenceDone: vi.fn(), onEndOfSpeech: vi.fn(), onInterimTranscript: vi.fn(), onFinalTranscript: vi.fn(), onPreemptiveGeneration: vi.fn(), - retrieveChatCtx: () => - ({ - copy() { - return this; - }, - addMessage() {}, - toJSON() { - return { items: [] }; - }, - }) as any, + retrieveChatCtx: () => ChatContext.empty(), onEndOfTurn: vi.fn(async () => true), }; @@ -223,8 +240,8 @@ describe('AudioRecognition user_turn span parity', () => { }, ]; - const sttNode = async () => - new ReadableStream({ + const sttNode: STTNode = async () => + new ReadableStream({ start(controller) { for (const ev of sttEvents) controller.enqueue(ev); controller.close(); @@ -232,9 +249,9 @@ describe('AudioRecognition user_turn span parity', () => { }); const ar = new AudioRecognition({ - recognitionHooks: hooks as any, - stt: sttNode as any, - vad: new FakeVAD(vadEvents) as any, + recognitionHooks: hooks, + stt: sttNode, + vad: new FakeVAD(vadEvents), turnDetector: alwaysTrueTurnDetector, turnDetectionMode: 'vad', minEndpointingDelay: 0, @@ -253,9 +270,72 @@ describe('AudioRecognition user_turn span parity', () => { const eou = spanByName(spans, 'eou_detection'); expect(userTurn).toBeTruthy(); expect(eou).toBeTruthy(); + if (!userTurn || !eou) { + throw new Error('expected user_turn and eou_detection spans'); + } expect(eou.parentSpanId).toBe(userTurn.spanContext().spanId); expect(hooks.onStartOfSpeech).toHaveBeenCalled(); expect(hooks.onEndOfSpeech).toHaveBeenCalled(); }); + + it('parents user_speaking under user_turn when an explicit speech context is provided', () => { + const { exporter } = setupInMemoryTracing(); + const sessionSpan = tracer.startSpan({ name: 'agent_session', 
context: ROOT_CONTEXT }); + const sessionContext = trace.setSpan(ROOT_CONTEXT, sessionSpan); + const fakeSession = createFakeSession(sessionContext); + const userTurn = tracer.startSpan({ name: 'user_turn', context: sessionContext }); + const userTurnContext = trace.setSpan(sessionContext, userTurn); + const speakingStartedAt = Date.now() - 100; + const speakingEndedAt = Date.now(); + + otelContext.with(userTurnContext, () => { + AgentSession.prototype._updateUserState.call(fakeSession, 'speaking', { + lastSpeakingTime: speakingStartedAt, + otelContext: otelContext.active(), + }); + AgentSession.prototype._updateUserState.call(fakeSession, 'listening', { + lastSpeakingTime: speakingEndedAt, + otelContext: otelContext.active(), + }); + }); + + userTurn.end(); + sessionSpan.end(); + + const spans = exporter.getFinishedSpans(); + const userSpeaking = spanByName(spans, 'user_speaking'); + const exportedUserTurn = spanByName(spans, 'user_turn'); + expect(userSpeaking).toBeTruthy(); + expect(exportedUserTurn).toBeTruthy(); + if (!userSpeaking || !exportedUserTurn) { + throw new Error('expected user_speaking and user_turn spans'); + } + expect(userSpeaking.parentSpanId).toBe(exportedUserTurn.spanContext().spanId); + expect(userSpeaking.attributes['lk.participant_id']).toBe('p3'); + }); + + it('keeps user_speaking attached to the session root without an explicit speech context', () => { + const { exporter } = setupInMemoryTracing(); + const sessionSpan = tracer.startSpan({ name: 'agent_session', context: ROOT_CONTEXT }); + const sessionContext = trace.setSpan(ROOT_CONTEXT, sessionSpan); + const fakeSession = createFakeSession(sessionContext); + + AgentSession.prototype._updateUserState.call(fakeSession, 'speaking', { + lastSpeakingTime: Date.now() - 100, + }); + AgentSession.prototype._updateUserState.call(fakeSession, 'listening', { + lastSpeakingTime: Date.now(), + }); + + sessionSpan.end(); + + const spans = exporter.getFinishedSpans(); + const userSpeaking = 
spanByName(spans, 'user_speaking'); + expect(userSpeaking).toBeTruthy(); + if (!userSpeaking) { + throw new Error('expected user_speaking span'); + } + expect(userSpeaking.parentSpanId).toBe(sessionSpan.spanContext().spanId); + }); }); diff --git a/agents/src/voice/events.ts b/agents/src/voice/events.ts index aae866a6e..aedb93c23 100644 --- a/agents/src/voice/events.ts +++ b/agents/src/voice/events.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import type { InterruptionDetectionError } from '../inference/interruption/errors.js'; +import type { OverlappingSpeechEvent } from '../inference/interruption/types.js'; import type { LanguageCode } from '../language.js'; import type { ChatMessage, @@ -26,6 +28,7 @@ export enum AgentSessionEventTypes { FunctionToolsExecuted = 'function_tools_executed', MetricsCollected = 'metrics_collected', SpeechCreated = 'speech_created', + OverlappingSpeech = 'overlapping_speech', Error = 'error', Close = 'close', } @@ -216,13 +219,13 @@ export const createSpeechCreatedEvent = ({ export type ErrorEvent = { type: 'error'; - error: RealtimeModelError | STTError | TTSError | LLMError | unknown; + error: RealtimeModelError | STTError | TTSError | LLMError | InterruptionDetectionError | unknown; source: LLM | STT | TTS | RealtimeModel | unknown; createdAt: number; }; export const createErrorEvent = ( - error: RealtimeModelError | STTError | TTSError | LLMError | unknown, + error: RealtimeModelError | STTError | TTSError | LLMError | InterruptionDetectionError | unknown, source: LLM | STT | TTS | RealtimeModel | unknown, createdAt: number = Date.now(), ): ErrorEvent => ({ @@ -234,14 +237,20 @@ export const createErrorEvent = ( export type CloseEvent = { type: 'close'; - error: RealtimeModelError | STTError | TTSError | LLMError | null; + error: RealtimeModelError | STTError | TTSError | LLMError | InterruptionDetectionError | null; reason: ShutdownReason; createdAt: number; }; export 
const createCloseEvent = ( reason: ShutdownReason, - error: RealtimeModelError | STTError | TTSError | LLMError | null = null, + error: + | RealtimeModelError + | STTError + | TTSError + | LLMError + | InterruptionDetectionError + | null = null, createdAt: number = Date.now(), ): CloseEvent => ({ type: 'close', @@ -258,5 +267,6 @@ export type AgentEvent = | ConversationItemAddedEvent | FunctionToolsExecutedEvent | SpeechCreatedEvent + | OverlappingSpeechEvent | ErrorEvent | CloseEvent; diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 1f141ab37..d2eba8fc0 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -51,6 +51,7 @@ export class _LLMGenerationData { generatedText: string = ''; generatedToolCalls: FunctionCall[]; id: string; + ttft?: number; constructor( public readonly textStream: ReadableStream, @@ -416,6 +417,8 @@ export function performLLMInference( toolCtx: ToolContext, modelSettings: ModelSettings, controller: AbortController, + model?: string, + provider?: string, ): [Task, _LLMGenerationData] { const textStream = new IdentityTransform(); const toolCallStream = new IdentityTransform(); @@ -431,8 +434,17 @@ export function performLLMInference( ); span.setAttribute(traceTypes.ATTR_FUNCTION_TOOLS, JSON.stringify(Object.keys(toolCtx))); + if (model) { + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let llmStreamReader: ReadableStreamDefaultReader | null = null; let llmStream: ReadableStream | null = null; + const startTime = performance.now() / 1000; // Convert to seconds + let firstTokenReceived = false; try { llmStream = await node(chatCtx, toolCtx, modelSettings); @@ -455,6 +467,11 @@ export function performLLMInference( const { done, value: chunk } = result; if (done) break; + if (!firstTokenReceived) { + firstTokenReceived = true; + data.ttft = performance.now() / 1000 - 
startTime; + } + if (typeof chunk === 'string') { data.generatedText += chunk; await textWriter.write(chunk); @@ -493,6 +510,9 @@ export function performLLMInference( } span.setAttribute(traceTypes.ATTR_RESPONSE_TEXT, data.generatedText); + if (data.ttft !== undefined) { + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFT, data.ttft); + } } catch (error) { if (error instanceof DOMException && error.name === 'AbortError') { // Abort signal was triggered, handle gracefully @@ -527,6 +547,8 @@ export function performTTSInference( text: ReadableStream, modelSettings: ModelSettings, controller: AbortController, + model?: string, + provider?: string, ): [Task, _TTSGenerationData] { const audioStream = new IdentityTransform(); const outputWriter = audioStream.writable.getWriter(); @@ -558,10 +580,27 @@ export function performTTSInference( } })(); - const _performTTSInferenceImpl = async (signal: AbortSignal) => { + let ttfb: number | undefined; + + const genData: _TTSGenerationData = { + audioStream: audioOutputStream, + timedTextsFut, + ttfb: undefined, + }; + + const _performTTSInferenceImpl = async (signal: AbortSignal, span: Span) => { + if (model) { + span.setAttribute(traceTypes.ATTR_GEN_AI_REQUEST_MODEL, model); + } + if (provider) { + span.setAttribute(traceTypes.ATTR_GEN_AI_PROVIDER_NAME, provider); + } + let ttsStreamReader: ReadableStreamDefaultReader | null = null; let ttsStream: ReadableStream | null = null; let pushedDuration = 0; + const startTime = performance.now() / 1000; // Convert to seconds + let firstByteReceived = false; try { ttsStream = await node(textOnlyStream.readable, modelSettings); @@ -595,6 +634,13 @@ export function performTTSInference( break; } + if (!firstByteReceived) { + firstByteReceived = true; + ttfb = performance.now() / 1000 - startTime; + genData.ttfb = ttfb; + span.setAttribute(traceTypes.ATTR_RESPONSE_TTFB, ttfb); + } + // Write the audio frame to the output stream await outputWriter.write(frame); @@ -631,6 +677,10 @@ export 
function performTTSInference( } throw error; } finally { + if (!timedTextsFut.done) { + // Ensure downstream consumers don't hang on errors. + timedTextsFut.resolve(null); + } ttsStreamReader?.releaseLock(); await ttsStream?.cancel(); await outputWriter.close(); @@ -642,16 +692,11 @@ export function performTTSInference( const currentContext = otelContext.active(); const inferenceTask = async (signal: AbortSignal) => - tracer.startActiveSpan(async () => _performTTSInferenceImpl(signal), { + tracer.startActiveSpan(async (span) => _performTTSInferenceImpl(signal, span), { name: 'tts_node', context: currentContext, }); - const genData: _TTSGenerationData = { - audioStream: audioOutputStream, - timedTextsFut, - }; - return [ Task.from((controller) => inferenceTask(controller.signal), controller, 'performTTSInference'), genData, @@ -719,7 +764,6 @@ export function performTextForwarding( export interface _AudioOut { audio: Array; - /** Future that will be set with the timestamp of the first frame's capture */ firstFrameFut: Future; } @@ -807,7 +851,6 @@ export function performAudioForwarding( ]; } -// function_tool span is already implemented in tracableToolExecution below (line ~796) export function performToolExecutions({ session, speechHandle, diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index 947013336..df3200cea 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -5,6 +5,16 @@ export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js'; export * from './avatar/index.js'; export * from './background_audio.js'; +export { + type TextInputCallback, + type TextInputEvent, + RemoteSession, + type RemoteSessionCallbacks, + type RemoteSessionEventTypes, + SessionHost, + SessionTransport, + RoomSessionTransport, +} from './remote_session.js'; export * from './events.js'; export { type TimedString } from './io.js'; 
export * from './report.js'; diff --git a/agents/src/voice/remote_session.ts b/agents/src/voice/remote_session.ts new file mode 100644 index 000000000..a55e39a8c --- /dev/null +++ b/agents/src/voice/remote_session.ts @@ -0,0 +1,1083 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Timestamp } from '@bufbuild/protobuf'; +import { AgentSession as pb } from '@livekit/protocol'; +import type { ByteStreamReader, Room, TextStreamInfo } from '@livekit/rtc-node'; +import type { TypedEventEmitter } from '@livekit/typed-emitter'; +import EventEmitter from 'events'; +import { TOPIC_SESSION_MESSAGES } from '../constants.js'; +import type { OverlappingSpeechEvent } from '../inference/interruption/types.js'; +import type { + ChatItem, + FunctionCall as FCItem, + FunctionCallOutput as FCOItem, +} from '../llm/chat_context.js'; +import type { ToolContext } from '../llm/tool_context.js'; +import { log } from '../log.js'; +import type { + InterruptionModelUsage, + LLMModelUsage, + STTModelUsage, + TTSModelUsage, +} from '../metrics/model_usage.js'; +import { Future, Task, shortuuid } from '../utils.js'; +import type { AgentSession, AgentSessionUsage } from './agent_session.js'; +import { + AgentSessionEventTypes, + type AgentState, + type AgentStateChangedEvent, + type ConversationItemAddedEvent, + type ErrorEvent, + type FunctionToolsExecutedEvent, + type MetricsCollectedEvent, + type UserInputTranscribedEvent, + type UserState, + type UserStateChangedEvent, +} from './events.js'; +import type { RoomIO } from './room_io/room_io.js'; + +// =========================================================================== +// Shared types (TextInput, Client event types, wire format aliases) +// =========================================================================== + +export interface TextInputEvent { + text: string; + info?: TextStreamInfo; + participantIdentity?: string; +} + +export type TextInputCallback = (session: 
AgentSession, ev: TextInputEvent) => void | Promise; + +/** @experimental */ +export type RemoteSessionEventTypes = + | 'agent_state_changed' + | 'user_state_changed' + | 'conversation_item_added' + | 'user_input_transcribed' + | 'function_tools_executed' + | 'overlapping_speech' + | 'session_usage' + | 'error'; + +/** @experimental */ +export type RemoteSessionCallbacks = { + agent_state_changed: (ev: pb.AgentSessionEvent_AgentStateChanged) => void; + user_state_changed: (ev: pb.AgentSessionEvent_UserStateChanged) => void; + conversation_item_added: (ev: pb.AgentSessionEvent_ConversationItemAdded) => void; + user_input_transcribed: (ev: pb.AgentSessionEvent_UserInputTranscribed) => void; + function_tools_executed: (ev: pb.AgentSessionEvent_FunctionToolsExecuted) => void; + overlapping_speech: (ev: pb.AgentSessionEvent_OverlappingSpeech) => void; + session_usage: (ev: pb.AgentSessionEvent_SessionUsageUpdated) => void; + error: (ev: pb.AgentSessionEvent_Error) => void; +}; + +// =========================================================================== +// SessionTransport +// =========================================================================== + +export abstract class SessionTransport { + async start(): Promise {} + abstract sendMessage(msg: pb.AgentSessionMessage): Promise; + abstract close(): Promise; + abstract [Symbol.asyncIterator](): AsyncIterator; +} + +export class RoomSessionTransport extends SessionTransport { + private readonly room: Room; + private handlerRegistered = false; + private closed = false; + private pendingMessages: pb.AgentSessionMessage[] = []; + private waitingResolve: ((value: IteratorResult) => void) | null = null; + private roomIO: RoomIO; + + constructor(room: Room, roomIO: RoomIO) { + super(); + this.room = room; + this.roomIO = roomIO; + } + + private getRemoteIdentity() { + return this.roomIO.linkedParticipant?.identity; + } + + override async start(): Promise { + if (this.handlerRegistered) return; + 
this.room.registerByteStreamHandler(TOPIC_SESSION_MESSAGES, this.onByteStream); + this.handlerRegistered = true; + } + + private onByteStream = (reader: ByteStreamReader, participantInfo: { identity: string }) => { + if (this.getRemoteIdentity() && participantInfo.identity !== this.getRemoteIdentity()) { + return; + } + this.readStream(reader).catch((e) => { + log().warn({ error: e }, 'failed to read binary stream message'); + }); + }; + + private async readStream(reader: ByteStreamReader): Promise { + try { + const chunks = await reader.readAll(); + let totalLength = 0; + for (const chunk of chunks) { + totalLength += chunk.length; + } + const data = new Uint8Array(totalLength); + let offset = 0; + for (const chunk of chunks) { + data.set(chunk, offset); + offset += chunk.length; + } + const msg = pb.AgentSessionMessage.fromBinary(data); + this.enqueue(msg); + } catch (e) { + if (!this.closed) { + log().warn({ error: e }, 'failed to parse binary stream message'); + } + } + } + + override async sendMessage(msg: pb.AgentSessionMessage): Promise { + if (this.closed || !this.room.isConnected) return; + + try { + const data = msg.toBinary(); + const opts: Record = { + topic: TOPIC_SESSION_MESSAGES, + name: shortuuid('AS_'), + }; + const remoteIdentity = this.getRemoteIdentity(); + if (remoteIdentity) { + opts.destinationIdentities = [remoteIdentity]; + } + const writer = await this.room.localParticipant!.streamBytes(opts); + await writer.write(new Uint8Array(data)); + await writer.close(); + } catch (e) { + log().warn({ error: e }, 'failed to send binary stream message'); + } + } + + override async close(): Promise { + if (this.closed) return; + this.closed = true; + + if (this.handlerRegistered) { + try { + this.room.unregisterByteStreamHandler(TOPIC_SESSION_MESSAGES); + } catch (e) { + log().debug({ error: e }, 'byte stream handler already unregistered'); + } + this.handlerRegistered = false; + } + + if (this.waitingResolve) { + this.waitingResolve({ + value: 
undefined as unknown as pb.AgentSessionMessage, + done: true, + }); + this.waitingResolve = null; + } + } + + private enqueue(msg: pb.AgentSessionMessage): void { + if (this.closed) return; + + if (this.waitingResolve) { + const resolve = this.waitingResolve; + this.waitingResolve = null; + resolve({ value: msg, done: false }); + } else { + this.pendingMessages.push(msg); + } + } + + override [Symbol.asyncIterator](): AsyncIterator { + return { + next: (): Promise> => { + if (this.closed && this.pendingMessages.length === 0) { + return Promise.resolve({ + value: undefined as unknown as pb.AgentSessionMessage, + done: true, + }); + } + + const pending = this.pendingMessages.shift(); + if (pending) { + return Promise.resolve({ value: pending, done: false }); + } + + return new Promise>((resolve) => { + this.waitingResolve = resolve; + }); + }, + return: (): Promise> => { + this.close(); + return Promise.resolve({ + value: undefined as unknown as pb.AgentSessionMessage, + done: true, + }); + }, + }; + } +} + +// =========================================================================== +// Enum maps +// =========================================================================== +const AGENT_STATE_MAP: Record = { + initializing: pb.AgentState.AS_INITIALIZING, + idle: pb.AgentState.AS_IDLE, + listening: pb.AgentState.AS_LISTENING, + thinking: pb.AgentState.AS_THINKING, + speaking: pb.AgentState.AS_SPEAKING, +}; + +const USER_STATE_MAP: Record = { + speaking: pb.UserState.US_SPEAKING, + listening: pb.UserState.US_LISTENING, + away: pb.UserState.US_AWAY, +}; + +// =========================================================================== +// Chat item / timestamp conversion helpers +// =========================================================================== +function msToTimestamp(ms: number): Timestamp { + return Timestamp.fromDate(new Date(ms)); +} + +function nowTimestamp(): Timestamp { + return Timestamp.fromDate(new Date()); +} + +function chatItemToProto(item: 
ChatItem): pb.ChatContext_ChatItem { + switch (item.type) { + case 'message': { + const msg = item; + const roleMap: Record = { + developer: pb.ChatRole.DEVELOPER, + system: pb.ChatRole.SYSTEM, + user: pb.ChatRole.USER, + assistant: pb.ChatRole.ASSISTANT, + }; + const content: pb.ChatMessage_ChatContent[] = []; + for (const c of msg.content) { + if (typeof c === 'string') { + content.push(new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: c } })); + } + } + + const metricsReport = new pb.MetricsReport(); + if (msg.metrics.transcriptionDelay !== undefined) + metricsReport.transcriptionDelay = msg.metrics.transcriptionDelay; + if (msg.metrics.endOfTurnDelay !== undefined) + metricsReport.endOfTurnDelay = msg.metrics.endOfTurnDelay; + if (msg.metrics.onUserTurnCompletedDelay !== undefined) + metricsReport.onUserTurnCompletedDelay = msg.metrics.onUserTurnCompletedDelay; + if (msg.metrics.llmNodeTtft !== undefined) + metricsReport.llmNodeTtft = msg.metrics.llmNodeTtft; + if (msg.metrics.ttsNodeTtfb !== undefined) + metricsReport.ttsNodeTtfb = msg.metrics.ttsNodeTtfb; + if (msg.metrics.e2eLatency !== undefined) metricsReport.e2eLatency = msg.metrics.e2eLatency; + + const pbMsg = new pb.ChatMessage({ + id: msg.id, + role: roleMap[msg.role] ?? 
pb.ChatRole.ASSISTANT, + content, + interrupted: msg.interrupted, + metrics: metricsReport, + createdAt: msToTimestamp(msg.createdAt), + }); + if (msg.transcriptConfidence !== undefined) { + pbMsg.transcriptConfidence = msg.transcriptConfidence; + } + return new pb.ChatContext_ChatItem({ item: { case: 'message', value: pbMsg } }); + } + case 'function_call': { + const fc = item; + return new pb.ChatContext_ChatItem({ + item: { + case: 'functionCall', + value: new pb.FunctionCall({ + id: fc.id, + callId: fc.callId, + name: fc.name, + arguments: fc.args, + createdAt: msToTimestamp(fc.createdAt), + }), + }, + }); + } + case 'function_call_output': { + const fco = item; + return new pb.ChatContext_ChatItem({ + item: { + case: 'functionCallOutput', + value: new pb.FunctionCallOutput({ + id: fco.id, + callId: fco.callId, + name: fco.name, + output: fco.output, + isError: fco.isError, + createdAt: msToTimestamp(fco.createdAt), + }), + }, + }); + } + case 'agent_handoff': { + const ah = item; + return new pb.ChatContext_ChatItem({ + item: { + case: 'agentHandoff', + value: new pb.AgentHandoff({ + id: ah.id, + oldAgentId: ah.oldAgentId, + newAgentId: ah.newAgentId, + createdAt: msToTimestamp(ah.createdAt), + }), + }, + }); + } + } +} + +// =========================================================================== +// Usage conversion helpers +// =========================================================================== +function sessionUsageToProto(usage: AgentSessionUsage): pb.AgentSessionUsage { + const modelUsages: pb.ModelUsage[] = []; + for (const mu of usage.modelUsage) { + switch (mu.type) { + case 'llm_usage': { + const lu = mu as Partial; + modelUsages.push( + new pb.ModelUsage({ + usage: { + case: 'llm', + value: new pb.LLMModelUsage({ + provider: lu.provider ?? '', + model: lu.model ?? '', + inputTokens: lu.inputTokens ?? 0, + inputCachedTokens: lu.inputCachedTokens ?? 0, + inputAudioTokens: lu.inputAudioTokens ?? 
0, + inputCachedAudioTokens: lu.inputCachedAudioTokens ?? 0, + inputTextTokens: lu.inputTextTokens ?? 0, + inputCachedTextTokens: lu.inputCachedTextTokens ?? 0, + inputImageTokens: lu.inputImageTokens ?? 0, + inputCachedImageTokens: lu.inputCachedImageTokens ?? 0, + outputTokens: lu.outputTokens ?? 0, + outputAudioTokens: lu.outputAudioTokens ?? 0, + outputTextTokens: lu.outputTextTokens ?? 0, + sessionDuration: (lu.sessionDurationMs ?? 0) / 1000, + }), + }, + }), + ); + break; + } + case 'tts_usage': { + const tu = mu as Partial; + modelUsages.push( + new pb.ModelUsage({ + usage: { + case: 'tts', + value: new pb.TTSModelUsage({ + provider: tu.provider ?? '', + model: tu.model ?? '', + inputTokens: tu.inputTokens ?? 0, + outputTokens: tu.outputTokens ?? 0, + charactersCount: tu.charactersCount ?? 0, + audioDuration: (tu.audioDurationMs ?? 0) / 1000, + }), + }, + }), + ); + break; + } + case 'stt_usage': { + const su = mu as Partial; + modelUsages.push( + new pb.ModelUsage({ + usage: { + case: 'stt', + value: new pb.STTModelUsage({ + provider: su.provider ?? '', + model: su.model ?? '', + inputTokens: su.inputTokens ?? 0, + outputTokens: su.outputTokens ?? 0, + audioDuration: (su.audioDurationMs ?? 0) / 1000, + }), + }, + }), + ); + break; + } + case 'interruption_usage': { + const iu = mu as Partial; + modelUsages.push( + new pb.ModelUsage({ + usage: { + case: 'interruption', + value: new pb.InterruptionModelUsage({ + provider: iu.provider ?? '', + model: iu.model ?? '', + totalRequests: iu.totalRequests ?? 
0, + }), + }, + }), + ); + break; + } + } + } + return new pb.AgentSessionUsage({ modelUsage: modelUsages }); +} + +function toolNames(toolCtx: ToolContext | undefined): string[] { + if (!toolCtx) return []; + return Object.keys(toolCtx); +} + +function protoSerializeOptions(opts: { + turnHandling?: { endpointing?: unknown; interruption?: unknown }; + maxToolSteps?: number; + userAwayTimeout?: number | null; + preemptiveGeneration?: boolean; + useTtsAlignedTranscript?: boolean; +}): Record { + return { + endpointing: JSON.stringify(opts.turnHandling?.endpointing ?? {}), + interruption: JSON.stringify(opts.turnHandling?.interruption ?? {}), + max_tool_steps: String(opts.maxToolSteps ?? 0), + user_away_timeout: String(opts.userAwayTimeout ?? ''), + preemptive_generation: String(opts.preemptiveGeneration ?? false), + use_tts_aligned_transcript: String(opts.useTtsAlignedTranscript ?? false), + }; +} + +// =========================================================================== +// SessionHost (protobuf-based server-side handler) +// =========================================================================== +export class SessionHost { + private readonly transport: SessionTransport; + private session: AgentSession | undefined; + private started = false; + private eventsRegistered = false; + private recvTask: Task | undefined; + private readonly tasks = new Set>(); + private textInputCb: TextInputCallback | undefined; + + constructor(transport: SessionTransport) { + this.transport = transport; + } + + registerSession(session: AgentSession): void { + this.session = session; + if (!this.eventsRegistered) { + this.eventsRegistered = true; + session.on(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged); + session.on(AgentSessionEventTypes.UserStateChanged, this.onUserStateChanged); + session.on(AgentSessionEventTypes.ConversationItemAdded, this.onConversationItemAdded); + session.on(AgentSessionEventTypes.UserInputTranscribed, 
this.onUserInputTranscribed); + session.on(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted); + session.on(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected); + session.on(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech); + session.on(AgentSessionEventTypes.Error, this.onHostError); + } + } + + registerTextInput(textInputCb: TextInputCallback): void { + this.textInputCb = textInputCb; + } + + async start(): Promise { + if (this.started) return; + this.started = true; + await this.transport.start(); + this.recvTask = Task.from(async () => this.recvLoop()); + } + + async close(): Promise { + if (!this.started) return; + this.started = false; + + if (this.session && this.eventsRegistered) { + this.eventsRegistered = false; + this.session.off(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged); + this.session.off(AgentSessionEventTypes.UserStateChanged, this.onUserStateChanged); + this.session.off(AgentSessionEventTypes.ConversationItemAdded, this.onConversationItemAdded); + this.session.off(AgentSessionEventTypes.UserInputTranscribed, this.onUserInputTranscribed); + this.session.off(AgentSessionEventTypes.FunctionToolsExecuted, this.onFunctionToolsExecuted); + this.session.off(AgentSessionEventTypes.MetricsCollected, this.onMetricsCollected); + this.session.off(AgentSessionEventTypes.OverlappingSpeech, this.onOverlappingSpeech); + this.session.off(AgentSessionEventTypes.Error, this.onHostError); + } + + if (this.recvTask) { + this.recvTask.cancel(); + } + + await Promise.allSettled([...this.tasks].map((task) => task.cancelAndWait())); + this.tasks.clear(); + + await this.transport.close(); + } + + private async recvLoop(): Promise { + try { + for await (const msg of this.transport) { + if (msg.message.case === 'request') { + if (this.session) { + this.trackTask( + Task.from(async () => this.handleRequestSafe(msg.message.value as pb.SessionRequest)), + ); + } + } + } + } catch (e) { + if 
(this.started) { + log().warn({ error: e }, 'error processing session message'); + } + } + } + + private sendEvent(event: pb.AgentSessionEvent): void { + const msg = new pb.AgentSessionMessage({ + message: { case: 'event', value: event }, + }); + this.trackTask(Task.from(async () => this.transport.sendMessage(msg))); + } + + private emitEvent( + event: Event, + createdAt?: number, + ): void { + this.sendEvent( + new pb.AgentSessionEvent({ + createdAt: createdAt ? msToTimestamp(createdAt) : nowTimestamp(), + event: event, + }), + ); + } + + private onAgentStateChanged = (event: AgentStateChangedEvent): void => { + this.emitEvent( + { + case: 'agentStateChanged', + value: new pb.AgentSessionEvent_AgentStateChanged({ + oldState: AGENT_STATE_MAP[event.oldState], + newState: AGENT_STATE_MAP[event.newState], + }), + }, + event.createdAt, + ); + }; + + private onUserStateChanged = (event: UserStateChangedEvent): void => { + this.emitEvent( + { + case: 'userStateChanged', + value: new pb.AgentSessionEvent_UserStateChanged({ + oldState: USER_STATE_MAP[event.oldState], + newState: USER_STATE_MAP[event.newState], + }), + }, + event.createdAt, + ); + }; + + private onUserInputTranscribed = (event: UserInputTranscribedEvent): void => { + this.emitEvent( + { + case: 'userInputTranscribed', + value: new pb.AgentSessionEvent_UserInputTranscribed({ + transcript: event.transcript, + isFinal: event.isFinal, + }), + }, + event.createdAt, + ); + }; + + private onConversationItemAdded = (event: ConversationItemAddedEvent): void => { + this.emitEvent( + { + case: 'conversationItemAdded', + value: new pb.AgentSessionEvent_ConversationItemAdded({ + item: chatItemToProto(event.item), + }), + }, + event.createdAt, + ); + }; + + private onFunctionToolsExecuted = (event: FunctionToolsExecutedEvent): void => { + const pbCalls = event.functionCalls.map( + (fc: FCItem) => new pb.FunctionCall({ name: fc.name, arguments: fc.args, callId: fc.callId }), + ); + const pbOutputs = 
event.functionCallOutputs + .filter((fco): fco is FCOItem => fco != null) + .map( + (fco: FCOItem) => + new pb.FunctionCallOutput({ + callId: fco.callId, + output: fco.output, + isError: fco.isError, + }), + ); + this.emitEvent( + { + case: 'functionToolsExecuted', + value: new pb.AgentSessionEvent_FunctionToolsExecuted({ + functionCalls: pbCalls, + functionCallOutputs: pbOutputs, + }), + }, + event.createdAt, + ); + }; + + private onOverlappingSpeech = (event: OverlappingSpeechEvent): void => { + const value = new pb.AgentSessionEvent_OverlappingSpeech({ + isInterruption: event.isInterruption, + detectionDelay: event.detectionDelayInS, + detectedAt: msToTimestamp(event.detectedAt), + }); + if (event.overlapStartedAt != null) { + value.overlapStartedAt = msToTimestamp(event.overlapStartedAt); + } + this.emitEvent({ case: 'overlappingSpeech', value }); + }; + + private onMetricsCollected = (event: MetricsCollectedEvent): void => { + if (!this.session) return; + this.emitEvent( + { + case: 'sessionUsageUpdated', + value: new pb.AgentSessionEvent_SessionUsageUpdated({ + usage: sessionUsageToProto(this.session.usage), + }), + }, + event.createdAt, + ); + }; + + private onHostError = (event: ErrorEvent): void => { + this.emitEvent( + { + case: 'error', + value: new pb.AgentSessionEvent_Error({ + message: event.error ? 
String(event.error) : 'Unknown error', + }), + }, + event.createdAt, + ); + }; + + private async handleRequestSafe(req: pb.SessionRequest): Promise { + try { + await this.handleRequest(req); + } catch (e) { + log().warn({ error: e, requestId: req.requestId }, 'error handling session request'); + try { + const resp = new pb.AgentSessionMessage({ + message: { + case: 'response', + value: new pb.SessionResponse({ + requestId: req.requestId, + error: 'internal error', + }), + }, + }); + await this.transport.sendMessage(resp); + } catch (e) { + log().debug({ error: e }, 'failed to send error response'); + } + } + } + + private async handleRequest(req: pb.SessionRequest): Promise { + if (!this.session) return; + + switch (req.request.case) { + case 'ping': + return this.sendResponse(req.requestId, { + case: 'pong', + value: new pb.SessionResponse_Pong(), + }); + case 'getChatHistory': + return this.handleGetChatHistory(req.requestId); + case 'getAgentInfo': + return this.handleGetAgentInfo(req.requestId); + case 'runInput': + return this.handleRunInput(req.requestId, req.request.value); + case 'getSessionState': + return this.handleGetSessionState(req.requestId); + case 'getRtcStats': + return this.sendResponse(req.requestId, { + case: 'getRtcStats', + value: new pb.SessionResponse_GetRTCStatsResponse({ + publisherStats: [], + subscriberStats: [], + }), + }); + case 'getSessionUsage': + return this.handleGetSessionUsage(req.requestId); + } + } + + private async handleGetChatHistory(requestId: string): Promise { + const items = this.session!.history.items.map(chatItemToProto); + return this.sendResponse(requestId, { + case: 'getChatHistory', + value: new pb.SessionResponse_GetChatHistoryResponse({ items }), + }); + } + + private async handleGetAgentInfo(requestId: string): Promise { + const agent = this.session!.currentAgent; + return this.sendResponse(requestId, { + case: 'getAgentInfo', + value: new pb.SessionResponse_GetAgentInfoResponse({ + id: agent.id, + 
instructions: agent.instructions, + tools: toolNames(agent.toolCtx), + chatCtx: agent.chatCtx.items.map(chatItemToProto), + }), + }); + } + + private async handleRunInput( + requestId: string, + input: pb.SessionRequest_RunInput, + ): Promise { + const text = input.text; + let items: pb.ChatContext_ChatItem[] = []; + let error: string | undefined; + + if (text) { + if (this.textInputCb) { + const cbResult = this.textInputCb(this.session!, { text }); + if (cbResult instanceof Promise) { + await cbResult; + } + } else { + try { + await this.session!.interrupt({ force: true }).await; + } catch { + // ignore + } + + const result = this.session!.run({ userInput: text }); + try { + await result.wait(); + } catch (e) { + error = e instanceof Error ? e.message : String(e); + } + items = result.events.map((ev) => chatItemToProto(ev.item)); + } + } + + return this.sendResponse( + requestId, + { + case: 'runInput', + value: new pb.SessionResponse_RunInputResponse({ items }), + }, + error, + ); + } + + private async handleGetSessionState(requestId: string): Promise { + const agent = this.session!.currentAgent; + const startedAt = this.session!._startedAt ?? 
Date.now(); + return this.sendResponse(requestId, { + case: 'getSessionState', + value: new pb.SessionResponse_GetSessionStateResponse({ + agentState: AGENT_STATE_MAP[this.session!.agentState], + userState: USER_STATE_MAP[this.session!.userState], + agentId: agent.id, + options: protoSerializeOptions({ + turnHandling: this.session!.sessionOptions.turnHandling, + maxToolSteps: this.session!.sessionOptions.maxToolSteps, + userAwayTimeout: this.session!.sessionOptions.userAwayTimeout, + preemptiveGeneration: this.session!.sessionOptions.preemptiveGeneration, + useTtsAlignedTranscript: this.session!.sessionOptions.useTtsAlignedTranscript, + }), + createdAt: msToTimestamp(startedAt), + }), + }); + } + + private async handleGetSessionUsage(requestId: string): Promise { + return this.sendResponse(requestId, { + case: 'getSessionUsage', + value: new pb.SessionResponse_GetSessionUsageResponse({ + usage: sessionUsageToProto(this.session!.usage), + createdAt: nowTimestamp(), + }), + }); + } + + private async sendResponse( + requestId: string, + response: pb.SessionResponse['response'], + error?: string, + ): Promise { + await this.transport.sendMessage( + new pb.AgentSessionMessage({ + message: { + case: 'response', + value: new pb.SessionResponse({ requestId, response, error }), + }, + }), + ); + } + + private trackTask(task: Task): void { + this.tasks.add(task); + task.addDoneCallback(() => { + this.tasks.delete(task); + }); + } +} + +// =========================================================================== +// RemoteSession (protobuf-based client-side interface) +// =========================================================================== + +/** @experimental */ +export class RemoteSession extends (EventEmitter as new () => TypedEventEmitter) { + private readonly transport: SessionTransport; + private started = false; + + private readonly tasks = new Set>(); + private readonly pendingRequests = new Map>(); + private recvTask: Task | undefined; + private readonly 
_logger = log(); + + constructor(transport: SessionTransport) { + super(); + this.transport = transport; + } + + static fromRoom(room: Room, roomIO: RoomIO): RemoteSession { + const transport = new RoomSessionTransport(room, roomIO); + return new RemoteSession(transport); + } + + async start(): Promise { + if (this.started) return; + this.started = true; + await this.transport.start(); + this.recvTask = Task.from(async () => this.recvLoop()); + } + + async close(): Promise { + if (!this.started) return; + this.started = false; + + if (this.recvTask) { + this.recvTask.cancel(); + } + + for (const pending of this.pendingRequests.values()) { + pending.reject(new Error('RemoteSession closed')); + } + this.pendingRequests.clear(); + + for (const task of this.tasks) { + task.cancel(); + } + this.tasks.clear(); + + await this.transport.close(); + } + + private async recvLoop(): Promise { + try { + for await (const msg of this.transport) { + switch (msg.message.case) { + case 'event': + this.dispatchEvent(msg.message.value); + break; + case 'response': + this.dispatchResponse(msg.message.value); + break; + } + } + } catch (e) { + if (this.started) { + this._logger.warn({ error: e }, 'error in RemoteSession recv loop'); + } + } + } + + private dispatchEvent(event: pb.AgentSessionEvent): void { + const ev = event.event; + switch (ev.case) { + case 'agentStateChanged': + this.emit('agent_state_changed', ev.value); + break; + case 'userStateChanged': + this.emit('user_state_changed', ev.value); + break; + case 'userInputTranscribed': + this.emit('user_input_transcribed', ev.value); + break; + case 'conversationItemAdded': + this.emit('conversation_item_added', ev.value); + break; + case 'functionToolsExecuted': + this.emit('function_tools_executed', ev.value); + break; + case 'overlappingSpeech': + this.emit('overlapping_speech', ev.value); + break; + case 'sessionUsageUpdated': + this.emit('session_usage', ev.value); + break; + case 'error': + this.emit('error', ev.value); + 
break; + } + } + + private dispatchResponse(response: pb.SessionResponse): void { + const future = this.pendingRequests.get(response.requestId); + this.pendingRequests.delete(response.requestId); + if (future && !future.done) { + future.resolve(response); + } + } + + private async sendRequest( + buildReq: (requestId: string) => pb.SessionRequest, + timeout = 60000, + ): Promise { + const requestId = shortuuid('req_'); + const req = buildReq(requestId); + req.requestId = requestId; + + const future = new Future(); + this.pendingRequests.set(requestId, future); + + const msg = new pb.AgentSessionMessage({ + message: { case: 'request', value: req }, + }); + await this.transport.sendMessage(msg); + + const timer = setTimeout(() => { + if (!future.done) { + this.pendingRequests.delete(requestId); + future.reject(new Error('RemoteSession request timed out')); + } + }, timeout); + + try { + const response = await future.await; + if (response.error) { + throw new Error(response.error); + } + return response; + } finally { + clearTimeout(timer); + } + } + + async fetchSessionState(): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'getSessionState', value: new pb.SessionRequest_GetSessionState() }, + }), + ); + if (resp.response.case !== 'getSessionState') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + async fetchChatHistory(): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'getChatHistory', value: new pb.SessionRequest_GetChatHistory() }, + }), + ); + if (resp.response.case !== 'getChatHistory') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + async fetchAgentInfo(): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'getAgentInfo', value: new pb.SessionRequest_GetAgentInfo() 
}, + }), + ); + if (resp.response.case !== 'getAgentInfo') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + async sendMessage( + text: string, + responseTimeout = 60000, + ): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'runInput', value: new pb.SessionRequest_RunInput({ text }) }, + }), + responseTimeout, + ); + if (resp.response.case !== 'runInput') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + async fetchRtcStats(): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'getRtcStats', value: new pb.SessionRequest_GetRTCStats() }, + }), + ); + if (resp.response.case !== 'getRtcStats') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + async fetchSessionUsage(): Promise { + const resp = await this.sendRequest( + (id) => + new pb.SessionRequest({ + requestId: id, + request: { case: 'getSessionUsage', value: new pb.SessionRequest_GetSessionUsage() }, + }), + ); + if (resp.response.case !== 'getSessionUsage') { + throw new Error('unexpected response type'); + } + return resp.response.value; + } + + private trackTask(task: Task): void { + this.tasks.add(task); + task.addDoneCallback(() => { + this.tasks.delete(task); + }); + } +} diff --git a/agents/src/voice/report.test.ts b/agents/src/voice/report.test.ts new file mode 100644 index 000000000..a774dfd40 --- /dev/null +++ b/agents/src/voice/report.test.ts @@ -0,0 +1,136 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import type { AgentSessionOptions, VoiceOptions } from './agent_session.js'; +import { createSessionReport, sessionReportToJSON } from './report.js'; + +type ReportOptions = AgentSessionOptions & Partial; + +function baseOptions(): ReportOptions { + return { + maxToolSteps: 3, + preemptiveGeneration: false, + userAwayTimeout: 15, + useTtsAlignedTranscript: true, + turnHandling: {}, + }; +} + +function serializeOptions(options: ReportOptions) { + const report = createSessionReport({ + jobId: 'job', + roomId: 'room-id', + room: 'room', + options, + events: [], + chatHistory: ChatContext.empty(), + enableRecording: false, + timestamp: 0, + startedAt: 0, + }); + + const payload = sessionReportToJSON(report); + return payload.options as Record; +} + +describe('sessionReportToJSON', () => { + it('serializes interruption and endpointing values from turnHandling', () => { + const options = baseOptions(); + options.turnHandling = { + interruption: { + mode: 'adaptive', + discardAudioIfUninterruptible: false, + minDuration: 1200, + minWords: 2, + }, + endpointing: { + minDelay: 900, + maxDelay: 4500, + }, + }; + + const serialized = serializeOptions(options); + expect(serialized).toMatchObject({ + allow_interruptions: true, + discard_audio_if_uninterruptible: false, + min_interruption_duration: 1200, + min_interruption_words: 2, + min_endpointing_delay: 900, + max_endpointing_delay: 4500, + max_tool_steps: 3, + }); + }); + + it('prefers turnHandling values over deprecated flat fields', () => { + const options = baseOptions(); + options.allowInterruptions = false; + options.discardAudioIfUninterruptible = true; + options.minInterruptionDuration = 400; + options.minInterruptionWords = 1; + options.minEndpointingDelay = 500; + options.maxEndpointingDelay = 2500; + options.turnHandling = { + interruption: { + mode: 'vad', + 
discardAudioIfUninterruptible: false, + minDuration: 1400, + minWords: 4, + }, + endpointing: { + minDelay: 700, + maxDelay: 3900, + }, + }; + + const serialized = serializeOptions(options); + expect(serialized).toMatchObject({ + allow_interruptions: true, + discard_audio_if_uninterruptible: false, + min_interruption_duration: 1400, + min_interruption_words: 4, + min_endpointing_delay: 700, + max_endpointing_delay: 3900, + max_tool_steps: 3, + }); + }); + + it('serializes allow_interruptions from interruption.enabled when present', () => { + const options = baseOptions(); + options.allowInterruptions = true; + options.turnHandling = { + interruption: { + enabled: false, + mode: 'adaptive', + }, + }; + + const serialized = serializeOptions(options); + expect(serialized).toMatchObject({ + allow_interruptions: false, + max_tool_steps: 3, + }); + }); + + it('falls back to deprecated flat fields when turnHandling values are absent', () => { + const options = baseOptions(); + options.allowInterruptions = false; + options.discardAudioIfUninterruptible = false; + options.minInterruptionDuration = 600; + options.minInterruptionWords = 3; + options.minEndpointingDelay = 1000; + options.maxEndpointingDelay = 5000; + + const serialized = serializeOptions(options); + expect(serialized).toMatchObject({ + allow_interruptions: false, + discard_audio_if_uninterruptible: false, + min_interruption_duration: 600, + min_interruption_words: 3, + min_endpointing_delay: 1000, + max_endpointing_delay: 5000, + max_tool_steps: 3, + }); + }); +}); diff --git a/agents/src/voice/report.ts b/agents/src/voice/report.ts index 49701a696..bf3fb1474 100644 --- a/agents/src/voice/report.ts +++ b/agents/src/voice/report.ts @@ -2,14 +2,17 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext } from '../llm/chat_context.js'; -import type { VoiceOptions } from './agent_session.js'; +import { type ModelUsage, filterZeroValues } from '../metrics/model_usage.js'; +import type { 
AgentSessionOptions, VoiceOptions } from './agent_session.js'; import type { AgentEvent } from './events.js'; +type ReportOptions = AgentSessionOptions & Partial; + export interface SessionReport { jobId: string; roomId: string; room: string; - options: VoiceOptions; + options: ReportOptions; events: AgentEvent[]; chatHistory: ChatContext; enableRecording: boolean; @@ -23,13 +26,15 @@ export interface SessionReport { audioRecordingStartedAt?: number; /** Duration of the session in milliseconds */ duration?: number; + /** Usage summaries for the session, one per model/provider combination */ + modelUsage?: ModelUsage[]; } export interface SessionReportOptions { jobId: string; roomId: string; room: string; - options: VoiceOptions; + options: ReportOptions; events: AgentEvent[]; chatHistory: ChatContext; enableRecording?: boolean; @@ -41,6 +46,8 @@ export interface SessionReportOptions { audioRecordingPath?: string; /** Timestamp when the audio recording started (milliseconds) */ audioRecordingStartedAt?: number; + /** Usage summaries for the session, one per model/provider combination */ + modelUsage?: ModelUsage[]; } export function createSessionReport(opts: SessionReportOptions): SessionReport { @@ -61,6 +68,7 @@ export function createSessionReport(opts: SessionReportOptions): SessionReport { audioRecordingStartedAt, duration: audioRecordingStartedAt !== undefined ? 
timestamp - audioRecordingStartedAt : undefined, + modelUsage: opts.modelUsage, }; } @@ -70,6 +78,37 @@ export function createSessionReport(opts: SessionReportOptions): SessionReport { // - Uploads to LiveKit Cloud observability endpoint with JWT auth export function sessionReportToJSON(report: SessionReport): Record { const events: Record[] = []; + const options = report.options; + const interruptionConfig = options.turnHandling?.interruption; + const endpointingConfig = options.turnHandling?.endpointing; + + // Keep backwards compatibility with deprecated fields + const allowInterruptions = + interruptionConfig?.enabled !== undefined + ? interruptionConfig.enabled + : interruptionConfig?.mode !== undefined + ? true + : options.allowInterruptions ?? options.voiceOptions?.allowInterruptions; + const discardAudioIfUninterruptible = + interruptionConfig?.discardAudioIfUninterruptible ?? + options.discardAudioIfUninterruptible ?? + options.voiceOptions?.discardAudioIfUninterruptible; + const minInterruptionDuration = + interruptionConfig?.minDuration ?? + options.minInterruptionDuration ?? + options.voiceOptions?.minInterruptionDuration; + const minInterruptionWords = + interruptionConfig?.minWords ?? + options.minInterruptionWords ?? + options.voiceOptions?.minInterruptionWords; + const minEndpointingDelay = + endpointingConfig?.minDelay ?? + options.minEndpointingDelay ?? + options.voiceOptions?.minEndpointingDelay; + const maxEndpointingDelay = + endpointingConfig?.maxDelay ?? + options.maxEndpointingDelay ?? 
+ options.voiceOptions?.maxEndpointingDelay; for (const event of report.events) { if (event.type === 'metrics_collected') { @@ -85,16 +124,17 @@ export function sessionReportToJSON(report: SessionReport): Record void | Promise; - -const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = (sess: AgentSession, ev: TextInputEvent) => { +export const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = (sess, ev) => { sess.interrupt(); sess.generateReply({ userInput: ev.text }); }; @@ -136,6 +128,7 @@ export class RoomIO { private agentTranscriptOutput?: ParalellTextOutput; private transcriptionSynchronizer?: TranscriptionSynchronizer; private participantIdentity: string | null = null; + private textStreamHandlerRegistered = false; private participantAvailableFuture: Future = new Future(); private roomConnectedFuture: Future = new Future(); @@ -146,8 +139,6 @@ export class RoomIO { private forwardUserTranscriptTask?: Task; private initTask?: Task; - private textStreamHandlerRegistered = false; - private logger = log(); constructor({ @@ -283,7 +274,7 @@ export class RoomIO { }; private onUserTextInput = (reader: TextStreamReader, participantInfo: { identity: string }) => { - if (participantInfo.identity !== this.participantIdentity) { + if (this.participantIdentity && participantInfo.identity !== this.participantIdentity) { return; } @@ -299,7 +290,7 @@ export class RoomIO { const textInputResult = this.inputOptions.textInputCallback!(this.agentSession, { text, info: reader.info, - participant, + participantIdentity: participantInfo.identity, }); // check if callback is a Promise @@ -387,6 +378,10 @@ export class RoomIO { return this.participantAvailableFuture.done; } + get rtcRoom(): Room { + return this.room; + } + get linkedParticipant(): RemoteParticipant | undefined { if (!this.isParticipantAvailable) { return undefined; @@ -439,6 +434,8 @@ export class RoomIO { } start() { + // -- create inputs -- + if (this.inputOptions.textEnabled) { try { 
this.room.registerTextStreamHandler(TOPIC_CHAT, this.onUserTextInput); @@ -450,7 +447,6 @@ export class RoomIO { } } - // -- create inputs -- if (this.inputOptions.audioEnabled) { this.audioInput = new ParticipantAudioInputStream({ room: this.room, diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts new file mode 100644 index 000000000..f2603e00f --- /dev/null +++ b/agents/src/voice/turn_config/endpointing.ts @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +/** + * Configuration for endpointing, which determines when the user's turn is complete. + */ +export interface EndpointingOptions { + /** + * Endpointing mode. `"fixed"` uses a fixed delay, `"dynamic"` adjusts delay based on + * end-of-utterance prediction. + * @defaultValue "fixed" + */ + mode: 'fixed' | 'dynamic'; + /** + * Minimum time in milliseconds since the last detected speech before the agent declares the user's + * turn complete. In VAD mode this effectively behaves like `max(VAD silence, minDelay)`; + * in STT mode it is applied after the STT end-of-speech signal, so it can be additive with + * the STT provider's endpointing delay. + * @defaultValue 500 + */ + minDelay: number; + /** + * Maximum time in milliseconds the agent will wait before terminating the turn. + * @defaultValue 3000 + */ + maxDelay: number; +} + +export const defaultEndpointingOptions = { + mode: 'fixed', + minDelay: 500, + maxDelay: 3000, +} as const satisfies EndpointingOptions; diff --git a/agents/src/voice/turn_config/interruption.ts b/agents/src/voice/turn_config/interruption.ts new file mode 100644 index 000000000..63cf92c9d --- /dev/null +++ b/agents/src/voice/turn_config/interruption.ts @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +/** + * Configuration for interruption handling. 
+ */ +export interface InterruptionOptions { + /** + * Whether interruptions are enabled. + * @defaultValue true + */ + enabled: boolean; + /** + * Interruption handling strategy. `"adaptive"` for ML-based detection, `"vad"` for simple + * voice-activity detection. `undefined` means auto-detect. + * @defaultValue undefined + */ + mode: 'adaptive' | 'vad' | undefined; + /** + * When `true`, buffered audio is dropped while the agent is speaking and cannot be interrupted. + * @defaultValue true + */ + discardAudioIfUninterruptible: boolean; + /** + * Minimum speech length in milliseconds to register as an interruption. + * @defaultValue 500 + */ + minDuration: number; + /** + * Minimum number of words to consider an interruption, only used if STT is enabled. + * @defaultValue 0 + */ + minWords: number; + /** + * If set, emit an `agentFalseInterruption` event after this amount of time if the user is + * silent and no user transcript is detected after the interruption. Set to `undefined` to + * disable. The value is in milliseconds. + * @defaultValue 2000 + */ + falseInterruptionTimeout: number; + /** + * Whether to resume the false interruption after the `falseInterruptionTimeout`. + * @defaultValue true + */ + resumeFalseInterruption: boolean; +} + +export const defaultInterruptionOptions = { + enabled: true, + mode: undefined, + discardAudioIfUninterruptible: true, + minDuration: 500, + minWords: 0, + falseInterruptionTimeout: 2000, + resumeFalseInterruption: true, +} as const satisfies InterruptionOptions; diff --git a/agents/src/voice/turn_config/turn_handling.ts b/agents/src/voice/turn_config/turn_handling.ts new file mode 100644 index 000000000..1458fb663 --- /dev/null +++ b/agents/src/voice/turn_config/turn_handling.ts @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import type { TurnDetectionMode } from '../agent_session.js'; +import { type EndpointingOptions, defaultEndpointingOptions } from './endpointing.js'; +import { type InterruptionOptions, defaultInterruptionOptions } from './interruption.js'; + +/** + * Configuration for the turn handling system. Used to configure the turn taking behavior of the + * session. + */ +export interface TurnHandlingOptions { + /** + * Strategy for deciding when the user has finished speaking. + * + * - `"stt"` – rely on speech-to-text end-of-utterance cues + * - `"vad"` – rely on Voice Activity Detection start/stop cues + * - `"realtime_llm"` – use server-side detection from a realtime LLM + * - `"manual"` – caller controls turn boundaries explicitly + * + * If not set, the session chooses the best available mode in priority order + * `realtime_llm → vad → stt → manual`; it automatically falls back if the necessary model + * is missing. + */ + turnDetection: TurnDetectionMode | undefined; + /** + * Configuration for endpointing. + */ + endpointing: Partial; + /** + * Configuration for interruption handling. + */ + interruption: Partial; +} + +export interface InternalTurnHandlingOptions extends TurnHandlingOptions { + endpointing: EndpointingOptions; + interruption: InterruptionOptions; +} + +export const defaultTurnHandlingOptions: InternalTurnHandlingOptions = { + turnDetection: undefined, + interruption: defaultInterruptionOptions, + endpointing: defaultEndpointingOptions, +}; diff --git a/agents/src/voice/turn_config/utils.test.ts b/agents/src/voice/turn_config/utils.test.ts new file mode 100644 index 000000000..96bfd6972 --- /dev/null +++ b/agents/src/voice/turn_config/utils.test.ts @@ -0,0 +1,148 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { beforeAll, describe, expect, it } from 'vitest'; +import { initializeLogger } from '../../log.js'; +import { defaultAgentSessionOptions } from '../agent_session.js'; +import { defaultEndpointingOptions } from './endpointing.js'; +import { defaultInterruptionOptions } from './interruption.js'; +import { defaultTurnHandlingOptions } from './turn_handling.js'; +import { migrateLegacyOptions, migrateTurnHandling } from './utils.js'; + +beforeAll(() => { + initializeLogger({ pretty: true, level: 'info' }); +}); + +describe('migrateLegacyOptions', () => { + it('should return all defaults when no options are provided', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({}); + + expect(result.turnHandling).toEqual({ + turnDetection: defaultTurnHandlingOptions.turnDetection, + endpointing: defaultEndpointingOptions, + interruption: defaultInterruptionOptions, + }); + expect(result.maxToolSteps).toBe(defaultAgentSessionOptions.maxToolSteps); + expect(result.preemptiveGeneration).toBe(defaultAgentSessionOptions.preemptiveGeneration); + expect(result.userAwayTimeout).toBe(defaultAgentSessionOptions.userAwayTimeout); + }); + + it('should migrate legacy flat fields into nested turnHandling config', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + voiceOptions: { + minInterruptionDuration: 1000, + minInterruptionWords: 3, + discardAudioIfUninterruptible: false, + minEndpointingDelay: 800, + maxEndpointingDelay: 5000, + }, + }); + + expect(result.turnHandling.interruption!.minDuration).toBe(1000); + expect(result.turnHandling.interruption!.minWords).toBe(3); + expect(result.turnHandling.interruption!.discardAudioIfUninterruptible).toBe(false); + expect(result.turnHandling.endpointing!.minDelay).toBe(800); + expect(result.turnHandling.endpointing!.maxDelay).toBe(5000); + }); + + it('should set interruption.enabled to false when allowInterruptions is false', () => { + const { 
agentSessionOptions: result } = migrateLegacyOptions({ + voiceOptions: { allowInterruptions: false }, + }); + + expect(result.turnHandling.interruption!.enabled).toBe(false); + }); + + it('should give top-level fields precedence over voiceOptions', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + voiceOptions: { + minInterruptionDuration: 1000, + maxEndpointingDelay: 5000, + maxToolSteps: 10, + }, + turnHandling: { + interruption: { + minDuration: 2000, + }, + endpointing: { + maxDelay: 8000, + }, + }, + maxToolSteps: 5, + }); + + expect(result.turnHandling.interruption!.minDuration).toBe(2000); + expect(result.turnHandling.endpointing!.maxDelay).toBe(8000); + expect(result.maxToolSteps).toBe(5); + }); + + it('should preserve top-level turnDetection in the result', () => { + const { agentSessionOptions: result } = migrateLegacyOptions({ + turnDetection: 'vad', + }); + + expect(result.turnHandling.turnDetection).toBe('vad'); + }); +}); + +describe('migrateTurnHandling', () => { + it('should return empty partial when no deprecated Agent fields are given', () => { + const result = migrateTurnHandling({}); + expect(result).toEqual({}); + }); + + it('should set interruption.enabled to false when allowInterruptions is false', () => { + const result = migrateTurnHandling({ allowInterruptions: false }); + expect(result.interruption).toEqual({ enabled: false }); + expect(result.endpointing).toBeUndefined(); + expect(result.turnDetection).toBeUndefined(); + }); + + it('should not set interruption when allowInterruptions is true or undefined', () => { + expect(migrateTurnHandling({ allowInterruptions: true })).toEqual({}); + expect(migrateTurnHandling({ allowInterruptions: undefined })).toEqual({}); + }); + + it('should map minEndpointingDelay to endpointing.minDelay', () => { + const result = migrateTurnHandling({ minEndpointingDelay: 800 }); + expect(result.endpointing).toEqual({ minDelay: 800 }); + }); + + it('should map maxEndpointingDelay to 
endpointing.maxDelay', () => { + const result = migrateTurnHandling({ maxEndpointingDelay: 5000 }); + expect(result.endpointing).toEqual({ maxDelay: 5000 }); + }); + + it('should pass through turnDetection', () => { + const result = migrateTurnHandling({ turnDetection: 'vad' }); + expect(result.turnDetection).toBe('vad'); + }); + + it('should combine all deprecated Agent fields', () => { + const result = migrateTurnHandling({ + turnDetection: 'stt', + allowInterruptions: false, + minEndpointingDelay: 400, + maxEndpointingDelay: 3000, + }); + expect(result.turnDetection).toBe('stt'); + expect(result.interruption).toEqual({ enabled: false }); + expect(result.endpointing).toEqual({ minDelay: 400, maxDelay: 3000 }); + }); + + it('should ignore deprecated Agent fields when explicit turnHandling is provided', () => { + const turnHandling = { + endpointing: { minDelay: 999, maxDelay: 4000 }, + interruption: { enabled: true }, + turnDetection: 'vad' as const, + }; + const result = migrateTurnHandling({ + turnHandling, + turnDetection: 'stt', + allowInterruptions: false, + minEndpointingDelay: 100, + maxEndpointingDelay: 200, + }); + expect(result).toEqual(turnHandling); + }); +}); diff --git a/agents/src/voice/turn_config/utils.ts b/agents/src/voice/turn_config/utils.ts new file mode 100644 index 000000000..59022e58a --- /dev/null +++ b/agents/src/voice/turn_config/utils.ts @@ -0,0 +1,167 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { log } from '../../log.js'; +import { + type AgentSessionOptions, + type InternalSessionOptions, + type TurnDetectionMode, + type VoiceOptions, +} from '../agent_session.js'; +import { defaultEndpointingOptions } from './endpointing.js'; +import { defaultInterruptionOptions } from './interruption.js'; +import { type TurnHandlingOptions, defaultTurnHandlingOptions } from './turn_handling.js'; + +const defaultSessionOptions = { + maxToolSteps: 3, + preemptiveGeneration: true, + userAwayTimeout: 15.0, + aecWarmupDuration: 3000, + turnHandling: {}, + useTtsAlignedTranscript: true, +} as const satisfies AgentSessionOptions; + +const defaultLegacyVoiceOptions: VoiceOptions = { + minEndpointingDelay: defaultTurnHandlingOptions.endpointing.minDelay, + maxEndpointingDelay: defaultTurnHandlingOptions.endpointing.maxDelay, + maxToolSteps: defaultSessionOptions.maxToolSteps, + preemptiveGeneration: defaultSessionOptions.preemptiveGeneration, +}; + +export function migrateLegacyOptions(legacyOptions: AgentSessionOptions): { + agentSessionOptions: InternalSessionOptions; + legacyVoiceOptions: VoiceOptions; +} { + const logger = log(); + const { + voiceOptions, + turnDetection, + stt, + vad, + llm, + tts, + userData, + connOptions, + ...sessionOptions + } = legacyOptions; + + if (voiceOptions !== undefined) { + logger.warn( + 'voiceOptions is deprecated, use top-level SessionOptions fields on AgentSessionOptions instead', + ); + } + + const turnHandling: TurnHandlingOptions = { + interruption: { + discardAudioIfUninterruptible: voiceOptions?.discardAudioIfUninterruptible, + minDuration: voiceOptions?.minInterruptionDuration, + minWords: voiceOptions?.minInterruptionWords, + ...sessionOptions.turnHandling?.interruption, + }, + endpointing: { + minDelay: voiceOptions?.minEndpointingDelay, + maxDelay: voiceOptions?.maxEndpointingDelay, + ...sessionOptions.turnHandling?.endpointing, + }, + + turnDetection: 
sessionOptions?.turnHandling?.turnDetection ?? turnDetection, + } as const; + + if ( + voiceOptions?.allowInterruptions === false && + turnHandling.interruption.enabled === undefined + ) { + turnHandling.interruption.enabled = false; + } + + const migratedVoiceOptions: AgentSessionOptions = {}; + + if (voiceOptions?.maxToolSteps !== undefined) { + migratedVoiceOptions.maxToolSteps = voiceOptions.maxToolSteps; + } + if (voiceOptions?.preemptiveGeneration !== undefined) { + migratedVoiceOptions.preemptiveGeneration = voiceOptions.preemptiveGeneration; + } + if (voiceOptions?.userAwayTimeout !== undefined) { + migratedVoiceOptions.userAwayTimeout = voiceOptions.userAwayTimeout; + } + + const legacyVoiceOptions = { ...defaultLegacyVoiceOptions, ...voiceOptions }; + + const agentSessionOptions = { + stt, + vad, + llm, + tts, + userData, + connOptions, + ...defaultSessionOptions, + ...migratedVoiceOptions, + ...sessionOptions, + turnHandling: mergeWithDefaults(turnHandling), + // repopulate the deprecated voice options with migrated options for backwards compatibility + voiceOptions: legacyVoiceOptions, + }; + + return { agentSessionOptions, legacyVoiceOptions }; +} + +/** Remove keys whose value is `undefined` so they don't shadow defaults when spread. */ +export function stripUndefined(obj: T): Partial { + return Object.fromEntries(Object.entries(obj).filter(([, v]) => v !== undefined)) as Partial; +} + +export function mergeWithDefaults(config: TurnHandlingOptions) { + return { + turnDetection: config.turnDetection ?? defaultTurnHandlingOptions.turnDetection, + endpointing: { ...defaultEndpointingOptions, ...stripUndefined(config.endpointing) }, + interruption: { ...defaultInterruptionOptions, ...stripUndefined(config.interruption) }, + } as const; +} + +/** + * Build a partial {@link TurnHandlingOptions} from deprecated Agent constructor fields. + * Mirrors the Python Agent compatibility path, but keeps the JS API surface explicit. 
+ */ +export function migrateTurnHandling(opts: { + turnDetection?: TurnDetectionMode; + allowInterruptions?: boolean; + minEndpointingDelay?: number; + maxEndpointingDelay?: number; + turnHandling?: TurnHandlingOptions; +}): Partial { + if (opts.turnHandling !== undefined) { + return opts.turnHandling; + } + + const migrated: Partial = {}; + + const endpointing: Partial = {}; + if (opts.minEndpointingDelay !== undefined) { + endpointing.minDelay = opts.minEndpointingDelay; + } + if (opts.maxEndpointingDelay !== undefined) { + endpointing.maxDelay = opts.maxEndpointingDelay; + } + if (Object.keys(endpointing).length > 0) { + migrated.endpointing = endpointing; + } + + const interruption: Partial = {}; + if (opts.allowInterruptions === false) { + interruption.enabled = false; + } + if (Object.keys(interruption).length > 0) { + migrated.interruption = interruption; + } + + if (opts.turnDetection !== undefined) { + migrated.turnDetection = opts.turnDetection; + } + + return { + ...(migrated.endpointing ? { endpointing: migrated.endpointing } : {}), + ...(migrated.interruption ? { interruption: migrated.interruption } : {}), + ...(migrated.turnDetection !== undefined ? { turnDetection: migrated.turnDetection } : {}), + }; +} diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index ac2512b2c..b85d633ac 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -9,6 +9,7 @@ import { defineAgent, inference, llm, + log, metrics, voice, } from '@livekit/agents'; @@ -39,7 +40,12 @@ export default defineAgent({ }, }); + const logger = log(); + const session = new voice.AgentSession({ + // VAD and turn detection are used to determine when the user is speaking and when the agent should respond + // See more at https://docs.livekit.io/agents/build/turns + vad: ctx.proc.userData.vad! 
as silero.VAD, // Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand // See all available models at https://docs.livekit.io/agents/models/stt/ stt: new inference.STT({ @@ -61,18 +67,17 @@ export default defineAgent({ 'rime/arcana', ], }), - // VAD and turn detection are used to determine when the user is speaking and when the agent should respond - // See more at https://docs.livekit.io/agents/build/turns - vad: ctx.proc.userData.vad! as silero.VAD, - turnDetection: new livekit.turnDetector.MultilingualModel(), - // to use realtime model, replace the stt, llm, tts and vad with the following - // llm: new openai.realtime.RealtimeModel(), - voiceOptions: { - // allow the LLM to generate a response while waiting for the end of turn - preemptiveGeneration: true, - useTtsAlignedTranscript: true, - aecWarmupDuration: 3000, + preemptiveGeneration: true, + turnHandling: { + turnDetection: new livekit.turnDetector.MultilingualModel(), + interruption: { + resumeFalseInterruption: true, + falseInterruptionTimeout: 1, + mode: 'adaptive', + }, }, + useTtsAlignedTranscript: true, + aecWarmupDuration: 3000, connOptions: { // Example of overriding the default connection options for the LLM/TTS/STT llmConnOptions: { @@ -83,11 +88,23 @@ export default defineAgent({ }, }); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); + }); + + session.on(voice.AgentSessionEventTypes.OverlappingSpeech, (ev) => { + logger.warn({ type: ev.type, isInterruption: ev.isInterruption }, 'user overlapping speech'); }); await session.start({ diff --git a/examples/src/bey_avatar.ts 
b/examples/src/bey_avatar.ts index f8eb1f3d1..5cad7655c 100644 --- a/examples/src/bey_avatar.ts +++ b/examples/src/bey_avatar.ts @@ -1,7 +1,15 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { type JobContext, WorkerOptions, cli, defineAgent, metrics, voice } from '@livekit/agents'; +import { + type JobContext, + WorkerOptions, + cli, + defineAgent, + log, + metrics, + voice, +} from '@livekit/agents'; import * as bey from '@livekit/agents-plugin-bey'; import * as openai from '@livekit/agents-plugin-openai'; import { fileURLToPath } from 'node:url'; @@ -12,6 +20,7 @@ export default defineAgent({ instructions: 'You are a helpful assistant. Speak clearly and concisely.', }); + const logger = log(); const session = new voice.AgentSession({ llm: new openai.realtime.RealtimeModel({ voice: 'alloy', @@ -32,11 +41,19 @@ export default defineAgent({ }); await avatar.start(session, ctx.room); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted (session.usage is automatically collected) session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); }); session.generateReply({ diff --git a/examples/src/cartesia_tts.ts b/examples/src/cartesia_tts.ts index a11aae33a..4d40b7334 100644 --- a/examples/src/cartesia_tts.ts +++ b/examples/src/cartesia_tts.ts @@ -7,6 +7,7 @@ import { WorkerOptions, cli, defineAgent, + log, metrics, voice, } from '@livekit/agents'; @@ -28,6 +29,7 @@ export default defineAgent({ "You are a helpful assistant, you can hear the user's message and respond to it.", }); + const logger = log(); const vad = ctx.proc.userData.vad! 
as silero.VAD; const session = new voice.AgentSession({ @@ -40,11 +42,19 @@ export default defineAgent({ turnDetection: new livekit.turnDetector.MultilingualModel(), }); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted (session.usage is automatically collected) session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); }); await session.start({ diff --git a/examples/src/comprehensive_test.ts b/examples/src/comprehensive_test.ts index b6d08d6cd..bac9910cc 100644 --- a/examples/src/comprehensive_test.ts +++ b/examples/src/comprehensive_test.ts @@ -8,6 +8,7 @@ import { cli, defineAgent, llm, + log, metrics, voice, } from '@livekit/agents'; @@ -238,6 +239,7 @@ export default defineAgent({ proc.userData.vad = await silero.VAD.load(); }, entry: async (ctx: JobContext) => { + const logger = log(); const vad = ctx.proc.userData.vad! 
as silero.VAD; const session = new voice.AgentSession({ vad, @@ -249,11 +251,19 @@ export default defineAgent({ testedRealtimeLlmChoices: new Set(), }, }); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted (session.usage is automatically collected) session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); }); await session.start({ diff --git a/examples/src/hedra/hedra_avatar.ts b/examples/src/hedra/hedra_avatar.ts index 38c103d66..9bead6094 100644 --- a/examples/src/hedra/hedra_avatar.ts +++ b/examples/src/hedra/hedra_avatar.ts @@ -9,6 +9,7 @@ import { defineAgent, inference, initializeLogger, + log, metrics, voice, } from '@livekit/agents'; @@ -33,6 +34,7 @@ export default defineAgent({ instructions: 'You are a helpful assistant. 
Speak clearly and concisely.', }); + const logger = log(); const session = new voice.AgentSession({ stt: new inference.STT({ model: 'deepgram/nova-3', @@ -68,11 +70,19 @@ export default defineAgent({ }); await avatar.start(session, ctx.room); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted (session.usage is automatically collected) session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); }); session.generateReply({ diff --git a/examples/src/inworld_tts.ts b/examples/src/inworld_tts.ts index fec5c552d..45bb6961c 100644 --- a/examples/src/inworld_tts.ts +++ b/examples/src/inworld_tts.ts @@ -7,6 +7,7 @@ import { WorkerOptions, cli, defineAgent, + log, metrics, voice, } from '@livekit/agents'; @@ -26,6 +27,7 @@ export default defineAgent({ "You are a helpful assistant, you can hear the user's message and respond to it in 1-2 short sentences.", }); + const logger = log(); // Create TTS instance const tts = new inworld.TTS({ timestampType: 'WORD', @@ -96,11 +98,19 @@ export default defineAgent({ } }); - const usageCollector = new metrics.UsageCollector(); - + // Log metrics as they are emitted (session.usage is automatically collected) session.on(voice.AgentSessionEventTypes.MetricsCollected, (ev) => { metrics.logMetrics(ev.metrics); - usageCollector.collect(ev.metrics); + }); + + // Log usage summary when job shuts down + ctx.addShutdownCallback(async () => { + logger.info( + { + usage: session.usage, + }, + 'Session usage summary', + ); }); await session.start({ diff --git a/package.json b/package.json index 7db46fed8..800baf185 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,7 @@ "doc": "typedoc && mkdir -p docs/assets/github && cp .github/*.png docs/assets/github/ 
&& find docs -name '*.html' -type f -exec sed -i.bak 's|=\"/.github/|=\"assets/github/|g' {} + && find docs -name '*.bak' -delete" }, "devDependencies": { - "@changesets/cli": "^2.29.6", + "@changesets/cli": "^2.30.0", "@livekit/changesets-changelog-github": "^0.0.4", "@rushstack/heft": "^0.66.0", "@trivago/prettier-plugin-sort-imports": "^4.3.0", diff --git a/patches/@changesets__assemble-release-plan.patch b/patches/@changesets__assemble-release-plan.patch index 5d0ce7e8e..6a62d6b58 100644 --- a/patches/@changesets__assemble-release-plan.patch +++ b/patches/@changesets__assemble-release-plan.patch @@ -1,32 +1,42 @@ diff --git a/dist/changesets-assemble-release-plan.cjs.js b/dist/changesets-assemble-release-plan.cjs.js -index e07ba6e793021b6cfdec898afca517e293386ddb..a60de932d8521d241e5c9f272b4e4de93e6f09db 100644 +index e07ba6e793021b6cfdec898afca517e293386ddb..c286051e952d78d4dc6cda5832ba4226d63295c5 100644 --- a/dist/changesets-assemble-release-plan.cjs.js +++ b/dist/changesets-assemble-release-plan.cjs.js -@@ -317,6 +317,12 @@ function shouldBumpMajor({ +@@ -317,6 +317,9 @@ function shouldBumpMajor({ preInfo, onlyUpdatePeerDependentsWhenOutOfRange }) { -+ -+ // Temporary fix for: https://github.com/changesets/changesets/pull/1132 + if (depType === "peerDependencies") { + return false; + } -+ // we check if it is a peerDependency because if it is, our dependent bump type might need to be major. return depType === "peerDependencies" && nextRelease.type !== "none" && nextRelease.type !== "patch" && ( // 1. If onlyUpdatePeerDependentsWhenOutOfRange set to true, bump major if the version is leaving the range. 
diff --git a/dist/changesets-assemble-release-plan.esm.js b/dist/changesets-assemble-release-plan.esm.js -index ea2be567403c4ef94a65f3218ccb683cf5cb4bc1..fba07a45efee5d46d6ecfc3587f7a967a95df6f2 100644 +index ea2be567403c4ef94a65f3218ccb683cf5cb4bc1..0f5ac121649f1ac0a2b2e9704079474e360761e9 100644 --- a/dist/changesets-assemble-release-plan.esm.js +++ b/dist/changesets-assemble-release-plan.esm.js -@@ -306,6 +306,10 @@ function shouldBumpMajor({ +@@ -306,6 +306,9 @@ function shouldBumpMajor({ preInfo, onlyUpdatePeerDependentsWhenOutOfRange }) { -+ // Temporary fix for: https://github.com/changesets/changesets/pull/1132 + if (depType === "peerDependencies") { + return false; + } // we check if it is a peerDependency because if it is, our dependent bump type might need to be major. return depType === "peerDependencies" && nextRelease.type !== "none" && nextRelease.type !== "patch" && ( // 1. If onlyUpdatePeerDependentsWhenOutOfRange set to true, bump major if the version is leaving the range. +diff --git a/src/determine-dependents.ts b/src/determine-dependents.ts +index 47e32b6f09310d3ed03ab46b2d342d032f9014df..4edccca93152b1c8df59391b35e5f72de7ab8c05 100644 +--- a/src/determine-dependents.ts ++++ b/src/determine-dependents.ts +@@ -230,6 +230,9 @@ function shouldBumpMajor({ + preInfo: PreInfo | undefined; + onlyUpdatePeerDependentsWhenOutOfRange: boolean; + }) { ++ if (depType === "peerDependencies") { ++ return false; ++ } + // we check if it is a peerDependency because if it is, our dependent bump type might need to be major. 
+ return ( + depType === "peerDependencies" && diff --git a/plugins/cartesia/src/tts.ts b/plugins/cartesia/src/tts.ts index f30da3083..867f730c9 100644 --- a/plugins/cartesia/src/tts.ts +++ b/plugins/cartesia/src/tts.ts @@ -126,6 +126,14 @@ export class TTS extends tts.TTS { #opts: TTSOptions; label = 'cartesia.TTS'; + get model(): string { + return this.#opts.model; + } + + get provider(): string { + return 'Cartesia'; + } + constructor(opts: Partial = {}) { const resolvedOpts = { ...defaultTTSOptions, diff --git a/plugins/deepgram/src/stt.ts b/plugins/deepgram/src/stt.ts index 7ffc5142c..0f10ee331 100644 --- a/plugins/deepgram/src/stt.ts +++ b/plugins/deepgram/src/stt.ts @@ -72,6 +72,14 @@ export class STT extends stt.STT { label = 'deepgram.STT'; private abortController = new AbortController(); + get model(): string { + return this.#opts.model; + } + + get provider(): string { + return 'Deepgram'; + } + constructor(opts: Partial = defaultSTTOptions) { super({ streaming: true, diff --git a/plugins/deepgram/src/tts.ts b/plugins/deepgram/src/tts.ts index 5e9aceb30..6c6c2ff98 100644 --- a/plugins/deepgram/src/tts.ts +++ b/plugins/deepgram/src/tts.ts @@ -46,6 +46,14 @@ export class TTS extends tts.TTS { private opts: TTSOptions; label = 'deepgram.TTS'; + get model(): string { + return this.opts.model; + } + + get provider(): string { + return 'Deepgram'; + } + constructor(opts: Partial = {}) { super(opts.sampleRate || defaultTTSOptions.sampleRate, NUM_CHANNELS, { streaming: opts.capabilities?.streaming ?? defaultTTSOptions.capabilities.streaming, diff --git a/plugins/google/src/llm.ts b/plugins/google/src/llm.ts index 7b9d4b4ac..cea76ac52 100644 --- a/plugins/google/src/llm.ts +++ b/plugins/google/src/llm.ts @@ -51,6 +51,13 @@ export class LLM extends llm.LLM { return this.#opts.model; } + get provider(): string { + if (this.#opts.vertexai) { + return 'Vertex AI'; + } + return 'Gemini'; + } + /** * Create a new instance of Google GenAI LLM. 
* diff --git a/plugins/livekit/src/turn_detector/base.ts b/plugins/livekit/src/turn_detector/base.ts index cf425e54c..da2cf8197 100644 --- a/plugins/livekit/src/turn_detector/base.ts +++ b/plugins/livekit/src/turn_detector/base.ts @@ -170,6 +170,14 @@ export abstract class EOUModel { #logger = log(); + get model(): string { + return MODEL_REVISIONS[this.modelType]; + } + + get provider(): string { + return 'livekit'; + } + constructor(opts: EOUModelOptions) { const { modelType = 'en', diff --git a/plugins/openai/src/llm.ts b/plugins/openai/src/llm.ts index 22299344a..f4d055506 100644 --- a/plugins/openai/src/llm.ts +++ b/plugins/openai/src/llm.ts @@ -86,6 +86,15 @@ export class LLM extends llm.LLM { return this.#opts.model; } + get provider(): string { + try { + const url = new URL(this.#client.baseURL); + return url.host; + } catch { + return 'api.openai.com'; + } + } + /** * Create a new instance of OpenAI LLM with Azure. * diff --git a/plugins/openai/src/realtime/realtime_model.ts b/plugins/openai/src/realtime/realtime_model.ts index 3623d5d5d..e08e95dbb 100644 --- a/plugins/openai/src/realtime/realtime_model.ts +++ b/plugins/openai/src/realtime/realtime_model.ts @@ -144,6 +144,15 @@ export class RealtimeModel extends llm.RealtimeModel { return this._options.model; } + get provider(): string { + try { + const url = new URL(this._options.baseURL); + return url.host; + } catch { + return 'api.openai.com'; + } + } + constructor( options: { model?: string; diff --git a/plugins/openai/src/stt.ts b/plugins/openai/src/stt.ts index 076e462b7..14c7a3779 100644 --- a/plugins/openai/src/stt.ts +++ b/plugins/openai/src/stt.ts @@ -28,6 +28,19 @@ export class STT extends stt.STT { #client: OpenAI; label = 'openai.STT'; + get model(): string { + return this.#opts.model; + } + + get provider(): string { + try { + const url = new URL(this.#client.baseURL); + return url.host; + } catch { + return 'api.openai.com'; + } + } + /** * Create a new instance of OpenAI STT. 
* diff --git a/plugins/openai/src/tts.ts b/plugins/openai/src/tts.ts index 2bb77c3d5..3bce9d501 100644 --- a/plugins/openai/src/tts.ts +++ b/plugins/openai/src/tts.ts @@ -32,6 +32,19 @@ export class TTS extends tts.TTS { label = 'openai.TTS'; private abortController = new AbortController(); + get model(): string { + return this.#opts.model; + } + + get provider(): string { + try { + const url = new URL(this.#client.baseURL); + return url.host; + } catch { + return 'api.openai.com'; + } + } + /** * Create a new instance of OpenAI TTS. * diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 674e7b9b0..0350aa94d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -12,7 +12,7 @@ catalogs: patchedDependencies: '@changesets/assemble-release-plan': - hash: wy7r54pxmuo7fdh5qymk2adrlq + hash: gqgxtqs6uc7rjtxf53tdm2jjpi path: patches/@changesets__assemble-release-plan.patch importers: @@ -20,8 +20,8 @@ importers: .: devDependencies: '@changesets/cli': - specifier: ^2.29.6 - version: 2.29.6(@types/node@22.15.30) + specifier: ^2.30.0 + version: 2.30.0(@types/node@22.15.30) '@livekit/changesets-changelog-github': specifier: ^0.0.4 version: 0.0.4 @@ -100,6 +100,9 @@ importers: agents: dependencies: + '@bufbuild/protobuf': + specifier: ^1.10.0 + version: 1.10.1 '@ffmpeg-installer/ffmpeg': specifier: ^1.1.0 version: 1.1.0 @@ -107,8 +110,8 @@ importers: specifier: ^1.1.1 version: 1.1.1 '@livekit/protocol': - specifier: ^1.43.0 - version: 1.43.0 + specifier: ^1.45.1 + version: 1.45.1 '@livekit/typed-emitter': specifier: ^3.0.0 version: 3.0.0 @@ -169,6 +172,9 @@ importers: livekit-server-sdk: specifier: ^2.14.1 version: 2.14.1 + ofetch: + specifier: ^1.5.1 + version: 1.5.1 openai: specifier: ^6.8.1 version: 6.8.1(ws@8.18.3)(zod@3.25.76) @@ -939,29 +945,29 @@ importers: version: 3.25.76 plugins/trugen: - dependencies: - livekit-server-sdk: - specifier: ^2.13.3 - version: 2.13.3 - devDependencies: - '@livekit/agents': - specifier: workspace:* - version: link:../../agents - 
'@livekit/rtc-node': - specifier: 'catalog:' - version: 0.13.24 - '@microsoft/api-extractor': - specifier: ^7.35.0 - version: 7.43.7(@types/node@22.19.1) - pino: - specifier: ^8.19.0 - version: 8.21.0 - tsup: - specifier: ^8.3.5 - version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@22.19.1))(postcss@8.5.6)(tsx@4.21.0)(typescript@5.4.5) - typescript: - specifier: ^5.0.0 - version: 5.4.5 + dependencies: + livekit-server-sdk: + specifier: ^2.13.3 + version: 2.13.3 + devDependencies: + '@livekit/agents': + specifier: workspace:* + version: link:../../agents + '@livekit/rtc-node': + specifier: 'catalog:' + version: 0.13.24 + '@microsoft/api-extractor': + specifier: ^7.35.0 + version: 7.43.7(@types/node@22.19.1) + pino: + specifier: ^8.19.0 + version: 8.21.0 + tsup: + specifier: ^8.3.5 + version: 8.4.0(@microsoft/api-extractor@7.43.7(@types/node@22.19.1))(postcss@8.5.6)(tsx@4.21.0)(typescript@5.4.5) + typescript: + specifier: ^5.0.0 + version: 5.4.5 plugins/xai: dependencies: @@ -1079,8 +1085,8 @@ packages: '@bufbuild/protobuf@1.10.1': resolution: {integrity: sha512-wJ8ReQbHxsAfXhrf9ixl0aYbZorRuOWpBNzm8pL8ftmSxQx/wnJD5Eg861NwJU/czy2VXFIebCeZnZrI9rktIQ==} - '@changesets/apply-release-plan@7.0.12': - resolution: {integrity: sha512-EaET7As5CeuhTzvXTQCRZeBUcisoYPDDcXvgTE/2jmmypKp0RC7LxKj/yzqeh/1qFTZI7oDGFcL1PHRuQuketQ==} + '@changesets/apply-release-plan@7.1.0': + resolution: {integrity: sha512-yq8ML3YS7koKQ/9bk1PqO0HMzApIFNwjlwCnwFEXMzNe8NpzeeYYKCmnhWJGkN8g7E51MnWaSbqRcTcdIxUgnQ==} '@changesets/assemble-release-plan@6.0.9': resolution: {integrity: sha512-tPgeeqCHIwNo8sypKlS3gOPmsS3wP0zHt67JDuL20P4QcXiw/O4Hl7oXiuLnP9yg+rXLQ2sScdV1Kkzde61iSQ==} @@ -1088,12 +1094,12 @@ packages: '@changesets/changelog-git@0.2.1': resolution: {integrity: sha512-x/xEleCFLH28c3bQeQIyeZf8lFXyDFVn1SgcBiR2Tw/r4IAWlk1fzxCEZ6NxQAjF2Nwtczoen3OA2qR+UawQ8Q==} - '@changesets/cli@2.29.6': - resolution: {integrity: 
sha512-6qCcVsIG1KQLhpQ5zE8N0PckIx4+9QlHK3z6/lwKnw7Tir71Bjw8BeOZaxA/4Jt00pcgCnCSWZnyuZf5Il05QQ==} + '@changesets/cli@2.30.0': + resolution: {integrity: sha512-5D3Nk2JPqMI1wK25pEymeWRSlSMdo5QOGlyfrKg0AOufrUcjEE3RQgaCpHoBiM31CSNrtSgdJ0U6zL1rLDDfBA==} hasBin: true - '@changesets/config@3.1.1': - resolution: {integrity: sha512-bd+3Ap2TKXxljCggI0mKPfzCQKeV/TU4yO2h2C6vAihIo8tzseAn2e7klSuiyYYXvgu53zMN1OeYMIQkaQoWnA==} + '@changesets/config@3.1.3': + resolution: {integrity: sha512-vnXjcey8YgBn2L1OPWd3ORs0bGC4LoYcK/ubpgvzNVr53JXV5GiTVj7fWdMRsoKUH7hhhMAQnsJUqLr21EncNw==} '@changesets/errors@0.2.0': resolution: {integrity: sha512-6BLOQUscTpZeGljvyQXlWOItQyU71kCdGz7Pi8H8zdw6BI0g3m43iL4xKUVPWtG+qrrL9DTjpdn8eYuCQSRpow==} @@ -1104,8 +1110,8 @@ packages: '@changesets/get-github-info@0.5.2': resolution: {integrity: sha512-JppheLu7S114aEs157fOZDjFqUDpm7eHdq5E8SSR0gUBTEK0cNSHsrSR5a66xs0z3RWuo46QvA3vawp8BxDHvg==} - '@changesets/get-release-plan@4.0.13': - resolution: {integrity: sha512-DWG1pus72FcNeXkM12tx+xtExyH/c9I1z+2aXlObH3i9YA7+WZEVaiHzHl03thpvAgWTRaH64MpfHxozfF7Dvg==} + '@changesets/get-release-plan@4.0.15': + resolution: {integrity: sha512-Q04ZaRPuEVZtA+auOYgFaVQQSA98dXiVe/yFaZfY7hoSmQICHGvP0TF4u3EDNHWmmCS4ekA/XSpKlSM2PyTS2g==} '@changesets/get-version-range-type@0.4.0': resolution: {integrity: sha512-hwawtob9DryoGTpixy1D3ZXbGgJu1Rhr+ySH2PvTLHvkZuQ7sRT4oQwMh0hbqZH1weAooedEjRsbrWcGLCeyVQ==} @@ -1116,14 +1122,14 @@ packages: '@changesets/logger@0.1.1': resolution: {integrity: sha512-OQtR36ZlnuTxKqoW4Sv6x5YIhOmClRd5pWsjZsddYxpWs517R0HkyiefQPIytCVh4ZcC5x9XaG8KTdd5iRQUfg==} - '@changesets/parse@0.4.1': - resolution: {integrity: sha512-iwksMs5Bf/wUItfcg+OXrEpravm5rEd9Bf4oyIPL4kVTmJQ7PNDSd6MDYkpSJR1pn7tz/k8Zf2DhTCqX08Ou+Q==} + '@changesets/parse@0.4.3': + resolution: {integrity: sha512-ZDmNc53+dXdWEv7fqIUSgRQOLYoUom5Z40gmLgmATmYR9NbL6FJJHwakcCpzaeCy+1D0m0n7mT4jj2B/MQPl7A==} '@changesets/pre@2.0.2': resolution: {integrity: 
sha512-HaL/gEyFVvkf9KFg6484wR9s0qjAXlZ8qWPDkTyKF6+zqjBe/I2mygg3MbpZ++hdi0ToqNUF8cjj7fBy0dg8Ug==} - '@changesets/read@0.6.5': - resolution: {integrity: sha512-UPzNGhsSjHD3Veb0xO/MwvasGe8eMyNrR/sT9gR8Q3DhOQZirgKhhXv/8hVsI0QpPjR004Z9iFxoJU6in3uGMg==} + '@changesets/read@0.6.7': + resolution: {integrity: sha512-D1G4AUYGrBEk8vj8MGwf75k9GpN6XL3wg8i42P2jZZwFLXnlr2Pn7r9yuQNbaMCarP7ZQWNJbV6XLeysAIMhTA==} '@changesets/should-skip-package@0.1.2': resolution: {integrity: sha512-qAK/WrqWLNCP22UDdBTMPH5f41elVDlsNyat180A33dWxuUDyNpg6fPi/FyTZwRriVjg0L8gnjJn2F9XAoF0qw==} @@ -1969,8 +1975,8 @@ packages: cpu: [x64] os: [win32] - '@inquirer/external-editor@1.0.1': - resolution: {integrity: sha512-Oau4yL24d2B5IL4ma4UpbQigkVhzPDXLoqy1ggK4gnHg/stmkffJE4oOXHXF3uz0UEpywG68KcyXsyYpA1Re/Q==} + '@inquirer/external-editor@1.0.3': + resolution: {integrity: sha512-RWbSrDiYmO4LbejWY7ttpxczuwQyZLBUyygsA9Nsv95hpzUWwnNTVQmAq3xuh7vNwCp07UTmE5i11XAEExx4RA==} engines: {node: '>=18'} peerDependencies: '@types/node': '>=18' @@ -2051,8 +2057,8 @@ packages: cpu: [x64] os: [win32] - '@livekit/protocol@1.43.0': - resolution: {integrity: sha512-WCJ97fa4CBqPDh8pzdszOm/2xmelJ3Dx2vjKBlyb9BzmPQx1LjzVciP6uYFFMCMdrq2l1mjFQBXEz8Z20UCkyw==} + '@livekit/protocol@1.45.1': + resolution: {integrity: sha512-sr6p0TwKofHO5KW6kUzjq4hH2de4Al5scQo824xFnyI1XYo0qQn6fTG+bdr+Uj4EedjYAOqjezwUju5OErVIRA==} '@livekit/rtc-node-darwin-arm64@0.13.24': resolution: {integrity: sha512-gm5xOpGu6Rj/mNU2jEijcGhQGN2GdxV2dNYQm3NCKN7ow0BmMFZvXSCAWOWf+9oTutPXHnrc7EN1mt2v+lfqhA==} @@ -2966,8 +2972,8 @@ packages: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} - chardet@2.1.0: - resolution: {integrity: sha512-bNFETTG/pM5ryzQ9Ad0lJOTa6HWD/YsScAR3EnCPZRPlQh77JocYktSHOUHelyhm8IARL+o4c4F1bP5KVOjiRA==} + chardet@2.1.1: + resolution: {integrity: sha512-PsezH1rqdV9VvyNhxxOW32/d75r01NY7TQCmOqomRo15ZSOKbpTFVsfjghxo6JloQUCGnH4k1LGu0R4yCLlWQQ==} check-error@1.0.3: 
resolution: {integrity: sha512-iKEoDYaRmd1mxM90a2OEfWhjsjPpYPuQ+lMYsoxB126+t8fw7ySEO48nmDg5COTjxDI65/Y2OWpeEHk3ZOe8zg==} @@ -2984,10 +2990,6 @@ packages: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} - ci-info@3.9.0: - resolution: {integrity: sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==} - engines: {node: '>=8'} - cjs-module-lexer@1.4.3: resolution: {integrity: sha512-9z8TZaGM1pfswYeXrUpzPrkx8UnWYdhJclsiYMm6x/w5+nN+8Tf/LnAgfLGQCm59qAOxU8WwHEq2vNwF6i4j+Q==} @@ -3119,6 +3121,9 @@ packages: resolution: {integrity: sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==} engines: {node: '>=6'} + destr@2.0.5: + resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -3764,8 +3769,8 @@ packages: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} - iconv-lite@0.6.3: - resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} + iconv-lite@0.7.2: + resolution: {integrity: sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==} engines: {node: '>=0.10.0'} ieee754@1.2.1: @@ -3983,6 +3988,10 @@ packages: resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==} hasBin: true + js-yaml@4.1.1: + resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} + hasBin: true + jsesc@2.5.2: resolution: {integrity: 
sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==} engines: {node: '>=4'} @@ -4250,6 +4259,9 @@ packages: engines: {node: '>=10.5.0'} deprecated: Use your platform's native DOMException instead + node-fetch-native@1.6.7: + resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==} + node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -4305,6 +4317,9 @@ packages: obug@2.1.1: resolution: {integrity: sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==} + ofetch@1.5.1: + resolution: {integrity: sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA==} + on-exit-leak-free@2.1.2: resolution: {integrity: sha512-0eJJY6hXLGf1udHwfNftBqH+g73EU4B504nZeKpz1sYRKafAghwxEJunB2O7rDZkL4PGfsMVnTXZ2EjibbqcsA==} engines: {node: '>=14.0.0'} @@ -4721,11 +4736,6 @@ packages: engines: {node: '>=10'} hasBin: true - semver@7.6.3: - resolution: {integrity: sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==} - engines: {node: '>=10'} - hasBin: true - semver@7.7.2: resolution: {integrity: sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==} engines: {node: '>=10'} @@ -5154,6 +5164,9 @@ packages: ufo@1.5.3: resolution: {integrity: sha512-Y7HYmWaFwPUmkoQCUIAYpKqkOf+SbVj/2fJJZ4RJMCfZp0rTGwRbzQD+HghfnhKOjL9E01okqz+ncJskGYfBNw==} + ufo@1.6.3: + resolution: {integrity: sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==} + unbox-primitive@1.0.2: resolution: {integrity: sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==} @@ -5576,9 +5589,9 @@ snapshots: '@bufbuild/protobuf@1.10.1': {} - 
'@changesets/apply-release-plan@7.0.12': + '@changesets/apply-release-plan@7.1.0': dependencies: - '@changesets/config': 3.1.1 + '@changesets/config': 3.1.3 '@changesets/get-version-range-type': 0.4.0 '@changesets/git': 3.0.4 '@changesets/should-skip-package': 0.1.2 @@ -5592,7 +5605,7 @@ snapshots: resolve-from: 5.0.0 semver: 7.7.3 - '@changesets/assemble-release-plan@6.0.9(patch_hash=wy7r54pxmuo7fdh5qymk2adrlq)': + '@changesets/assemble-release-plan@6.0.9(patch_hash=gqgxtqs6uc7rjtxf53tdm2jjpi)': dependencies: '@changesets/errors': 0.2.0 '@changesets/get-dependents-graph': 2.1.3 @@ -5605,44 +5618,43 @@ snapshots: dependencies: '@changesets/types': 6.1.0 - '@changesets/cli@2.29.6(@types/node@22.15.30)': + '@changesets/cli@2.30.0(@types/node@22.15.30)': dependencies: - '@changesets/apply-release-plan': 7.0.12 - '@changesets/assemble-release-plan': 6.0.9(patch_hash=wy7r54pxmuo7fdh5qymk2adrlq) + '@changesets/apply-release-plan': 7.1.0 + '@changesets/assemble-release-plan': 6.0.9(patch_hash=gqgxtqs6uc7rjtxf53tdm2jjpi) '@changesets/changelog-git': 0.2.1 - '@changesets/config': 3.1.1 + '@changesets/config': 3.1.3 '@changesets/errors': 0.2.0 '@changesets/get-dependents-graph': 2.1.3 - '@changesets/get-release-plan': 4.0.13 + '@changesets/get-release-plan': 4.0.15 '@changesets/git': 3.0.4 '@changesets/logger': 0.1.1 '@changesets/pre': 2.0.2 - '@changesets/read': 0.6.5 + '@changesets/read': 0.6.7 '@changesets/should-skip-package': 0.1.2 '@changesets/types': 6.1.0 '@changesets/write': 0.4.0 - '@inquirer/external-editor': 1.0.1(@types/node@22.15.30) + '@inquirer/external-editor': 1.0.3(@types/node@22.15.30) '@manypkg/get-packages': 1.1.3 ansi-colors: 4.1.3 - ci-info: 3.9.0 enquirer: 2.4.1 fs-extra: 7.0.1 mri: 1.2.0 - p-limit: 2.3.0 package-manager-detector: 0.2.11 picocolors: 1.1.1 resolve-from: 5.0.0 - semver: 7.6.3 + semver: 7.7.3 spawndamnit: 3.0.1 term-size: 2.2.1 transitivePeerDependencies: - '@types/node' - '@changesets/config@3.1.1': + '@changesets/config@3.1.3': 
dependencies: '@changesets/errors': 0.2.0 '@changesets/get-dependents-graph': 2.1.3 '@changesets/logger': 0.1.1 + '@changesets/should-skip-package': 0.1.2 '@changesets/types': 6.1.0 '@manypkg/get-packages': 1.1.3 fs-extra: 7.0.1 @@ -5666,12 +5678,12 @@ snapshots: transitivePeerDependencies: - encoding - '@changesets/get-release-plan@4.0.13': + '@changesets/get-release-plan@4.0.15': dependencies: - '@changesets/assemble-release-plan': 6.0.9(patch_hash=wy7r54pxmuo7fdh5qymk2adrlq) - '@changesets/config': 3.1.1 + '@changesets/assemble-release-plan': 6.0.9(patch_hash=gqgxtqs6uc7rjtxf53tdm2jjpi) + '@changesets/config': 3.1.3 '@changesets/pre': 2.0.2 - '@changesets/read': 0.6.5 + '@changesets/read': 0.6.7 '@changesets/types': 6.1.0 '@manypkg/get-packages': 1.1.3 @@ -5689,10 +5701,10 @@ snapshots: dependencies: picocolors: 1.1.1 - '@changesets/parse@0.4.1': + '@changesets/parse@0.4.3': dependencies: '@changesets/types': 6.1.0 - js-yaml: 3.14.1 + js-yaml: 4.1.1 '@changesets/pre@2.0.2': dependencies: @@ -5701,11 +5713,11 @@ snapshots: '@manypkg/get-packages': 1.1.3 fs-extra: 7.0.1 - '@changesets/read@0.6.5': + '@changesets/read@0.6.7': dependencies: '@changesets/git': 3.0.4 '@changesets/logger': 0.1.1 - '@changesets/parse': 0.4.1 + '@changesets/parse': 0.4.3 '@changesets/types': 6.1.0 fs-extra: 7.0.1 p-filter: 2.1.0 @@ -6222,10 +6234,10 @@ snapshots: '@img/sharp-win32-x64@0.34.5': optional: true - '@inquirer/external-editor@1.0.1(@types/node@22.15.30)': + '@inquirer/external-editor@1.0.3(@types/node@22.15.30)': dependencies: - chardet: 2.1.0 - iconv-lite: 0.6.3 + chardet: 2.1.1 + iconv-lite: 0.7.2 optionalDependencies: '@types/node': 22.15.30 @@ -6308,7 +6320,7 @@ snapshots: '@livekit/noise-cancellation-win32-x64@0.1.9': optional: true - '@livekit/protocol@1.43.0': + '@livekit/protocol@1.45.1': dependencies: '@bufbuild/protobuf': 1.10.1 @@ -7382,7 +7394,7 @@ snapshots: ansi-styles: 4.3.0 supports-color: 7.2.0 - chardet@2.1.0: {} + chardet@2.1.1: {} check-error@1.0.3: 
dependencies: @@ -7396,8 +7408,6 @@ snapshots: chownr@3.0.0: {} - ci-info@3.9.0: {} - cjs-module-lexer@1.4.3: {} color-convert@1.9.3: @@ -7505,6 +7515,8 @@ snapshots: dequal@2.0.3: {} + destr@2.0.5: {} + detect-indent@6.1.0: {} detect-libc@2.1.2: {} @@ -8398,7 +8410,7 @@ snapshots: human-signals@5.0.0: {} - iconv-lite@0.6.3: + iconv-lite@0.7.2: dependencies: safer-buffer: 2.1.2 @@ -8603,6 +8615,10 @@ snapshots: dependencies: argparse: 2.0.1 + js-yaml@4.1.1: + dependencies: + argparse: 2.0.1 + jsesc@2.5.2: {} json-bigint@1.0.0: @@ -8671,14 +8687,14 @@ snapshots: livekit-server-sdk@2.13.3: dependencies: '@bufbuild/protobuf': 1.10.1 - '@livekit/protocol': 1.43.0 + '@livekit/protocol': 1.45.1 camelcase-keys: 9.1.3 jose: 5.2.4 livekit-server-sdk@2.14.1: dependencies: '@bufbuild/protobuf': 1.10.1 - '@livekit/protocol': 1.43.0 + '@livekit/protocol': 1.45.1 camelcase-keys: 9.1.3 jose: 5.2.4 @@ -8841,6 +8857,8 @@ snapshots: node-domexception@1.0.0: {} + node-fetch-native@1.6.7: {} + node-fetch@2.7.0: dependencies: whatwg-url: 5.0.0 @@ -8901,6 +8919,12 @@ snapshots: obug@2.1.1: {} + ofetch@1.5.1: + dependencies: + destr: 2.0.5 + node-fetch-native: 1.6.7 + ufo: 1.6.3 + on-exit-leak-free@2.1.2: {} once@1.4.0: @@ -9382,8 +9406,6 @@ snapshots: dependencies: lru-cache: 6.0.0 - semver@7.6.3: {} - semver@7.7.2: {} semver@7.7.3: {} @@ -9892,6 +9914,8 @@ snapshots: ufo@1.5.3: {} + ufo@1.6.3: {} + unbox-primitive@1.0.2: dependencies: call-bind: 1.0.7 diff --git a/turbo.json b/turbo.json index e044f4fea..4f7d3a3a0 100644 --- a/turbo.json +++ b/turbo.json @@ -30,6 +30,7 @@ "LIVEKIT_API_SECRET", "LIVEKIT_INFERENCE_API_KEY", "LIVEKIT_INFERENCE_API_SECRET", + "LIVEKIT_DEV_MODE", "LIVEKIT_INFERENCE_URL", "LIVEKIT_URL", "LLAMA_API_KEY",