From 46b6c2cc9a3e50380f09c27f5e9f4134aa00d9f4 Mon Sep 17 00:00:00 2001 From: Vapi Tasker Date: Wed, 18 Mar 2026 14:41:24 +0000 Subject: [PATCH] fix: acquire getUserMedia early in start() to prevent mobile gesture timeout Mobile browsers enforce strict user gesture policies that only allow a short window (~1-5s) between a user tap and a getUserMedia() call. Previously, the SDK made an API call to create the web call BEFORE calling getUserMedia (via DailyIframe.createCallObject), causing NotAllowedError on mobile when the API call took too long. This fix calls getUserMedia() immediately when start() is invoked, within the user gesture window, before any async network calls. The pre-acquired audio track is then passed to DailyIframe.createCallObject. Also adds support for passing a pre-acquired MediaStream via the options.mediaStream parameter for callers who want even more control. VAP-12773 --- __tests__/vapi-early-media.test.ts | 256 +++++++++++++++++++++++++++++ vapi.ts | 119 +++++++++++++- 2 files changed, 366 insertions(+), 9 deletions(-) create mode 100644 __tests__/vapi-early-media.test.ts diff --git a/__tests__/vapi-early-media.test.ts b/__tests__/vapi-early-media.test.ts new file mode 100644 index 000000000..62220525e --- /dev/null +++ b/__tests__/vapi-early-media.test.ts @@ -0,0 +1,256 @@ +/** + * Tests for early getUserMedia acquisition in Vapi.start() + * + * Mobile browsers enforce strict "user gesture" policies that only allow + * a short window (~1-5s) between a user tap and a getUserMedia() call. + * The SDK must call getUserMedia() BEFORE making any network requests + * (e.g., the web call creation API call) to stay within that window. 
+ * + * VAP-12773: https://linear.app/vapi/issue/VAP-12773 + */ + +// Track call ordering to verify getUserMedia runs before API call +let callOrder: string[] = []; + +// Mock MediaStreamTrack +const mockAudioTrack = { + kind: 'audio', + id: 'mock-audio-track-id', + enabled: true, + stop: jest.fn(), + addEventListener: jest.fn(), + removeEventListener: jest.fn(), +} as unknown as MediaStreamTrack; + +const mockMediaStream = { + getAudioTracks: () => [mockAudioTrack], + getTracks: () => [mockAudioTrack], +} as unknown as MediaStream; + +// Mock navigator.mediaDevices.getUserMedia +const mockGetUserMedia = jest.fn().mockImplementation(async () => { + callOrder.push('getUserMedia'); + return mockMediaStream; +}); + +// Set up global navigator mock +Object.defineProperty(global, 'navigator', { + value: { + mediaDevices: { + getUserMedia: mockGetUserMedia, + }, + userAgent: + 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15', + }, + writable: true, + configurable: true, +}); + +// Set up global document mock for audio player creation +Object.defineProperty(global, 'document', { + value: { + createElement: jest.fn().mockReturnValue({ + dataset: {}, + style: { setProperty: jest.fn() }, + play: jest.fn().mockResolvedValue(undefined), + muted: false, + autoplay: false, + srcObject: null, + }), + body: { + appendChild: jest.fn(), + }, + querySelector: jest.fn().mockReturnValue(null), + }, + writable: true, + configurable: true, +}); + +// Mock Daily.co +const mockDailyCallInstance = { + join: jest.fn().mockResolvedValue(undefined), + destroy: jest.fn().mockResolvedValue(undefined), + iframe: jest.fn().mockReturnValue({ + style: { setProperty: jest.fn() }, + }), + on: jest.fn(), + sendAppMessage: jest.fn(), + setLocalAudio: jest.fn(), + localAudio: jest.fn().mockReturnValue(true), + startRecording: jest.fn(), + stopRecording: jest.fn(), + startRemoteParticipantsAudioLevelObserver: jest.fn(), + updateInputSettings: jest.fn(), + updateParticipant: 
jest.fn(), + setInputDevicesAsync: jest.fn().mockResolvedValue(undefined), +}; + +jest.mock('@daily-co/daily-js', () => ({ + __esModule: true, + default: { + createCallObject: jest.fn().mockImplementation((options: any) => { + callOrder.push('createCallObject'); + return mockDailyCallInstance; + }), + }, +})); + +// Mock the API client +jest.mock('../client', () => ({ + client: { + baseUrl: 'https://api.vapi.ai', + setSecurityData: jest.fn(), + call: { + callControllerCreateWebCall: jest.fn().mockImplementation(async () => { + callOrder.push('apiCall'); + // Simulate network delay + await new Promise((resolve) => setTimeout(resolve, 50)); + return { + data: { + id: 'test-call-id', + webCallUrl: 'https://test.daily.co/test-room', + artifactPlan: { videoRecordingEnabled: false }, + assistant: { voice: { provider: 'default' } }, + }, + }; + }), + }, + }, +})); + +import Vapi from '../vapi'; +import DailyIframe from '@daily-co/daily-js'; + +describe('Vapi.start() - Early getUserMedia Acquisition (VAP-12773)', () => { + let vapi: Vapi; + + beforeEach(() => { + callOrder = []; + jest.clearAllMocks(); + vapi = new Vapi('test-token'); + }); + + afterEach(async () => { + try { + await vapi.stop(); + } catch { + // Ignore cleanup errors + } + }); + + it('should call getUserMedia BEFORE the API call to create the web call', async () => { + await vapi.start('test-assistant-id'); + + // Verify getUserMedia was called + expect(mockGetUserMedia).toHaveBeenCalled(); + + // Verify the order: getUserMedia must come before the API call + const getUserMediaIndex = callOrder.indexOf('getUserMedia'); + const apiCallIndex = callOrder.indexOf('apiCall'); + + expect(getUserMediaIndex).not.toBe(-1); + expect(apiCallIndex).not.toBe(-1); + expect(getUserMediaIndex).toBeLessThan(apiCallIndex); + }); + + it('should pass the pre-acquired audio track to DailyIframe.createCallObject', async () => { + await vapi.start('test-assistant-id'); + + 
expect(DailyIframe.createCallObject).toHaveBeenCalledWith( + expect.objectContaining({ + audioSource: mockAudioTrack, + }), + ); + }); + + it('should still work when getUserMedia fails (fallback to default behavior)', async () => { + mockGetUserMedia.mockRejectedValueOnce(new Error('Permission denied')); + + const result = await vapi.start('test-assistant-id'); + + // Should still proceed with the call (DailyIframe handles getUserMedia internally as fallback) + expect(DailyIframe.createCallObject).toHaveBeenCalledWith( + expect.objectContaining({ + audioSource: true, + }), + ); + expect(result).not.toBeNull(); + }); + + it('should request audio-only from getUserMedia (not video)', async () => { + await vapi.start('test-assistant-id'); + + expect(mockGetUserMedia).toHaveBeenCalledWith({ audio: true }); + }); + + it('should stop pre-acquired tracks on cleanup if call creation fails', async () => { + // Add error listener to prevent EventEmitter from throwing on 'error' events + const errorHandler = jest.fn(); + vapi.on('error', errorHandler); + + // Make the API call fail + const { client } = require('../client'); + client.call.callControllerCreateWebCall.mockRejectedValueOnce( + new Error('API Error'), + ); + + const result = await vapi.start('test-assistant-id'); + + // The call should have failed gracefully + expect(result).toBeNull(); + + // The pre-acquired track should be stopped to free the microphone + expect(mockAudioTrack.stop).toHaveBeenCalled(); + + // Clean up + vapi.removeListener('error', errorHandler); + }); + + it('should accept a pre-acquired MediaStream in start options', async () => { + const userProvidedTrack = { + kind: 'audio', + id: 'user-provided-track', + enabled: true, + stop: jest.fn(), + } as unknown as MediaStreamTrack; + + const userProvidedStream = { + getAudioTracks: () => [userProvidedTrack], + getTracks: () => [userProvidedTrack], + } as unknown as MediaStream; + + await vapi.start('test-assistant-id', undefined, undefined, 
undefined, undefined, { + mediaStream: userProvidedStream, + }); + + // Should NOT call getUserMedia when a stream is provided + expect(mockGetUserMedia).not.toHaveBeenCalled(); + + // Should use the user-provided track + expect(DailyIframe.createCallObject).toHaveBeenCalledWith( + expect.objectContaining({ + audioSource: userProvidedTrack, + }), + ); + }); + + it('should not call getUserMedia when start is called without being in a gesture context but audioSource is already a track', async () => { + // If the user already configured audioSource as a MediaStreamTrack in the constructor, + // we should not call getUserMedia again + const existingTrack = { + kind: 'audio', + id: 'existing-track', + enabled: true, + stop: jest.fn(), + } as unknown as MediaStreamTrack; + + const vapiWithTrack = new Vapi('test-token', undefined, undefined, { + audioSource: existingTrack, + }); + + await vapiWithTrack.start('test-assistant-id'); + + // Should NOT call getUserMedia when audioSource is already a track + expect(mockGetUserMedia).not.toHaveBeenCalled(); + }); +}); diff --git a/vapi.ts b/vapi.ts index e659313ff..0a08bf24a 100644 --- a/vapi.ts +++ b/vapi.ts @@ -193,6 +193,23 @@ type StartCallOptions = { * @example true */ roomDeleteOnUserLeaveEnabled?: boolean; + /** + * A pre-acquired MediaStream to use for the call. When provided, the SDK will + * skip its own `getUserMedia()` call and use this stream's audio track instead. + * + * This is useful when the caller wants to acquire the microphone earlier in the + * user-gesture lifecycle (e.g., in a button click handler) to avoid mobile browser + * "user gesture" timeout issues that can cause `NotAllowedError`. 
+ * + * @example + * ```ts + * const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + * await vapi.start('assistant-id', undefined, undefined, undefined, undefined, { + * mediaStream: stream, + * }); + * ``` + */ + mediaStream?: MediaStream; } type WebCall = { @@ -394,6 +411,75 @@ export default class Vapi extends VapiEventEmitter { this.started = true; + // Determine whether we need to eagerly acquire a media stream. + // On mobile browsers the "user gesture" window is very short (~1-5 s). + // If the audioSource is already a MediaStreamTrack (set in constructor + // options or passed via options.mediaStream) we can skip this step. + let earlyAudioTrack: MediaStreamTrack | null = null; + + const userProvidedStream = options?.mediaStream; + const constructorAudioSource = this.dailyCallObject.audioSource; + const hasExistingTrack = + userProvidedStream || + (constructorAudioSource != null && + typeof constructorAudioSource === 'object' && + 'kind' in (constructorAudioSource as any)); + + if (!hasExistingTrack) { + // Acquire the microphone NOW, while we are still inside the user + // gesture window, before any async network calls. + this.emit('call-start-progress', { + stage: 'early-media-acquisition', + status: 'started', + timestamp: new Date().toISOString(), + }); + + const earlyMediaStartTime = Date.now(); + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + earlyAudioTrack = stream.getAudioTracks()[0] ?? 
null; + + const earlyMediaDuration = Date.now() - earlyMediaStartTime; + this.emit('call-start-progress', { + stage: 'early-media-acquisition', + status: 'completed', + duration: earlyMediaDuration, + timestamp: new Date().toISOString(), + metadata: { trackId: earlyAudioTrack?.id }, + }); + } catch (mediaError) { + const earlyMediaDuration = Date.now() - earlyMediaStartTime; + const serializedMediaError = serializeError(mediaError); + this.emit('call-start-progress', { + stage: 'early-media-acquisition', + status: 'failed', + duration: earlyMediaDuration, + timestamp: new Date().toISOString(), + metadata: { error: serializedMediaError.message }, + }); + // Non-fatal: fall back to letting Daily.co handle getUserMedia itself. + // This path may still fail on mobile due to the gesture timeout, but + // it preserves backward compatibility on desktop and other environments. + } + } else if (userProvidedStream) { + // Use the caller-provided MediaStream + earlyAudioTrack = userProvidedStream.getAudioTracks()[0] ?? null; + this.emit('call-start-progress', { + stage: 'early-media-acquisition', + status: 'completed', + timestamp: new Date().toISOString(), + metadata: { source: 'user-provided', trackId: earlyAudioTrack?.id }, + }); + } else { + this.emit('call-start-progress', { + stage: 'early-media-acquisition', + status: 'completed', + timestamp: new Date().toISOString(), + metadata: { source: 'constructor-audio-source' }, + }); + } + try { // Stage 1: Create web call this.emit('call-start-progress', { @@ -401,9 +487,9 @@ export default class Vapi extends VapiEventEmitter { status: 'started', timestamp: new Date().toISOString() }); - + const webCallStartTime = Date.now(); - + const webCall = ( await client.call.callControllerCreateWebCall({ assistant: typeof assistant === 'string' ? 
undefined : assistant, @@ -446,24 +532,30 @@ export default class Vapi extends VapiEventEmitter { const isVideoEnabled = webCall?.assistant?.voice?.provider === 'tavus'; + // Determine the audioSource for the Daily call object. + // If we pre-acquired a track, use it so Daily.co does not call + // getUserMedia again (which would fail outside the gesture window). + const resolvedAudioSource: boolean | string | MediaStreamTrack = + earlyAudioTrack ?? this.dailyCallObject.audioSource ?? true; + // Stage 2: Create Daily call object this.emit('call-start-progress', { stage: 'daily-call-object-creation', status: 'started', timestamp: new Date().toISOString(), metadata: { - audioSource: this.dailyCallObject.audioSource ?? true, + audioSource: earlyAudioTrack ? 'pre-acquired-track' : (this.dailyCallObject.audioSource ?? true), videoSource: this.dailyCallObject.videoSource ?? isVideoRecordingEnabled, isVideoRecordingEnabled, isVideoEnabled } }); - + const dailyCallStartTime = Date.now(); - + try { this.call = DailyIframe.createCallObject({ - audioSource: this.dailyCallObject.audioSource ?? true, + audioSource: resolvedAudioSource, videoSource: this.dailyCallObject.videoSource ?? 
isVideoRecordingEnabled, dailyConfig: this.dailyCallConfig, }); @@ -838,9 +930,18 @@ export default class Vapi extends VapiEventEmitter { return webCall; } catch (e) { + // Stop only a track the SDK acquired itself; a caller-supplied options.mediaStream track stays caller-owned so it can be reused on retry + if (earlyAudioTrack && !userProvidedStream) { + try { + earlyAudioTrack.stop(); + } catch { + // Ignore errors stopping the track + } + } + const totalDuration = Date.now() - startTime; const serializedError = serializeError(e); - + this.emit('call-start-failed', { stage: 'unknown', totalDuration, @@ -854,7 +955,7 @@ export default class Vapi extends VapiEventEmitter { isMobile: this.isMobileDevice() } }); - + // Also emit the generic error event for backward compatibility this.emit('error', { type: 'start-method-error', @@ -869,7 +970,7 @@ export default class Vapi extends VapiEventEmitter { isMobile: this.isMobileDevice() } }); - + await this.cleanup(); return null; }