export class VoiceLiveClient {
  private ws: WebSocket | null = null;
  private audioContext: AudioContext | null = null;
  private mediaStream: MediaStream | null = null;
  private processor: ScriptProcessorNode | null = null;
  private isRecording = false;
  private sessionActive = false;

  // Audio playback management
  private audioQueue: AudioBuffer[] = [];
  private currentAudioSource: AudioBufferSourceNode | null = null;
  private isPlayingAudio = false;

  // Callbacks
  private onUserTextCallback?: (text: string) => void;
  private onModelTextCallback?: (text: string) => void;
  private onAudioCallback?: (audioBlob: Blob) => void;
  private onErrorCallback?: (error: string) => void;
  private onStatusCallback?: (status: string) => void;

  // Configuration
  private wsUrl: string;
  private token: string;
  private agentId?: string;
  private threadId?: string;
  private projectName?: string;

  constructor(wsUrl: string, token: string, agentId?: string, threadId?: string, projectName?: string) {
    this.wsUrl = wsUrl;
    this.token = token;
    this.agentId = agentId;
    this.threadId = threadId;
    this.projectName = projectName;
  }

  // Callback setters
  onUserText(callback: (text: string) => void) { this.onUserTextCallback = callback; }
  onModelText(callback: (text: string) => void) { this.onModelTextCallback = callback; }
  onAudioResponse(callback: (audioBlob: Blob) => void) { this.onAudioCallback = callback; }
  onError(callback: (error: string) => void) { this.onErrorCallback = callback; }
  onStatus(callback: (status: string) => void) { this.onStatusCallback = callback; }

  async startSession(): Promise<void> {
    return new Promise((resolve, reject) => {
      try {
        // Use the voice-agent endpoint path for AI Foundry agents
        const url = new URL(this.wsUrl.replace('/voice-live/realtime', '/voice-agent/realtime'));
        url.searchParams.set('api-version', '2025-05-01-preview');

        // Add a client request ID, as the AI Foundry playground does
        const clientRequestId = crypto.randomUUID();
        url.searchParams.set('x-ms-client-request-id', clientRequestId);

        // For agent integration, pass the agent parameters on the URL
        if (this.agentId && this.projectName) {
          url.searchParams.set('agent_id', this.agentId); // Note: underscore, not hyphen
          url.searchParams.set('agent-project-name', this.projectName);
          url.searchParams.set('agent_access_token', this.token);
          console.log('Using agent integration with:', {
            agentId: this.agentId,
            projectName: this.projectName,
            tokenLength: this.token.length,
            clientRequestId,
          });
        } else {
          // Fall back to direct model access when no agent info is available
          url.searchParams.set('model', 'gpt-4o');
          url.searchParams.set('authorization', `Bearer ${this.token}`);
          console.log('Using direct model access');
        }

        console.log('WebSocket URL (sanitized):', url.toString()
          .replace(/agent_access_token=[^&]*/g, 'agent_access_token=***')
          .replace(/authorization=Bearer%20[^&]*/g, 'authorization=Bearer***'));

        this.ws = new WebSocket(url.toString());

        this.ws.onopen = () => {
          console.log('WebSocket connected successfully');
          this.sessionActive = true;
          this.onStatusCallback?.('connected');
          this.sendSessionUpdate();
          resolve();
        };

        this.ws.onmessage = (event) => {
          try {
            const data = JSON.parse(event.data);
            this.handleServerMessage(data);
          } catch (error) {
            console.error('Error parsing WebSocket message:', error);
          }
        };

        this.ws.onerror = (error) => {
          console.error('WebSocket error:', error);
          this.onErrorCallback?.('WebSocket connection error');
          reject(error);
        };

        this.ws.onclose = (event) => {
          console.log('WebSocket closed:', event.code, event.reason);
          this.sessionActive = false;
          this.onStatusCallback?.('disconnected');
          this.cleanup();
          if (event.code !== 1000) { // 1000 is normal closure
            reject(new Error(`WebSocket closed unexpectedly: ${event.code} ${event.reason}`));
          }
        };
      } catch (error) {
        console.error('Error in startSession:', error);
        this.onErrorCallback?.('Failed to start session');
        reject(error);
      }
    });
  }

  private sendSessionUpdate() {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;

    const sessionConfig: any = {
      type: 'session.update',
      session: {
        modalities: ['text', 'audio'],
        instructions: this.agentId
          ? `You are an AI assistant with access to uploaded knowledge and tools. Use your knowledge base to provide accurate, helpful responses based on the available information. You have access to agent context and should respond as if you are the configured agent.`
          : 'You are a helpful AI assistant. Respond naturally and conversationally to user questions.',
        voice: { name: 'en-US-AriaNeural', type: 'azure-standard' },
        turn_detection: {
          type: 'server_vad',
          threshold: 0.5,
          prefix_padding_ms: 300,
          silence_duration_ms: 500,
        },
        input_audio_format: 'pcm16',
        output_audio_format: 'pcm16',
        input_audio_transcription: { model: 'whisper-1', language: 'en' },
        input_audio_noise_reduction: { type: 'azure_deep_noise_suppression' },
        input_audio_echo_cancellation: { type: 'server_echo_cancellation' },
      },
    };

    // Note: agent and thread parameters are not supported directly in the session config.
    // Agent context is handled through the WebSocket connection parameters or the instructions.
    console.log('Sending session config:', JSON.stringify(sessionConfig, null, 2));
    this.ws.send(JSON.stringify(sessionConfig));
  }

  private handleServerMessage(data: any) {
    console.log('Received server message:', data.type, data);

    switch (data.type) {
      case 'session.updated':
        this.onStatusCallback?.('session_ready');
        console.log('Session updated successfully');
        break;

      case 'conversation.item.input_audio_transcription.completed':
        if (data.transcript && this.onUserTextCallback) {
          this.onUserTextCallback(data.transcript.trim());
        }
        break;

      case 'response.text.delta':
        if (data.delta && this.onModelTextCallback) {
          this.onModelTextCallback(data.delta);
        }
        break;

      case 'response.audio.delta':
        if (data.delta) {
          this.playAudioDelta(data.delta);
        }
        break;

      case 'response.done':
        this.onStatusCallback?.('response_complete');
        break;

      case 'error':
        console.error('Server error:', data.error);
        this.onErrorCallback?.(data.error?.message || 'Server error');
        break;

      case 'input_audio_buffer.speech_started':
        // Barge-in: stop any playback as soon as the user starts speaking
        this.stopAudioPlayback();
        this.onStatusCallback?.('speech_started');
        break;

      case 'input_audio_buffer.speech_stopped':
        this.onStatusCallback?.('speech_stopped');
        break;

      case 'response.audio_transcript.delta':
        if (data.delta && this.onModelTextCallback) {
          this.onModelTextCallback(data.delta);
        }
        break;

      default:
        console.log('Unhandled message type:', data.type);
    }
  }

  private playAudioDelta(base64Audio: string) {
    try {
      // Decode base64 into little-endian 16-bit PCM samples
      const audioData = atob(base64Audio);
      const pcmArray = new Int16Array(audioData.length / 2);
      for (let i = 0; i < pcmArray.length; i++) {
        pcmArray[i] = audioData.charCodeAt(i * 2) | (audioData.charCodeAt(i * 2 + 1) << 8);
      }
      this.queueAudioForPlayback(pcmArray);
    } catch (error) {
      console.error('Error processing audio:', error);
    }
  }

  private async queueAudioForPlayback(pcmData: Int16Array) {
    if (!this.audioContext) {
      await this.initializeAudioContext();
    }
    try {
      // The service streams 24 kHz mono PCM16; convert it to a float AudioBuffer
      const audioBuffer = this.audioContext!.createBuffer(1, pcmData.length, 24000);
      const channelData = audioBuffer.getChannelData(0);
      for (let i = 0; i < pcmData.length; i++) {
        channelData[i] = pcmData[i] / 32768.0;
      }
      this.audioQueue.push(audioBuffer);
      if (!this.isPlayingAudio) {
        this.playNextAudioChunk();
      }
    } catch (error) {
      console.error('Error queuing audio:', error);
    }
  }

  private async initializeAudioContext() {
    try {
      this.audioContext = new AudioContext({ sampleRate: 24000 });
      if (this.audioContext.state === 'suspended') {
        await this.audioContext.resume();
      }
    } catch (error) {
      console.error('Error initializing audio context:', error);
    }
  }

  private playNextAudioChunk() {
    if (this.audioQueue.length === 0 || !this.audioContext) {
      this.isPlayingAudio = false;
      return;
    }
    this.isPlayingAudio = true;
    const audioBuffer = this.audioQueue.shift()!;
    this.currentAudioSource = this.audioContext.createBufferSource();
    this.currentAudioSource.buffer = audioBuffer;
    this.currentAudioSource.connect(this.audioContext.destination);
    this.currentAudioSource.onended = () => {
      this.playNextAudioChunk();
    };
    this.currentAudioSource.start(0);
  }

  private stopAudioPlayback() {
    if (this.currentAudioSource) {
      try {
        this.currentAudioSource.stop();
        this.currentAudioSource.disconnect();
      } catch (e) {
        // Source may already have stopped; ignore
      }
      this.currentAudioSource = null;
    }
    this.isPlayingAudio = false;
    this.audioQueue = [];
  }

  async startRecording(): Promise<void> {
    if (this.isRecording) return;
    this.stopAudioPlayback();

    try {
      this.mediaStream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: 24000,
          channelCount: 1,
          echoCancellation: true,
          noiseSuppression: true,
        },
      });

      const recordingContext = new AudioContext({ sampleRate: 24000 });
      await recordingContext.resume();
      const source = recordingContext.createMediaStreamSource(this.mediaStream);

      const bufferSize = 1024;
      this.processor = recordingContext.createScriptProcessor(bufferSize, 1, 1);
      this.processor.onaudioprocess = (event) => {
        if (!this.isRecording) return;
        const inputData = event.inputBuffer.getChannelData(0);
        const pcmData = this.float32ToPCM16(inputData);
        this.sendAudioChunk(pcmData.buffer);
      };

      source.connect(this.processor);
      this.processor.connect(recordingContext.destination);

      this.isRecording = true;
      this.onStatusCallback?.('recording');
    } catch (error) {
      this.onErrorCallback?.('Microphone access denied');
      throw error;
    }
  }

  private float32ToPCM16(float32Array: Float32Array): Int16Array {
    const pcm16Array = new Int16Array(float32Array.length);
    for (let i = 0; i < float32Array.length; i++) {
      pcm16Array[i] = Math.max(-32768, Math.min(32767, float32Array[i] * 32767));
    }
    return pcm16Array;
  }

  stopRecording() {
    if (!this.isRecording) return;
    this.isRecording = false;

    if (this.processor) {
      this.processor.disconnect();
      this.processor = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach(track => track.stop());
      this.mediaStream = null;
    }

    this.endInput();
    this.onStatusCallback?.('stopped_recording');
  }

  private sendAudioChunk(audioBuffer: ArrayBuffer) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    try {
      const uint8Array = new Uint8Array(audioBuffer);
      const base64Audio = btoa(String.fromCharCode(...uint8Array));
      this.ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64Audio }));
    } catch (error) {
      console.error('Error sending audio:', error);
    }
  }

  private endInput() {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    this.ws.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
  }

  sendTextMessage(text: string) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
    const messages = [
      {
        type: 'conversation.item.create',
        item: {
          type: 'message',
          role: 'user',
          content: [{ type: 'input_text', text }],
        },
      },
      { type: 'response.create' },
    ];
    messages.forEach(msg => this.ws!.send(JSON.stringify(msg)));
  }

  disconnect() {
    this.sessionActive = false;
    this.stopRecording();
    this.stopAudioPlayback();
    if (this.ws) {
      this.ws.close();
      this.ws = null;
    }
    this.cleanup();
  }

  private cleanup() {
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach(track => track.stop());
      this.mediaStream = null;
    }
    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }
  }
}


@app.get("/get-voice-token")
async def get_voice_token():
    """Get an authentication token and agent ID, and create a thread for the Voice Live API."""
    # An earlier version of this endpoint requested a token for the
    # https://cognitiveservices.azure.com/.default scope and tracked each voice
    # session's thread in active_voice_threads; it has been replaced by the
    # AI Foundry scope used below.
    try:
        credential = DefaultAzureCredential()
        # Use the correct scope for AI Foundry agents - this is crucial!
        token = credential.get_token("https://ai.azure.com/.default")

        agent_id = AGENT_IDS[0] if AGENT_IDS else None
        if not agent_id:
            raise HTTPException(status_code=400, detail="No agent ID configured")

        client = AgentsClient(credential=credential, endpoint=ENDPOINT)

        # Create a thread for the agent context
        thread = client.threads.create()
        thread_id = thread.id
        logging.info(f"Created thread {thread_id} for agent {agent_id}")

        return JSONResponse(content={
            "token": token.token,
            "expires_on": token.expires_on,
            "agent_id": agent_id,
            "thread_id": thread_id,
            "project_name": PROJECT_NAME,
            "token_audience": "https://ai.azure.com",  # For debugging
        })
    except HTTPException:
        # Let explicit HTTP errors (e.g. missing agent ID) pass through unchanged
        raise
    except Exception as e:
        logging.exception("Failed to create voice session")
        raise HTTPException(status_code=500, detail=f"Voice session setup failed: {str(e)}")
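
For reference, here is a minimal sketch of how a frontend might wire the two pieces together. It assumes the FastAPI app above is served on the same origin at /get-voice-token, and VOICE_LIVE_WS_URL is a placeholder for your Voice Live WebSocket endpoint; both names are illustrative and not part of the code above.

// Illustrative usage sketch (assumptions noted above; adapt names to your setup)
const VOICE_LIVE_WS_URL = 'wss://<your-resource>.cognitiveservices.azure.com/voice-live/realtime';

async function startVoiceChat(): Promise<void> {
  // Fetch a short-lived token plus agent/thread/project metadata from the backend
  const res = await fetch('/get-voice-token');
  if (!res.ok) throw new Error(`Token endpoint failed: ${res.status}`);
  const { token, agent_id, thread_id, project_name } = await res.json();

  const client = new VoiceLiveClient(VOICE_LIVE_WS_URL, token, agent_id, thread_id, project_name);
  client.onUserText(text => console.log('You:', text));
  client.onModelText(delta => console.log('Agent:', delta));
  client.onStatus(status => console.log('Status:', status));
  client.onError(err => console.error('Voice error:', err));

  await client.startSession();   // opens the WebSocket and sends session.update
  await client.startRecording(); // starts streaming microphone audio as PCM16
}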