# app.py
"""Voice assistant demo: microphone -> Wav2Vec2 ASR -> remote LLM -> MMS TTS.

The module loads the ASR model, the remote LLM client and both TTS voices at
import time, then builds a Gradio Blocks UI whose microphone component is
driven by a client-side voice-activity detector (see ``custom_js``).
"""
import os
import tempfile
import traceback
from dataclasses import dataclass, field
from typing import Any, List, Tuple, Optional

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from gradio_client import Client
from ttsmms import download, TTS
from langdetect import detect

# ========================
# CONFIG - update as needed
# ========================
# Local ASR model (change to correct HF repo id or local path)
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"

# Remote LLM Gradio Space
llm_space = "Futuresony/Mr.Events"
llm_api_name = "/chat"

# TTS languages
sw_lang_code = "swh"  # ttsmms language code for Swahili (adjust if needed)
en_lang_code = "eng"

# Sample rate passed to the ASR processor; audio is resampled to this rate.
ASR_SAMPLE_RATE = 16000

# ========================
# LOAD MODELS / CLIENTS
# ========================
print("[INIT] Loading ASR processor & model...")
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_model.eval()

print("[INIT] Creating Gradio Client for LLM Space...")
llm_client = Client(llm_space)

print("[INIT] Downloading TTS models (this may take time)")
swahili_dir = download(sw_lang_code, "./data/swahili")
english_dir = download(en_lang_code, "./data/english")
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)


# ========================
# APP STATE
# ========================
@dataclass
class AppState:
    """Per-session state carried through the Gradio event handlers."""

    # Chat history as {"role": ..., "content": ...} dicts, oldest first.
    conversation: List[dict] = field(default_factory=list)
    # Most recent ASR output, LLM reply and synthesized wav path.
    last_transcription: Optional[str] = None
    last_reply: Optional[str] = None
    last_wav: Optional[str] = None


# ========================
# UTIL: Safe LLM call
# ========================
def safe_predict(prompt: str, api_name: str = llm_api_name, timeout: int = 30) -> str:
    """Query the remote LLM Space and always return a string.

    Defends against:
      - gradio_client JSON schema parsing errors
      - endpoints returning bool/list/tuple/dict
      - a call that does not finish within ``timeout`` seconds
      - other exceptions

    Args:
        prompt: Plain-text prompt sent as the endpoint's ``query`` argument.
        api_name: Name of the endpoint on the remote Space.
        timeout: Maximum number of seconds to wait for the result.

    Returns:
        The reply as a stripped string, or a string starting with
        ``"Error:"`` when the call failed. Never a bool or non-iterable.
    """
    try:
        # submit() + result(timeout=...) is what predict() does internally,
        # but lets us bound the wait instead of blocking the UI forever.
        job = llm_client.submit(query=prompt, api_name=api_name)
        result = job.result(timeout=timeout)
        print(f"[LLM] raw result: {repr(result)} (type={type(result)})")
    except Exception as e:
        # If gradio_client fails (schema issues, timeout, etc.), report it
        # as a reply string so the caller never has to handle an exception.
        print("[LLM] predict() raised an exception:")
        traceback.print_exc()
        return f"Error: could not contact LLM endpoint ({str(e)})"

    # Convert whatever we got into a string safely.
    if isinstance(result, str):
        return result.strip()

    if isinstance(result, (list, tuple)):
        try:
            return " ".join(map(str, result)).strip()
        except Exception:
            return str(result)

    # For bool/dict/None/other -> stringify
    try:
        return str(result).strip()
    except Exception as e:
        print("[LLM] Failed to stringify result:", e)
        return "Error: LLM returned an unsupported type."


# ========================
# ASR (Wav2Vec2) helpers
# ========================
def write_temp_wav_from_gr_numpy(audio_tuple: Tuple[Any, Any]) -> str:
    """Write Gradio numpy audio to a temporary mono WAV file and return its path.

    Gradio audio with ``type='numpy'`` is a ``(sample_rate, data)`` tuple.
    The legacy ``(data, sample_rate)`` order is accepted as well; whichever
    element is an ``np.ndarray`` is treated as the samples.

    ``data`` has shape ``(n_samples,)`` or ``(n_samples, n_channels)``.

    Args:
        audio_tuple: The value produced by the Gradio audio component.

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.

    Raises:
        ValueError: If there is no audio data.
    """
    if audio_tuple is None:
        raise ValueError("Empty audio")

    first, second = audio_tuple
    if isinstance(first, np.ndarray):
        array, sr = first, second
    else:
        sr, array = first, second

    if array is None:
        raise ValueError("Empty audio")
    array = np.asarray(array)
    if array.size == 0:
        raise ValueError("Empty audio")

    # soundfile expects floating-point samples in [-1, 1]. Integer PCM must be
    # scaled before any averaging, otherwise the float result of the channel
    # mean keeps the integer range and is clipped when written.
    if np.issubdtype(array.dtype, np.signedinteger):
        full_scale = float(np.iinfo(array.dtype).max) + 1.0
        array = array.astype(np.float32) / full_scale

    # If stereo, convert to mono by averaging channels.
    if array.ndim == 2:
        array = array.mean(axis=1)

    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_name = tmp.name
    tmp.close()
    try:
        sf.write(tmp_name, array, int(sr))
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
        raise
    return tmp_name


def transcribe_wav_file(wav_path: str) -> str:
    """Load a WAV file, downmix/resample it as needed and transcribe it.

    Args:
        wav_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription of the audio.
    """
    waveform, sr = torchaudio.load(wav_path)  # waveform: (channels, samples)

    # Convert to mono.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = waveform.squeeze(0)

    # Resample if necessary.
    if sr != ASR_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sr, new_freq=ASR_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    inputs = processor(
        waveform.numpy(),
        sampling_rate=ASR_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


# ========================
# TTS helper
# ========================
def synthesize_text_to_wav(text: str) -> Optional[str]:
    """Detect the language of ``text`` and synthesize it to ./output.wav.

    The output file is overwritten on each call. Text detected as Swahili
    uses the Swahili voice; everything else (including text whose language
    cannot be detected) uses the English voice.

    Args:
        text: Text to speak.

    Returns:
        The wav path, or ``None`` if ``text`` is empty or synthesis failed.
    """
    if not text:
        return None

    try:
        lang = detect(text)
    except Exception:
        # langdetect raises on text with no detectable features; use English.
        lang = "en"

    wav_path = "./output.wav"
    try:
        if lang and lang.startswith("sw"):
            swahili_tts.synthesis(text, wav_path=wav_path)
        else:
            english_tts.synthesis(text, wav_path=wav_path)
        return wav_path
    except Exception as e:
        print("[TTS] synthesis failed:", e)
        traceback.print_exc()
        return None


# ========================
# GRPC/HTTP flow functions (for Gradio event hooks)
# ========================
def process_audio_start(audio: Tuple[Any, Any], state: AppState):
    """Transcribe the recorded audio and append it to the conversation.

    Wired to the microphone's ``stop_recording`` event, which is when the
    recorded audio is available.

    Args:
        audio: Value of the Gradio audio component, or ``None``.
        state: Session state; updated in place.

    Returns:
        ``(state, text)`` where ``text`` is the transcription, an empty string
        when there was no audio, or an error message when transcription failed.
    """
    if audio is None:
        return state, ""

    wav = None
    try:
        wav = write_temp_wav_from_gr_numpy(audio)
        transcription = transcribe_wav_file(wav)
        print(f"[ASR] transcription: {transcription!r}")
        state.last_transcription = transcription
        # Append user message for context.
        state.conversation.append({"role": "user", "content": transcription})
        return state, transcription
    except Exception as e:
        print("[ASR] error:", e)
        traceback.print_exc()
        return state, f"Error in transcription: {str(e)}"
    finally:
        # Clean up the temp wav on both the success and the failure path.
        if wav is not None:
            try:
                os.remove(wav)
            except OSError:
                pass


def _conversation_to_pairs(conversation: List[dict]) -> List[Tuple[Optional[str], str]]:
    """Convert role/content messages into (user, bot) pairs for gr.Chatbot.

    Each user message is paired with the assistant message that directly
    follows it (or ``""`` if none does). An assistant message with no pending
    user message is emitted as ``(None, reply)`` so it is still displayed.
    """
    pairs: List[Tuple[Optional[str], str]] = []
    awaiting_reply = False
    for message in conversation:
        role = message.get("role")
        content = message.get("content", "")
        if role == "user":
            pairs.append((content, ""))
            awaiting_reply = True
        elif role == "assistant":
            if awaiting_reply:
                pairs[-1] = (pairs[-1][0], content)
            else:
                pairs.append((None, content))
            awaiting_reply = False
    return pairs


def generate_reply_stop(state: AppState):
    """Generate, record and synthesize the assistant's reply.

    Called after the transcription has been appended to ``state`` (chained
    after ``process_audio_start``) or from the manual button.

    Args:
        state: Session state; updated in place.

    Returns:
        ``(state, chat_history, wav_path)`` where ``chat_history`` is a list
        of (user, bot) pairs for ``gr.Chatbot`` and ``wav_path`` is the
        synthesized reply (``None`` if synthesis failed or an error occurred).
    """
    try:
        # Build messages for the LLM from state.conversation, prefixed with
        # the system prompt for the diet calorie assistant.
        system_prompt = (
            "In conversation with the user, ask questions to estimate and provide (1) total calories, "
            "(2) protein, carbs, and fat in grams, (3) fiber and sugar content. Only ask one question at a time. "
            "Be conversational and natural."
        )
        messages = [{"role": "system", "content": system_prompt}] + state.conversation

        # The remote Space takes a plain-text `query`, so flatten the messages
        # into one prompt. If the Space accepts structured messages, adapt this.
        prompt_text = "".join(
            f"[{m.get('role', 'user')}] {m.get('content', '')}\n" for m in messages
        )

        reply_text = safe_predict(prompt_text, api_name=llm_api_name)
        print("[LLM] reply:", reply_text)

        # Add assistant reply to conversation.
        state.conversation.append({"role": "assistant", "content": reply_text})
        state.last_reply = reply_text

        # Synthesize to wav (TTS).
        wav_path = synthesize_text_to_wav(reply_text)
        state.last_wav = wav_path

        return state, _conversation_to_pairs(state.conversation), wav_path
    except Exception as e:
        print("[LLM/TTS] error:", e)
        traceback.print_exc()
        return state, [("error", f"Error generating reply: {str(e)}")], None


# ========================
# CLIENT-SIDE VAD JS (embedded)
# ========================
# Gradio expects `js` to be a single function, which it calls on page load,
# so the function must not be invoked again here.
custom_js = r"""
async function main() {
  // Load ONNX runtime and VAD library dynamically
  const script1 = document.createElement("script");
  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
  document.head.appendChild(script1);

  const script2 = document.createElement("script");
  script2.onload = async () => {
    console.log("VAD loaded");
    var record = document.querySelector('.record-button');
    if (record) record.textContent = "Just Start Talking!";
    // create MicVAD and auto click the record/stop buttons
    try {
      const myvad = await vad.MicVAD.new({
        onSpeechStart: () => {
          var record = document.querySelector('.record-button');
          var player = document.querySelector('#streaming-out');
          if (record && (!player || player.paused)) {
            record.click();
          }
        },
        onSpeechEnd: () => {
          var stop = document.querySelector('.stop-button');
          if (stop) stop.click();
        }
      });
      myvad.start();
    } catch (e) {
      console.warn("VAD init failed:", e);
    }
  };
  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js";
  document.head.appendChild(script2);
}
"""

# ========================
# BUILD GRADIO UI
# ========================
with gr.Blocks(js=custom_js, title="ASR → LLM → TTS (Safe)") as demo:
    gr.Markdown("## Speak: ASR → LLM → TTS (defensive, production-friendly)")

    state = gr.State(AppState())

    with gr.Row():
        # `sources` (a list) is the Gradio 4+ spelling; `gr.Blocks(js=...)`
        # above already requires Gradio 4+.
        input_audio = gr.Audio(
            label="🎙 Speak (microphone)",
            sources=["microphone"],
            type="numpy",
            streaming=False,
            show_label=True,
        )
    with gr.Row():
        transcription_out = gr.Textbox(label="Transcription", interactive=False)
    with gr.Row():
        chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        output_audio = gr.Audio(label="Assistant speech (TTS)", type="filepath")

    # Wire events:
    # The recorded audio only exists once recording stops, so transcription
    # runs on stop_recording; the reply (LLM + TTS) is chained after it so it
    # always sees the freshly appended user message.
    input_audio.stop_recording(
        fn=process_audio_start,
        inputs=[input_audio, state],
        outputs=[state, transcription_out],
    ).then(
        fn=generate_reply_stop,
        inputs=[state],
        outputs=[state, chatbot, output_audio],
    )

    # Manual "Generate reply" button to trigger from the existing transcription.
    gen_btn = gr.Button("Generate reply (manual)")
    gen_btn.click(
        fn=generate_reply_stop,
        inputs=[state],
        outputs=[state, chatbot, output_audio],
    )

# ========================
# LAUNCH
# ========================
if __name__ == "__main__":
    demo.launch()