# app.py
import os
import tempfile
import traceback
from dataclasses import dataclass, field
from typing import Any, List, Tuple, Optional

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from gradio_client import Client
from ttsmms import download, TTS
from langdetect import detect

# ========================
# CONFIG - update as needed
# ========================
# Identifier handed to Wav2Vec2Processor / Wav2Vec2ForCTC.from_pretrained() below
# (change to the correct HF repo id or a local path).
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"

# Remote LLM Gradio Space: the Space the gradio_client Client connects to, and
# the endpoint name that safe_predict() uses as its default `api_name`.
llm_space = "Futuresony/Mr.Events"
llm_api_name = "/chat"

# TTS languages: codes passed to ttsmms.download() below.
sw_lang_code = "swh"  # ttsmms language code for Swahili (adjust if needed)
en_lang_code = "eng"

# ========================
# LOAD MODELS / CLIENTS
# ========================
# NOTE: everything in this section runs at import time, so importing this
# module loads the ASR model, connects to the LLM Space and downloads the TTS
# models before any UI is built.
print("[INIT] Loading ASR processor & model...")
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
# Inference only: put the model in eval mode.
asr_model.eval()

print("[INIT] Creating Gradio Client for LLM Space...")
llm_client = Client(llm_space)

print("[INIT] Downloading TTS models (this may take time)")
# download() is given a language code and a target directory; whatever it
# returns is passed straight to TTS() to build one synthesizer per language.
swahili_dir = download(sw_lang_code, "./data/swahili")
english_dir = download(en_lang_code, "./data/english")
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)

# ========================
# APP STATE
# ========================
@dataclass
class AppState:
    """Conversation state threaded through the Gradio event handlers.

    An instance is stored in ``gr.State`` and passed to, mutated by, and
    returned from the handler functions in this file.
    """

    # Chat messages as {"role": ..., "content": ...} dicts, oldest first.
    conversation: List[dict] = field(default_factory=list)
    # Most recent ASR output, or None before the first transcription.
    last_transcription: Optional[str] = field(default=None)
    # Most recent LLM reply text, or None before the first reply.
    last_reply: Optional[str] = field(default=None)
    # Path of the most recent synthesized WAV, or None if there is none.
    last_wav: Optional[str] = field(default=None)

# ========================
# UTIL: Safe LLM call
# ========================
def safe_predict(prompt: str, api_name: str = llm_api_name, timeout: int = 30) -> str:
    """
    Query the remote LLM Space and always hand back a string.

    Defends against:
    - gradio_client errors (schema parsing, connection problems, ...)
    - the endpoint not answering within `timeout` seconds
    - endpoints returning bool/list/tuple/dict/None instead of text

    Args:
        prompt: Text sent to the endpoint as its `query` argument.
        api_name: Endpoint name on the Space.
        timeout: Maximum number of seconds to wait for the reply. Pass None
            to wait indefinitely.

    Returns:
        The reply as a stripped string. A None result becomes "" so that the
        literal text "None" is never shown or spoken. On failure, a string
        starting with "Error:" is returned instead of raising.
    """
    job = None
    try:
        # submit() returns a Job whose result() accepts a timeout; a plain
        # predict() call gives no way to bound how long we wait.
        job = llm_client.submit(query=prompt, api_name=api_name)
        result = job.result(timeout=timeout)
        print(f"[LLM] raw result: {repr(result)} (type={type(result)})")
    except Exception as e:
        # If gradio_client fails (schema issues, timeout, etc.), report it as text.
        print("[LLM] predict() raised an exception:")
        traceback.print_exc()
        if job is not None:
            # Best effort: do not leave a timed-out request running.
            try:
                job.cancel()
            except Exception as cancel_error:
                print("[LLM] could not cancel pending job:", cancel_error)
        # Timeout exceptions usually carry no message; fall back to the type name.
        detail = str(e) or type(e).__name__
        return f"Error: could not contact LLM endpoint ({detail})"

    # Convert whatever we got into a string safely.
    if result is None:
        return ""
    if isinstance(result, str):
        return result.strip()
    try:
        if isinstance(result, (list, tuple)):
            return " ".join(map(str, result)).strip()
        # bool/dict/other -> stringify
        return str(result).strip()
    except Exception as e:
        print("[LLM] Failed to stringify result:", e)
        return "Error: LLM returned an unsupported type."

# ========================
# ASR (Wav2Vec2) helpers
# ========================
def write_temp_wav_from_gr_numpy(audio_tuple: Tuple[np.ndarray, int]) -> str:
    """
    Write Gradio numpy audio to a temporary mono WAV file and return its path.

    Gradio audio (type='numpy') yields (np_array, sample_rate) where np_array
    has shape (n_samples, n_channels) or (n_samples,).

    Integer PCM is rescaled to float32 in [-1, 1) before any processing.
    Averaging integer stereo channels would otherwise produce floats at
    integer scale, which soundfile treats as out-of-range and clips.

    Args:
        audio_tuple: (samples, sample_rate) as delivered by gr.Audio.

    Returns:
        Path of the WAV file. The caller is responsible for deleting it.

    Raises:
        ValueError: If the samples are None or contain no data.
    """
    array, sr = audio_tuple
    if array is None:
        raise ValueError("Empty audio")
    array = np.asarray(array)
    if array.size == 0:
        raise ValueError("Empty audio")

    if np.issubdtype(array.dtype, np.integer):
        info = np.iinfo(array.dtype)
        if info.min < 0:
            # Signed PCM: divide by the magnitude of the most negative value.
            array = array.astype(np.float32) / float(-info.min)
        else:
            # Unsigned PCM is offset-binary: re-centre around zero first.
            half_range = (info.max + 1) / 2.0
            array = (array.astype(np.float32) - half_range) / half_range

    # If stereo, convert to mono by averaging channels
    if array.ndim == 2:
        array = np.mean(array, axis=1)

    # Only the name is needed; soundfile opens the path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_name = tmp.name
    try:
        sf.write(tmp_name, array, sr)
    except Exception:
        # Do not leave a stray temp file behind when writing fails.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
        raise
    return tmp_name

def transcribe_wav_file(wav_path: str) -> str:
    """Transcribe a WAV file with the module-level Wav2Vec2 model.

    The audio is loaded with torchaudio, mixed down to one channel,
    resampled to 16 kHz when the file uses another rate, and decoded
    greedily (argmax over the CTC logits).
    """
    target_rate = 16000

    audio, rate = torchaudio.load(wav_path)  # audio: (channels, samples)
    if audio.shape[0] > 1:
        # Mix multi-channel audio down by averaging the channels.
        audio = torch.mean(audio, dim=0, keepdim=True)
    samples = audio.squeeze(0)

    if rate != target_rate:
        to_target = torchaudio.transforms.Resample(orig_freq=rate, new_freq=target_rate)
        samples = to_target(samples)

    features = processor(
        samples.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        logits = asr_model(features.input_values).logits

    token_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0]

# ========================
# TTS helper
# ========================
def synthesize_text_to_wav(text: str) -> Optional[str]:
    """Speak `text` into ./output.wav and return that path.

    The language is guessed with langdetect: text detected as Swahili
    ("sw...") goes to the Swahili voice, everything else (including
    detection failures) goes to the English voice. The output file is
    overwritten on every call. Returns None for empty text or when
    synthesis fails.
    """
    if not text:
        return None

    # Fall back to English whenever detection cannot decide.
    try:
        lang = detect(text)
    except Exception:
        lang = "en"

    wav_path = "./output.wav"
    try:
        is_swahili = bool(lang and lang.startswith("sw"))
        engine = swahili_tts if is_swahili else english_tts
        engine.synthesis(text, wav_path=wav_path)
    except Exception as e:
        print("[TTS] synthesis failed:", e)
        traceback.print_exc()
        return None
    return wav_path

# ========================
# GRPC/HTTP flow functions (for Gradio event hooks)
# ========================
def process_audio_start(audio: Tuple[np.ndarray, int], state: AppState):
    """
    Transcribe the incoming audio and append it to the conversation as a user message.

    Called from a Gradio recording event (which one depends on how the
    events are wired).

    Args:
        audio: (samples, sample_rate) from gr.Audio, or None when there is
            nothing recorded yet.
        state: Conversation state; updated in place on success.

    Returns:
        (state, text) where text is the transcription, "" when `audio` is
        None, or an "Error in transcription: ..." message on failure. The
        state is left untouched on failure.
    """
    if audio is None:
        return state, ""

    wav = None
    try:
        wav = write_temp_wav_from_gr_numpy(audio)
        transcription = transcribe_wav_file(wav)
        print(f"[ASR] transcription: {transcription!r}")
        state.last_transcription = transcription
        # append user message for context
        state.conversation.append({"role": "user", "content": transcription})
        return state, transcription
    except Exception as e:
        print("[ASR] error:", e)
        traceback.print_exc()
        return state, f"Error in transcription: {str(e)}"
    finally:
        # Remove the temp WAV on every path, including failed transcriptions,
        # so errors do not accumulate files in the temp directory.
        if wav is not None:
            try:
                os.remove(wav)
            except OSError:
                pass  # best-effort cleanup; the file may already be gone

def _build_prompt(messages) -> str:
    """Flatten role/content messages into one "[role] content" line per message."""
    return "".join(
        f"[{m.get('role', 'user')}] {m.get('content', '')}\n" for m in messages
    )


def _conversation_to_pairs(conversation):
    """Convert role/content messages into (user, assistant) pairs for gr.Chatbot.

    Each user message starts a new pair; the assistant message directly
    following it fills in the reply. A user message without a reply is paired
    with "". An assistant message with no pending user message is kept as
    (None, reply) instead of being dropped, so the displayed history matches
    what was actually said.
    """
    pairs = []
    awaiting_reply = False
    for message in conversation:
        role = message["role"]
        content = message["content"]
        if role == "user":
            pairs.append((content, ""))
            awaiting_reply = True
        elif role == "assistant":
            if awaiting_reply:
                pairs[-1] = (pairs[-1][0], content)
            else:
                pairs.append((None, content))
            awaiting_reply = False
    return pairs


def generate_reply_stop(state: AppState):
    """
    Generate the assistant reply for the current conversation.

    Sends the conversation (prefixed with the system prompt) to the LLM via
    safe_predict, appends the reply to the conversation, and synthesizes it
    to speech.

    Args:
        state: Conversation state; updated in place.

    Returns:
        (state, chat_pairs, wav_path): the updated state, the history as
        (user, assistant) pairs for gr.Chatbot, and the synthesized WAV path
        (None if synthesis produced nothing). On any exception the history is
        replaced by a single ("error", message) pair and wav_path is None.
    """
    try:
        # Build messages for the LLM from state.conversation
        # (prefix with system prompt for diet calorie assistant as earlier)
        system_prompt = (
            "In conversation with the user, ask questions to estimate and provide (1) total calories, "
            "(2) protein, carbs, and fat in grams, (3) fiber and sugar content. Only ask one question at a time. "
            "Be conversational and natural."
        )
        messages = [{"role": "system", "content": system_prompt}] + state.conversation

        # The remote space is called with a plain-text `query`, so the
        # structured messages are flattened into a single prompt.
        prompt_text = _build_prompt(messages)

        reply_text = safe_predict(prompt_text, api_name=llm_api_name)
        print("[LLM] reply:", reply_text)

        # Add assistant reply to conversation
        state.conversation.append({"role": "assistant", "content": reply_text})
        state.last_reply = reply_text

        # Synthesize to wav (TTS)
        wav_path = synthesize_text_to_wav(reply_text)
        state.last_wav = wav_path

        return state, _conversation_to_pairs(state.conversation), wav_path
    except Exception as e:
        print("[LLM/TTS] error:", e)
        traceback.print_exc()
        return state, [("error", f"Error generating reply: {str(e)}")], None

# ========================
# CLIENT-SIDE VAD JS (embedded)
# ========================
# JavaScript passed to gr.Blocks(js=...). It appends two <script> tags
# (onnxruntime-web and @ricky0123/vad-web from jsDelivr) and, once the VAD
# bundle has loaded, creates a MicVAD that clicks the element matching
# '.record-button' when speech starts and '.stop-button' when speech ends.
# The record click is skipped while the '#streaming-out' element exists and is
# not paused.
# NOTE(review): no component visible in this file sets elem_id "streaming-out",
# so that lookup presumably returns null and the guard never blocks - confirm.
# NOTE(review): both <script> tags are appended without waiting for the first
# to load - confirm the VAD bundle does not need `ort` to be ready first.
# The string is raw (r"...") and is runtime text: do not edit it for style.
custom_js = r"""
async function main() {
  // Load ONNX runtime and VAD library dynamically
  const script1 = document.createElement("script");
  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
  document.head.appendChild(script1);

  const script2 = document.createElement("script");
  script2.onload = async () =>  {
    console.log("VAD loaded");
    var record = document.querySelector('.record-button');
    if (record) record.textContent = "Just Start Talking!";
    // create MicVAD and auto click the record/stop buttons
    try {
      const myvad = await vad.MicVAD.new({
        onSpeechStart: () => {
          var record = document.querySelector('.record-button');
          var player = document.querySelector('#streaming-out');
          if (record && (!player || player.paused)) {
            record.click();
          }
        },
        onSpeechEnd: () => {
          var stop = document.querySelector('.stop-button');
          if (stop) stop.click();
        }
      });
      myvad.start();
    } catch (e) {
      console.warn("VAD init failed:", e);
    }
  };
  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js";
  document.head.appendChild(script2);
}
main();
"""

# ========================
# BUILD GRADIO UI
# ========================
# Layout: microphone input, transcription text, chat history, TTS playback,
# plus a manual button that re-runs reply generation on the current state.
with gr.Blocks(js=custom_js, title="ASR → LLM → TTS (Safe)") as demo:
    gr.Markdown("## Speak: ASR → LLM → TTS (defensive, production-friendly)")

    state = gr.State(AppState())

    with gr.Row():
        # `sources=[...]` is the keyword used by the Gradio releases that also
        # support `gr.Blocks(js=...)`; the older `source=` keyword is rejected there.
        input_audio = gr.Audio(
            label="🎙 Speak (microphone)",
            sources=["microphone"],
            type="numpy",
            streaming=False,
            show_label=True,
        )

    with gr.Row():
        transcription_out = gr.Textbox(label="Transcription", interactive=False)
    with gr.Row():
        chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        output_audio = gr.Audio(label="Assistant speech (TTS)", type="filepath")

    # Wire events:
    # The recording only exists once recording has stopped, so transcription
    # runs on stop_recording. Reply generation is chained with .then() so it
    # always runs after the new user message has been stored in `state`
    # (two independent listeners would have no guaranteed order).
    input_audio.stop_recording(
        fn=process_audio_start,
        inputs=[input_audio, state],
        outputs=[state, transcription_out],
    ).then(
        fn=generate_reply_stop,
        inputs=[state],
        outputs=[state, chatbot, output_audio],
    )

    # Also add a manual "Generate reply" button if user wants to trigger from existing transcription
    gen_btn = gr.Button("Generate reply (manual)")
    gen_btn.click(fn=generate_reply_stop, inputs=[state], outputs=[state, chatbot, output_audio])

# ========================
# LAUNCH
# ========================
# Start the Gradio server only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()