import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from huggingface_hub import InferenceClient

# Load the ASR processor and CTC model once at import time; transcribe()
# reuses these module-level objects on every call.
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)

# Text-generation client bound to the "unsloth/gemma-3-1b-it" model;
# generate_text() sends its prompts through this client.
client = InferenceClient("unsloth/gemma-3-1b-it")

# Function: Transcribe audio
def transcribe(audio_file):
    """Transcribe a speech recording to text with the Wav2Vec2 CTC model.

    The audio is loaded, mixed down to a single channel, resampled to
    16 kHz when necessary, and decoded greedily (argmax over the logits).

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcription string.
    """
    target_rate = 16000
    waveform, sample_rate = torchaudio.load(audio_file)

    # torchaudio.load returns a (channels, frames) tensor by default. A
    # multi-channel array handed to the processor is treated as a batch of
    # separate utterances, so only the first channel's text would be
    # returned. Average the channels into one mono signal instead.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Only build a resampler when the file is not already at the target rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # Drop the channel axis so the processor receives a single 1-D utterance.
    speech = waveform.squeeze(0).numpy()
    inputs = processor(speech, sampling_rate=target_rate, return_tensors="pt")

    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

# Function: Generate response based on transcription
def generate_text(prompt):
    """Generate a response to *prompt* through the text-generation client.

    Args:
        prompt: Text to send to the text-generation model.

    Returns:
        The generated text with surrounding whitespace stripped, or an
        empty string when *prompt* is empty or contains only whitespace.
    """
    # A silent or unintelligible recording produces an empty transcription;
    # skip the remote request rather than send an empty prompt.
    if not prompt or not prompt.strip():
        return ""
    response = client.text_generation(prompt, max_new_tokens=150, temperature=0.7)
    return response.strip()

# Gradio callback: transcribe the audio, then generate a response from the transcription
def asr_and_generate(audio):
    """Run the full pipeline: speech-to-text followed by text generation.

    Args:
        audio: Path of the audio file to process; a falsy value means no
            audio was supplied.

    Returns:
        A ``(transcription, generated_text)`` tuple. When no audio is
        supplied, the tuple is ``("No audio provided.", "")``.
    """
    if audio:
        text = transcribe(audio)
        return text, generate_text(text)
    return "No audio provided.", ""

# Web UI: one audio input (passed to the callback as a file path) and two
# text outputs matching the (transcription, response) tuple returned by
# asr_and_generate.
demo = gr.Interface(
    title="ASR to Text Generation",
    description="Upload audio. The model will transcribe speech to text and generate a response using a fine-tuned text model.",
    fn=asr_and_generate,
    inputs=gr.Audio(type="filepath", label="Upload or Record Audio"),
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Transcription", "AI Response")
    ],
)

# Launch the app as soon as this module is executed.
demo.launch()