"""Gradio demo that transcribes speech and generates a text reply.

Pipeline: an uploaded or recorded audio file is transcribed with a
Wav2Vec2 CTC model, and the transcription is then sent as a prompt to a
hosted text-generation model through ``huggingface_hub.InferenceClient``.
"""

import gradio as gr
import torch
import torchaudio
from huggingface_hub import InferenceClient
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Sample rate (Hz) the audio is converted to before it reaches the processor.
TARGET_SAMPLE_RATE = 16000

# Load ASR model
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)

# Load text generation client
client = InferenceClient("unsloth/gemma-3-1b-it")


def transcribe(audio_file: str) -> str:
    """Transcribe an audio file to text with the ASR model.

    Args:
        audio_file: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcription, or an empty string when the file
        contains no audio frames.
    """
    waveform, sample_rate = torchaudio.load(audio_file)

    # Nothing to decode; avoid feeding a zero-length signal to the model.
    if waveform.numel() == 0:
        return ""

    # torchaudio.load returns (channels, frames) by default. Downmix to mono:
    # a 2-D array handed to the processor would be interpreted as a batch of
    # separate utterances, so only the first channel would be transcribed.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Only build a resampler when the file is not already at the target rate.
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    samples = waveform.squeeze(0).numpy()
    inputs = processor(
        samples, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits

    # Greedy CTC decoding: most likely token per frame, collapsed by the
    # processor into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


def generate_text(prompt: str) -> str:
    """Generate a response for ``prompt`` with the text-generation client.

    Args:
        prompt: Text sent to the hosted model as-is.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    response = client.text_generation(
        prompt, max_new_tokens=150, temperature=0.7
    )
    return response.strip()


def asr_and_generate(audio):
    """Gradio callback: transcribe ``audio`` and generate a reply to it.

    Args:
        audio: File path supplied by the ``gr.Audio`` input, or a falsy
            value when the user submitted nothing.

    Returns:
        A ``(transcription, generated_text)`` tuple matching the two output
        textboxes. When there is no audio, or no speech could be
        transcribed, the first element is a short notice and the second is
        empty.
    """
    if not audio:
        return "No audio provided.", ""

    transcription = transcribe(audio)

    # Do not call the remote model with an empty prompt.
    if not transcription.strip():
        return "No speech detected.", ""

    generated = generate_text(transcription)
    return transcription, generated


# Gradio interface
demo = gr.Interface(
    fn=asr_and_generate,
    inputs=gr.Audio(label="Upload or Record Audio", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
    ],
    title="ASR to Text Generation",
    description="Upload audio. The model will transcribe speech to text and generate a response using a fine-tuned text model.",
)

if __name__ == "__main__":
    demo.launch()