# Spaces: Running
import gradio as gr | |
import torch | |
import torchaudio | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
from huggingface_hub import InferenceClient | |
# Text-generation client used by generate_text().
client = InferenceClient("unsloth/gemma-3-1b-it")

# Speech-recognition checkpoint: the CTC model and its matching processor
# are both loaded from the same model id.
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
# Function: Transcribe audio
def transcribe(audio_file):
    """Transcribe a recording with the module-level Wav2Vec2 CTC model.

    The audio is loaded with ``torchaudio.load``, downmixed to a single
    channel, resampled to 16 kHz when needed, and decoded greedily
    (argmax over the CTC logits).

    Args:
        audio_file: Path to the audio file, as accepted by
            ``torchaudio.load``.

    Returns:
        The decoded transcription string.
    """
    target_rate = 16000
    waveform, sample_rate = torchaudio.load(audio_file)

    # torchaudio.load returns (channels, frames) by default. Average over the
    # channel axis so stereo/multi-channel files become one mono utterance;
    # a plain squeeze() only collapses the single-channel case and would
    # leave a 2-D array for anything else.
    waveform = waveform.mean(dim=0)

    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    inputs = processor(
        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # A single utterance was submitted, so take the first decoded item.
    return processor.batch_decode(predicted_ids)[0]
# Function: Generate response based on transcription
def generate_text(prompt):
    """Generate a reply to *prompt* with the module-level inference client.

    Args:
        prompt: Text to send to the text-generation endpoint.

    Returns:
        The generated text with surrounding whitespace stripped, or an empty
        string when *prompt* is empty/whitespace-only (no request is made).
    """
    # Nothing to respond to (e.g. silence transcribed as ""): skip the
    # remote call instead of submitting an empty input.
    if not prompt or not prompt.strip():
        return ""
    response = client.text_generation(prompt, max_new_tokens=150, temperature=0.7)
    return response.strip()
# Gradio interface
def asr_and_generate(audio):
    """Run the full pipeline for one Gradio submission.

    Args:
        audio: Value delivered by the audio input component; a falsy value
            means nothing was uploaded or recorded.

    Returns:
        A ``(transcription, generated_response)`` tuple. When no audio is
        given, the tuple is ``("No audio provided.", "")``.
    """
    if audio:
        spoken_text = transcribe(audio)
        reply = generate_text(spoken_text)
        return spoken_text, reply
    return "No audio provided.", ""
# Single-function UI: one audio input, two text outputs that line up with the
# (transcription, generated) tuple returned by asr_and_generate.
demo = gr.Interface(
    title="ASR to Text Generation",
    description="Upload audio. The model will transcribe speech to text and generate a response using a fine-tuned text model.",
    fn=asr_and_generate,
    inputs=gr.Audio(label="Upload or Record Audio", type="filepath"),
    outputs=[
        gr.Textbox(label=caption)
        for caption in ("Transcription", "AI Response")
    ],
)

# Start the app as soon as the script runs.
demo.launch()