import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from huggingface_hub import InferenceClient
# Load ASR model
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
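# Note: both models are loaded once at startup, not per request. Wav2Vec2
# checkpoints expect 16 kHz mono audio, so inputs are resampled in transcribe().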
# Load text generation client
client = InferenceClient("unsloth/gemma-3-1b-it")
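# InferenceClient sends requests to the hosted Hugging Face Inference API;
# depending on the model, an access token may be required (read from the
# HF_TOKEN environment variable or passed via the `token` argument).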
# Function: Transcribe audio
def transcribe(audio_file):
    # Load the audio and convert it to mono 16 kHz, as expected by Wav2Vec2
    waveform, sample_rate = torchaudio.load(audio_file)
    if waveform.shape[0] > 1:  # downmix stereo to mono
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze().numpy()
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    # Greedy CTC decoding: pick the most likely token at each frame
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Function: Generate response based on transcription
def generate_text(prompt):
    response = client.text_generation(prompt, max_new_tokens=150, temperature=0.7)
    return response.strip()
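# Note: gemma-3-1b-it is an instruction-tuned chat model; raw text_generation
# works, but results may improve if the transcription is wrapped in the model's
# chat format (e.g. via client.chat_completion) instead of sent as plain text.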
# Gradio interface
def asr_and_generate(audio):
    if not audio:
        return "No audio provided.", ""
    transcription = transcribe(audio)
    generated = generate_text(transcription)
    return transcription, generated
demo = gr.Interface(
    fn=asr_and_generate,
    inputs=gr.Audio(label="Upload or Record Audio", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
    ],
    title="ASR to Text Generation",
    description="Upload or record audio. The model will transcribe speech to text and generate a response using a fine-tuned text model.",
)
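# launch() serves the app locally; on a Hugging Face Space it is started
# automatically. Pass share=True for a temporary public URL when run elsewhere.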
demo.launch()