import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from huggingface_hub import InferenceClient

# Load ASR model
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)

# Load text generation client
client = InferenceClient("unsloth/gemma-3-1b-it")
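
# Note: depending on the model's access settings and Inference API rate
# limits, the client may need a Hugging Face token. A minimal sketch (the
# HF_TOKEN env var name is an assumption, not part of the original app):
#   import os
#   client = InferenceClient("unsloth/gemma-3-1b-it", token=os.environ.get("HF_TOKEN"))
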
# Function: Transcribe audio
def transcribe(audio_file):
    waveform, sample_rate = torchaudio.load(audio_file)
    # Collapse multi-channel audio to mono; the ASR model expects a 1-D signal.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Wav2Vec2 models expect 16 kHz input, so resample only when needed.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = waveform.squeeze().numpy()
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
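
# Example usage outside Gradio (the path "sample.wav" is illustrative only):
#   print(transcribe("sample.wav"))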

# Function: Generate response based on transcription
def generate_text(prompt):
    response = client.text_generation(prompt, max_new_tokens=150, temperature=0.7)
    return response.strip()

# Pipeline: transcribe the audio, then generate a text response
def asr_and_generate(audio):
    if not audio:
        return "No audio provided.", ""
    transcription = transcribe(audio)
    generated = generate_text(transcription)
    return transcription, generated

# Gradio interface
demo = gr.Interface(
    fn=asr_and_generate,
    inputs=gr.Audio(label="Upload or Record Audio", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
    ],
    title="ASR to Text Generation",
    description="Upload or record audio. The app transcribes the speech to text and generates a response using a fine-tuned text model.",
)

demo.launch()
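
# Note: launch() works as-is on Hugging Face Spaces; for a temporary public
# link when running locally, demo.launch(share=True) is an option (not used
# in the original app).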