# ASRWhisper / app.py
import gradio as gr
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
# Load the fine-tuned Whisper Odia ASR model. The processor (feature extractor +
# tokenizer) is taken from the base openai/whisper-small checkpoint it was fine-tuned from.
asr_model = WhisperForConditionalGeneration.from_pretrained("ashutoshpattnaik50/whisper-small-odia-finetuned")
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
# Load an IndicTrans2 Indic-to-English model. The original id "ai4bharat/indictrans2-en"
# does not resolve on the Hub; the distilled Indic-to-English checkpoint is assumed here.
# IndicTrans2 ships custom modeling/tokenizer code, hence trust_remote_code=True.
translator = AutoModelForSeq2SeqLM.from_pretrained(
    "ai4bharat/indictrans2-indic-en-dist-200M", trust_remote_code=True
)
translator_tokenizer = AutoTokenizer.from_pretrained(
    "ai4bharat/indictrans2-indic-en-dist-200M", trust_remote_code=True
)
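# Sketch of the required input/output handling, assuming the IndicTransToolkit
# package (pip install IndicTransToolkit): its IndicProcessor applies the
# normalisation and "ory_Orya"/"eng_Latn" language tagging that IndicTrans2
# expects around tokenization.
from IndicTransToolkit import IndicProcessor

indic_processor = IndicProcessor(inference=True)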
# Transcription + translation pipeline
def transcribe_and_translate(audio_path):
    # Load the recording and downmix multi-channel audio to mono
    speech_array, sampling_rate = torchaudio.load(audio_path)
    if speech_array.shape[0] > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    # Whisper's feature extractor expects 16 kHz input
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    input_features = processor(
        speech_array.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features
    with torch.inference_mode():
        predicted_ids = asr_model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Translate Odia -> English: tag the sentence with its FLORES language codes
    # (ory_Orya -> eng_Latn), generate, then strip the tags and detokenize.
    batch = indic_processor.preprocess_batch([transcription], src_lang="ory_Orya", tgt_lang="eng_Latn")
    inputs = translator_tokenizer(batch, truncation=True, padding="longest", return_tensors="pt")
    with torch.inference_mode():
        output_tokens = translator.generate(**inputs, max_length=256, num_beams=5)
    decoded = translator_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
    translated = indic_processor.postprocess_batch(decoded, lang="eng_Latn")[0]
    return transcription.strip(), translated.strip()
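# Quick local smoke test (hypothetical file name), bypassing the Gradio UI:
#   odia_text, english_text = transcribe_and_translate("sample_odia.wav")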
# Gradio interface
interface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=gr.Audio(type="filepath", label="🎤 Record or Upload Odia Audio"),
    outputs=[
        gr.Textbox(label="📝 Odia Transcription"),
        gr.Textbox(label="🌐 English Translation"),
    ],
    title="Odia Whisper ASR + Translator",
    description="Speak in Odia and get instant transcription + English translation using Whisper and IndicTrans2.",
)
if __name__ == "__main__":
    interface.launch()
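# Assumed Space dependencies (requirements.txt): gradio, torch, torchaudio,
# transformers, and IndicTransToolkit.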