ashpikachu2k1 committed
Commit d21bd17 · verified · 1 Parent(s): 168f145

Update app.py

Files changed (1): app.py (+45 -34)

app.py CHANGED
@@ -1,50 +1,61 @@
-import gradio as gr
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
-
-# Load Whisper Odia ASR model (fine-tuned)
-asr_model = WhisperForConditionalGeneration.from_pretrained("ashutoshpattnaik50/whisper-small-odia-finetuned")
-processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-
-# Load IndicTrans2 Odia-to-English model
-translator = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en")
-translator_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en")
-
-# Transcription + Translation function
-def transcribe_and_translate(audio_path):
-    # Load and preprocess audio
-    from datasets import load_dataset, Audio
-    import numpy as np
-    import torchaudio
-
-    speech_array, sampling_rate = torchaudio.load(audio_path)
-    if sampling_rate != 16000:
-        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
-        speech_array = resampler(speech_array)
-
-    input_features = processor(speech_array.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features
-
-    predicted_ids = asr_model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-    # Translate to English
-    inputs = translator_tokenizer(transcription, return_tensors="pt")
-    output_tokens = translator.generate(**inputs)
-    translated = translator_tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
-
-    return transcription.strip(), translated.strip()
-
-# Gradio Interface
 interface = gr.Interface(
-    fn=transcribe_and_translate,
-    inputs=gr.Audio(type="filepath", label="🎤 Record or Upload Odia Audio"),
     outputs=[
         gr.Textbox(label="📝 Odia Transcription"),
         gr.Textbox(label="🌐 English Translation")
     ],
-    title="Odia Whisper ASR + Translator",
-    description="Speak in Odia and get instant transcription + English translation using Whisper and IndicTrans2.",
 )
-
-if __name__ == "__main__":
-    interface.launch()

 import torch
+import torchaudio
+import gradio as gr
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load Whisper fine-tuned Odia model
+whisper_model_path = "./whisper-odia-final"  # Change if needed
+processor = WhisperProcessor.from_pretrained(whisper_model_path)
+model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path).to(device)
+
+# Load IndicTrans2 Indic-to-English model (custom code on the Hub, so trust_remote_code is required)
+trans_model_id = "ai4bharat/indictrans2-indic-en-dist-200M"
+translator_tokenizer = AutoTokenizer.from_pretrained(trans_model_id, trust_remote_code=True)
+translator_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_id, trust_remote_code=True).to(device)
+
+# Translation function with language tags
+def translate_to_english(text):
+    if not text.strip():
+        return ""
+
+    # Prepend IndicTrans2 source/target language tags (ory_Orya = Odia, eng_Latn = English)
+    text_with_lang = f"ory_Orya eng_Latn {text.strip()}"
+    inputs = translator_tokenizer(text_with_lang, return_tensors="pt", padding=True).to(device)
+
+    output = translator_model.generate(**inputs, max_length=256)
+    translated = translator_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
+    return translated
+
+# ASR + Translation Pipeline
+def transcribe(audio_path):
+    if audio_path is None:
+        return "No audio received.", ""
+
+    speech, sr = torchaudio.load(audio_path)
+    if sr != 16000:
+        speech = torchaudio.functional.resample(speech, sr, 16000)
+
+    input_features = processor(speech.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(device)
+
+    predicted_ids = model.generate(input_features)
+    odia_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+    english_text = translate_to_english(odia_text)
+    return odia_text, english_text
+
+# Gradio UI
 interface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(source="microphone", type="filepath", label="🎤 Record or Upload Odia Audio"),
     outputs=[
         gr.Textbox(label="📝 Odia Transcription"),
         gr.Textbox(label="🌐 English Translation")
     ],
+    title="Whisper Odia ASR + Translation",
+    description="🎙️ Speak in Odia → Get Odia transcription → Get English translation using IndicTrans2"
 )
+
+interface.launch()
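
Review note: prepending the `ory_Orya eng_Latn` tags by hand works mechanically, but the IndicTrans2 model card routes text through `IndicProcessor` from the IndicTransToolkit package, which also normalizes input before tokenization and detokenizes the decoder output. A minimal sketch of that path with a hypothetical `odia_to_english` helper, assuming the toolkit's published API (`pip install IndicTransToolkit`; names may shift between versions):

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    from IndicTransToolkit import IndicProcessor

    model_id = "ai4bharat/indictrans2-indic-en-dist-200M"
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True)
    ip = IndicProcessor(inference=True)  # handles normalization and language tagging

    def odia_to_english(sentences):
        # preprocess_batch normalizes each sentence and prepends the "ory_Orya eng_Latn" tags
        batch = ip.preprocess_batch(sentences, src_lang="ory_Orya", tgt_lang="eng_Latn")
        inputs = tokenizer(batch, padding="longest", truncation=True, return_tensors="pt")
        with torch.no_grad():
            out = model.generate(**inputs, max_length=256, num_beams=5)
        decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
        # postprocess_batch detokenizes the raw decoder output for the target language
        return ip.postprocess_batch(decoded, lang="eng_Latn")

Two smaller things worth checking on the Space: `gr.Audio(source="microphone", ...)` is the Gradio 3.x signature (Gradio 4 renamed it to `sources=["microphone"]`), and `speech.squeeze()` still hands the processor a 2-D array for stereo recordings, so downmixing first (e.g. `speech.mean(dim=0)`) is safer.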