import os
import uuid

import soundfile as sf
import streamlit as st
import torch
from pydub import AudioSegment
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Sample rate (Hz) the audio is resampled to before being handed to the
# processor; this is the value the original code declared to the processor.
TARGET_SAMPLE_RATE = 16000

# Set device and dtype: half precision on GPU, full precision on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32


# Load Whisper model from Hugging Face
@st.cache_resource
def load_model():
    """Load the Whisper model and processor and build an ASR pipeline.

    Decorated with ``st.cache_resource`` so Streamlit reuses the returned
    objects instead of reloading the model on every script run.

    Returns:
        A ``(pipe, processor)`` tuple: the "automatic-speech-recognition"
        pipeline wrapping the model, and the processor it was built from.
    """
    model_id = "openai/whisper-large-v2"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe, processor


def _load_mono_16k(path):
    """Rewrite the WAV at *path* as mono TARGET_SAMPLE_RATE audio and read it.

    Returns:
        The ``(samples, sample_rate)`` pair produced by ``soundfile.read``.
    """
    sound = AudioSegment.from_file(path)
    # Convert to mono AND resample: the processor is told the audio is at
    # TARGET_SAMPLE_RATE, so the samples must really be at that rate.
    sound = sound.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)
    sound.export(path, format="wav")  # Save the processed file
    return sf.read(path)


def _transcribe(audio, sample_rate):
    """Run the model on *audio* and return the decoded transcription text."""
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    # Move tensors to the model's device. Floating-point tensors are also cast
    # to torch_dtype so they match the model weights (float16 on CUDA);
    # non-float tensors keep their dtype.
    inputs = {
        key: (
            value.to(device, dtype=torch_dtype)
            if value.is_floating_point()
            else value.to(device)
        )
        for key, value in inputs.items()
    }
    # NOTE(review): Whisper's feature extractor works on fixed 30-second
    # windows, so longer uploads are presumably truncated here; `pipe` with
    # chunking could handle long audio. No language is forced either, despite
    # the "Hindi" title. Confirm the intended behavior for both.
    with torch.no_grad():
        outputs = pipe.model.generate(**inputs)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


# Load model and processor
pipe, processor = load_model()

# Streamlit UI
st.title("Hindi Audio to Text Transcription")
uploaded_file = st.file_uploader(
    "Upload a .wav audio file for transcription", type=["wav"]
)

if uploaded_file is not None:
    st.info("Processing uploaded file...")
    # A unique name keeps temp files from separate uploads from colliding.
    temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
    try:
        with open(temp_filename, "wb") as f:
            f.write(uploaded_file.read())

        # Preprocess the audio, then perform transcription
        audio, sample_rate = _load_mono_16k(temp_filename)
        transcription = _transcribe(audio, sample_rate)

        # Display the transcription
        st.success("Transcription complete!")
        st.markdown(f"### Transcription:\n\n{transcription}")
    finally:
        # Clean up the temporary file even if decoding or inference failed.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
else:
    st.warning("Please upload a .wav file to start transcription.")