import torch
import torchaudio  # ✅ Added torchaudio to handle audio resampling
import gradio as gr
import time
import numpy as np
import scipy.io.wavfile
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# ✅ 1️⃣ Force Model to Run on CPU
device = "cpu"
torch_dtype = torch.float32  # Use CPU-friendly float type
MODEL_NAME = "openai/whisper-tiny"  # ✅ Switched to smallest model for fastest performance

# ✅ 2️⃣ Load Whisper Tiny Model on CPU (Removed `low_cpu_mem_usage=True`)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    use_safetensors=True,  # ✅ Removed low_cpu_mem_usage
)
model.to(device)

# ✅ 3️⃣ Load Processor & Pipeline
processor = AutoProcessor.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=2,  # ✅ Process in 2-second chunks for ultra-low latency
    torch_dtype=torch_dtype,
    device=device,
    # ✅ Note: `pipeline()` takes no `sampling_rate` argument; Whisper's feature
    # extractor already expects 16 kHz, so incoming audio is resampled inside
    # stream_transcribe() instead.
)

# ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
        sr, y = new_chunk

        # ✅ Convert stereo to mono
        if y.ndim > 1:
            y = y.mean(axis=1)

        # ✅ Normalize to [-1, 1]; guard against all-zero (silent) chunks
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # ✅ Resample audio
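        # --- Minimal sketch of the rest of the callback; the original listing
        # truncates at the comment above. Assumptions: torchaudio.functional.resample
        # brings each chunk to the 16 kHz Whisper expects, and Gradio's `stream`
        # state accumulates audio across chunks. The return shape (stream, text)
        # is an illustrative choice, not confirmed by the source. ---
        if sr != 16000:
            y = torchaudio.functional.resample(
                torch.from_numpy(y), orig_freq=sr, new_freq=16000
            ).numpy()

        # Accumulate streamed chunks so the model sees the full utterance so far
        stream = y if stream is None else np.concatenate([stream, y])

        text = pipe({"sampling_rate": 16000, "raw": stream})["text"]
        print(f"Chunk latency: {time.time() - start_time:.2f}s")
        return stream, text
    except Exception as e:
        return stream, f"Error: {e}"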
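
# --- Hedged sketch: one way to wire the callback into a live Gradio app.
# The section shown does not include the interface, so this layout
# (a gr.State holding the accumulated audio plus a streaming microphone
# input) is an assumption, not the author's UI. ---
demo = gr.Interface(
    fn=stream_transcribe,
    inputs=[gr.State(), gr.Audio(sources=["microphone"], streaming=True)],
    outputs=[gr.State(), gr.Textbox(label="Transcription")],
    live=True,
)

if __name__ == "__main__":
    demo.launch()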