Michael Hu committed
Commit b591083 · 1 Parent(s): 6f92dbc

refactor(stt): replace whisper with faster-whisper for improved performance


Switch from the transformers-based Whisper implementation to faster-whisper for better speed and memory efficiency. The new implementation makes torch optional (it is used only to detect CUDA, with a CPU fallback when it is absent) and selects a compute type suited to the available hardware: float16 on GPU, int8 on CPU.
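In outline, the pattern this commit adopts looks like the following. This is a minimal sketch, assuming faster-whisper is installed (pip install faster-whisper); the model size, compute types, and beam size mirror the diff below, while "audio.wav" is a placeholder path:

    from faster_whisper import WhisperModel

    # torch is optional here: it is only used to detect CUDA
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    except ImportError:
        device = "cpu"

    # float16 on GPU, int8 quantization on CPU, as in the diff below
    compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    # transcribe() returns a lazy segment generator plus metadata
    segments, info = model.transcribe("audio.wav", beam_size=5,
                                      language="en", task="transcribe")
    text = " ".join(segment.text.strip() for segment in segments)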

Files changed (1)
utils/stt.py +38 -46
utils/stt.py CHANGED
@@ -11,10 +11,8 @@ from abc import ABC, abstractmethod
 
 logger = logging.getLogger(__name__)
 
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+from faster_whisper import WhisperModel as FasterWhisperModel
 from pydub import AudioSegment
-import soundfile as sf
 
 class ASRModel(ABC):
     """Base class for ASR models"""
@@ -43,64 +41,58 @@ class ASRModel(ABC):
 
 
 class WhisperModel(ASRModel):
-    """Whisper ASR model implementation"""
+    """Faster Whisper ASR model implementation"""
 
     def __init__(self):
         self.model = None
-        self.processor = None
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Check for CUDA availability without torch dependency
+        try:
+            import torch
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        except ImportError:
+            # Fallback to CPU if torch is not available
+            self.device = "cpu"
+        self.compute_type = "float16" if self.device == "cuda" else "int8"
 
     def load_model(self):
-        """Load Whisper model"""
-        logger.info("Loading Whisper model")
+        """Load Faster Whisper model"""
+        logger.info("Loading Faster Whisper model")
         logger.info(f"Using device: {self.device}")
+        logger.info(f"Using compute type: {self.compute_type}")
 
-        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            "openai/whisper-large-v3",
-            torch_dtype=torch.float32,
-            low_cpu_mem_usage=True,
-            use_safetensors=True
-        ).to(self.device)
-
-        self.processor = AutoProcessor.from_pretrained("unsloth/whisper-large-v3")
-        logger.info("Whisper model loaded successfully")
+        # Use large-v3 model with appropriate compute type based on device
+        self.model = FasterWhisperModel(
+            "large-v3",
+            device=self.device,
+            compute_type=self.compute_type
+        )
+        logger.info("Faster Whisper model loaded successfully")
 
     def transcribe(self, audio_path):
-        """Transcribe audio using Whisper"""
-        if self.model is None or self.processor is None:
+        """Transcribe audio using Faster Whisper"""
+        if self.model is None:
             self.load_model()
 
         wav_path = self.preprocess_audio(audio_path)
 
-        # Processing
-        logger.info("Processing audio input")
-        logger.debug("Loading audio data")
-        audio_data, sample_rate = sf.read(wav_path)
-        audio_data = audio_data.astype(np.float32)
+        # Transcription with Faster Whisper
+        logger.info("Generating transcription with Faster Whisper")
+        segments, info = self.model.transcribe(
+            wav_path,
+            beam_size=5,
+            language="en",
+            task="transcribe"
+        )
 
-        # Increase chunk length and stride for longer transcriptions
-        inputs = self.processor(
-            audio_data,
-            sampling_rate=16000,
-            return_tensors="pt",
-            # Increase chunk length to handle longer segments
-            chunk_length_s=60,
-            stride_length_s=10
-        ).to(self.device)
-
-        # Transcription
-        logger.info("Generating transcription")
-        with torch.no_grad():
-            # Add max_length parameter to allow for longer outputs
-            outputs = self.model.generate(
-                **inputs,
-                language="en",
-                task="transcribe",
-                max_length=448,  # Explicitly set max output length
-                no_repeat_ngram_size=3  # Prevent repetition in output
-            )
+        logger.info(f"Detected language '{info.language}' with probability {info.language_probability}")
+
+        # Collect all segments into a single text
+        result_text = ""
+        for segment in segments:
+            result_text += segment.text + " "
+            logger.debug(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
 
-        result = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        result = result_text.strip()
         logger.info(f"Transcription completed successfully")
         return result
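One behavioral note on the new transcribe(): faster-whisper returns segments as a lazy generator, so the actual decoding happens while the for loop above iterates over it, not inside the model.transcribe() call itself. If eager results are ever needed (for example to time the decode, or to iterate twice), the generator can be materialized first. A sketch, using the same names as the diff:

    segments, info = self.model.transcribe(wav_path, beam_size=5,
                                           language="en", task="transcribe")
    segments = list(segments)  # decoding actually runs here, not in transcribe()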