Spaces:

nareauow
/

speaker-recognition

Sleeping

nareauow committed on May 1

Commit

c4e0b50

verified ·

1 Parent(s): 6ad5f27

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -122,16 +122,13 @@ class AudioProcessor:
         return mfcc_tensor
-# Speech recognition function
 def recognize_speech(audio_path):
     if speech_recognizer is None or speech_processor is None:
         return "Speech recognition model not available"
     try:
-        # Read audio file
         audio_data, sr = sf.read(audio_path)
-        # Resample to 16kHz if needed
         if sr != 16000:
             audio_data = np.interp(
                 np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
@@ -140,15 +137,27 @@ def recognize_speech(audio_path):
             )
             sr = 16000
-        # Process audio
-        inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-        # Generate transcription
-        generated_ids = speech_recognizer.generate(**inputs)
-        transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return transcription
     except Exception as e:
         return f"Speech recognition error: {str(e)}"

         return mfcc_tensor
 def recognize_speech(audio_path):
     if speech_recognizer is None or speech_processor is None:
         return "Speech recognition model not available"
     try:
         audio_data, sr = sf.read(audio_path)
         if sr != 16000:
             audio_data = np.interp(
                 np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
             )
             sr = 16000
+        inputs = speech_processor(
+            audio_data,
+            sampling_rate=sr,
+            return_tensors="pt"
+        ).to(device)
+        generated_ids = speech_recognizer.generate(
+            input_features=inputs["input_features"],
+            max_length=100,
+            num_beams=5,  # Changed from 1 to 5 for better results
+            early_stopping=True,
+            no_repeat_ngram_size=2
+        )
+        transcription = speech_processor.batch_decode(
+            generated_ids,
+            skip_special_tokens=True
+        )[0]
+        return transcription.strip()
     except Exception as e:
         return f"Speech recognition error: {str(e)}"