nareauow committed on
Commit
c4e0b50
·
verified ·
1 Parent(s): 6ad5f27

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -10
app.py CHANGED
@@ -122,16 +122,13 @@ class AudioProcessor:
122
 
123
  return mfcc_tensor
124
 
125
- # Speech recognition function
126
  def recognize_speech(audio_path):
127
  if speech_recognizer is None or speech_processor is None:
128
  return "Speech recognition model not available"
129
 
130
  try:
131
- # Read audio file
132
  audio_data, sr = sf.read(audio_path)
133
 
134
- # Resample to 16kHz if needed
135
  if sr != 16000:
136
  audio_data = np.interp(
137
  np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
@@ -140,15 +137,27 @@ def recognize_speech(audio_path):
140
  )
141
  sr = 16000
142
 
143
- # Process audio
144
- inputs = speech_processor(audio_data, sampling_rate=sr, return_tensors="pt")
145
- inputs = {k: v.to(device) for k, v in inputs.items()}
 
 
146
 
147
- # Generate transcription
148
- generated_ids = speech_recognizer.generate(**inputs)
149
- transcription = speech_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
 
 
150
 
151
- return transcription
 
 
 
 
 
 
152
  except Exception as e:
153
  return f"Speech recognition error: {str(e)}"
154
 
 
122
 
123
  return mfcc_tensor
124
 
 
125
  def recognize_speech(audio_path):
126
  if speech_recognizer is None or speech_processor is None:
127
  return "Speech recognition model not available"
128
 
129
  try:
 
130
  audio_data, sr = sf.read(audio_path)
131
 
 
132
  if sr != 16000:
133
  audio_data = np.interp(
134
  np.linspace(0, len(audio_data), int(16000 * len(audio_data) / sr)),
 
137
  )
138
  sr = 16000
139
 
140
+ inputs = speech_processor(
141
+ audio_data,
142
+ sampling_rate=sr,
143
+ return_tensors="pt"
144
+ ).to(device)
145
 
146
+ generated_ids = speech_recognizer.generate(
147
+ input_features=inputs["input_features"],
148
+ max_length=100,
149
+ num_beams=5, # Changed from 1 to 5 for better results
150
+ early_stopping=True,
151
+ no_repeat_ngram_size=2
152
+ )
153
 
154
+ transcription = speech_processor.batch_decode(
155
+ generated_ids,
156
+ skip_special_tokens=True
157
+ )[0]
158
+
159
+ return transcription.strip()
160
+
161
  except Exception as e:
162
  return f"Speech recognition error: {str(e)}"
163