Spaces:

abdibrahem
/

ai-agent

Sleeping

abdibrahem committed on Jun 28

Commit

1526f72

1 Parent(s): 98620f5

Update voice installation and settings

Files changed (2) hide show

Dockerfile CHANGED Viewed

@@ -99,6 +99,9 @@ done\n\
 echo "Pulling gemma3 model..."\n\
 ollama pull gemma3\n\
 \n\
 echo "Starting FastAPI application..."\n\
 exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
     chmod +x start.sh

 echo "Pulling gemma3 model..."\n\
 ollama pull gemma3\n\
 \n\
+echo "Install ffmpeg..."\n\
+sudo apt install  ffmpeg\n\
+\n\
 echo "Starting FastAPI application..."\n\
 exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
     chmod +x start.sh

voice_util.py CHANGED Viewed

@@ -1,23 +1,53 @@
-import whisper
 import numpy as np
 import soundfile as sf
 import io
-from tempfile import NamedTemporaryFile
-import os
-def load_audio(file_bytes):
-    # Load audio and convert to Whisper's required format
-    audio, sr = sf.read(io.BytesIO(file_bytes))
-    # Convert to mono if stereo
     if len(audio.shape) > 1:
         audio = np.mean(audio, axis=1)
-    # Resample to 16kHz if needed
     if sr != 16000:
-        import librosa
         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-    return audio.astype(np.float32)

+# import whisper
+# import numpy as np
+# import soundfile as sf
+# import io
+# from tempfile import NamedTemporaryFile
+# import os
+# def load_audio(file_bytes):
+#     # Load audio and convert to Whisper's required format
+#     audio, sr = sf.read(io.BytesIO(file_bytes))
+#     # Convert to mono if stereo
+#     if len(audio.shape) > 1:
+#         audio = np.mean(audio, axis=1)
+#     # Resample to 16kHz if needed
+#     if sr != 16000:
+#         import librosa
+#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+#     return audio.astype(np.float32)
+from pydub import AudioSegment
 import numpy as np
 import soundfile as sf
 import io
+import librosa
+def load_audio(file_bytes):
+    # Try to load audio using pydub for format compatibility
+    try:
+        audio_segment = AudioSegment.from_file(io.BytesIO(file_bytes))
+    except Exception as e:
+        raise ValueError("Could not read audio file. Format might be unsupported.") from e
+    # Export to WAV in-memory
+    wav_io = io.BytesIO()
+    audio_segment.export(wav_io, format="wav")
+    wav_io.seek(0)
+    # Read the exported WAV using soundfile
+    audio, sr = sf.read(wav_io)
+    # Convert to mono
     if len(audio.shape) > 1:
         audio = np.mean(audio, axis=1)
+    # Resample to 16kHz
     if sr != 16000:
         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+    return audio.astype(np.float32)