Spaces:
Sleeping
Sleeping
Commit
·
1526f72
1
Parent(s):
98620f5
Update voice installation and settings
Browse files
- Dockerfile +3 -0
- voice_util.py +43 -13
Dockerfile
CHANGED
@@ -99,6 +99,9 @@ done\n\
|
|
99 |
echo "Pulling gemma3 model..."\n\
|
100 |
ollama pull gemma3\n\
|
101 |
\n\
|
|
|
|
|
|
|
102 |
echo "Starting FastAPI application..."\n\
|
103 |
exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
|
104 |
chmod +x start.sh
|
|
|
99 |
echo "Pulling gemma3 model..."\n\
|
100 |
ollama pull gemma3\n\
|
101 |
\n\
|
102 |
+
echo "Install ffmpeg..."\n\
|
103 |
+
sudo apt install ffmpeg\n\
|
104 |
+
\n\
|
105 |
echo "Starting FastAPI application..."\n\
|
106 |
exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
|
107 |
chmod +x start.sh
|
voice_util.py
CHANGED
@@ -1,23 +1,53 @@
|
|
1 |
-
import whisper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import soundfile as sf
|
4 |
import io
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
|
|
|
|
|
|
|
|
8 |
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
# Load audio and convert to Whisper's required format
|
12 |
-
audio, sr = sf.read(io.BytesIO(file_bytes))
|
13 |
-
|
14 |
-
# Convert to mono if stereo
|
15 |
if len(audio.shape) > 1:
|
16 |
audio = np.mean(audio, axis=1)
|
17 |
-
|
18 |
-
# Resample to 16kHz
|
19 |
if sr != 16000:
|
20 |
-
import librosa
|
21 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
|
22 |
-
|
23 |
-
return audio.astype(np.float32)
|
|
|
1 |
+
# import whisper
|
2 |
+
# import numpy as np
|
3 |
+
# import soundfile as sf
|
4 |
+
# import io
|
5 |
+
# from tempfile import NamedTemporaryFile
|
6 |
+
# import os
|
7 |
+
|
8 |
+
|
9 |
+
|
10 |
+
# def load_audio(file_bytes):
|
11 |
+
# # Load audio and convert to Whisper's required format
|
12 |
+
# audio, sr = sf.read(io.BytesIO(file_bytes))
|
13 |
+
|
14 |
+
# # Convert to mono if stereo
|
15 |
+
# if len(audio.shape) > 1:
|
16 |
+
# audio = np.mean(audio, axis=1)
|
17 |
+
|
18 |
+
# # Resample to 16kHz if needed
|
19 |
+
# if sr != 16000:
|
20 |
+
# import librosa
|
21 |
+
# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
|
22 |
+
|
23 |
+
# return audio.astype(np.float32)
|
24 |
+
from pydub import AudioSegment
|
25 |
import numpy as np
|
26 |
import soundfile as sf
|
27 |
import io
|
28 |
+
import librosa
|
29 |
+
|
30 |
+
def load_audio(file_bytes):
    """Decode encoded audio bytes into a mono, 16 kHz float32 sample array.

    The input is decoded with pydub, re-encoded as an in-memory WAV, and
    read back with soundfile. Multi-dimensional sample data is averaged
    along axis 1, and the result is resampled with librosa whenever the
    decoded sample rate is not 16000.

    Args:
        file_bytes: Contents of an audio file, as accepted by ``io.BytesIO``.

    Returns:
        The samples as a ``numpy.float32`` array at a 16 kHz sample rate.

    Raises:
        ValueError: If wrapping or decoding the input with pydub fails; the
            underlying exception is attached as ``__cause__``.
    """
    try:
        # The BytesIO wrapper is built inside the try so that any failure on
        # the input (bad type or undecodable data) surfaces as ValueError.
        segment = AudioSegment.from_file(io.BytesIO(file_bytes))
    except Exception as exc:
        raise ValueError("Could not read audio file. Format might be unsupported.") from exc

    # Round-trip through an in-memory WAV so soundfile reads a format it
    # understands, whatever container the caller supplied.
    buffer = io.BytesIO()
    segment.export(buffer, format="wav")
    buffer.seek(0)
    samples, sample_rate = sf.read(buffer)

    # Anything with more than one dimension is collapsed by averaging axis 1.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Bring the signal to 16 kHz only when it is not already there.
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)

    return samples.astype(np.float32)
|