abdibrahem committed on
Commit
1526f72
·
1 Parent(s): 98620f5

Update voice installation and settings

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -0
  2. voice_util.py +43 -13
Dockerfile CHANGED
@@ -99,6 +99,9 @@ done\n\
99
  echo "Pulling gemma3 model..."\n\
100
  ollama pull gemma3\n\
101
  \n\
 
 
 
102
  echo "Starting FastAPI application..."\n\
103
  exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
104
  chmod +x start.sh
 
99
  echo "Pulling gemma3 model..."\n\
100
  ollama pull gemma3\n\
101
  \n\
102
+ echo "Install ffmpeg..."\n\
103
+ sudo apt install ffmpeg\n\
104
+ \n\
105
  echo "Starting FastAPI application..."\n\
106
  exec uvicorn main:app --host 0.0.0.0 --port 7860' > start.sh && \
107
  chmod +x start.sh
voice_util.py CHANGED
@@ -1,23 +1,53 @@
1
- import whisper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import soundfile as sf
4
  import io
5
- from tempfile import NamedTemporaryFile
6
- import os
 
 
 
 
 
 
7
 
 
 
 
 
8
 
 
 
9
 
10
- def load_audio(file_bytes):
11
- # Load audio and convert to Whisper's required format
12
- audio, sr = sf.read(io.BytesIO(file_bytes))
13
-
14
- # Convert to mono if stereo
15
  if len(audio.shape) > 1:
16
  audio = np.mean(audio, axis=1)
17
-
18
- # Resample to 16kHz if needed
19
  if sr != 16000:
20
- import librosa
21
  audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
22
-
23
- return audio.astype(np.float32)
 
1
+ # import whisper
2
+ # import numpy as np
3
+ # import soundfile as sf
4
+ # import io
5
+ # from tempfile import NamedTemporaryFile
6
+ # import os
7
+
8
+
9
+
10
+ # def load_audio(file_bytes):
11
+ # # Load audio and convert to Whisper's required format
12
+ # audio, sr = sf.read(io.BytesIO(file_bytes))
13
+
14
+ # # Convert to mono if stereo
15
+ # if len(audio.shape) > 1:
16
+ # audio = np.mean(audio, axis=1)
17
+
18
+ # # Resample to 16kHz if needed
19
+ # if sr != 16000:
20
+ # import librosa
21
+ # audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
22
+
23
+ # return audio.astype(np.float32)
24
+ from pydub import AudioSegment
25
  import numpy as np
26
  import soundfile as sf
27
  import io
28
+ import librosa
29
+
30
+ def load_audio(file_bytes):
31
+ # Try to load audio using pydub for format compatibility
32
+ try:
33
+ audio_segment = AudioSegment.from_file(io.BytesIO(file_bytes))
34
+ except Exception as e:
35
+ raise ValueError("Could not read audio file. Format might be unsupported.") from e
36
 
37
+ # Export to WAV in-memory
38
+ wav_io = io.BytesIO()
39
+ audio_segment.export(wav_io, format="wav")
40
+ wav_io.seek(0)
41
 
42
+ # Read the exported WAV using soundfile
43
+ audio, sr = sf.read(wav_io)
44
 
45
+ # Convert to mono
 
 
 
 
46
  if len(audio.shape) > 1:
47
  audio = np.mean(audio, axis=1)
48
+
49
+ # Resample to 16kHz
50
  if sr != 16000:
 
51
  audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
52
+
53
+ return audio.astype(np.float32)