ai-agent / voice_util.py
abdibrahem's picture
Update voice installation and settings
1526f72
# import whisper
# import numpy as np
# import soundfile as sf
# import io
# from tempfile import NamedTemporaryFile
# import os
# def load_audio(file_bytes):
# # Load audio and convert to Whisper's required format
# audio, sr = sf.read(io.BytesIO(file_bytes))
# # Convert to mono if stereo
# if len(audio.shape) > 1:
# audio = np.mean(audio, axis=1)
# # Resample to 16kHz if needed
# if sr != 16000:
# import librosa
# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
# return audio.astype(np.float32)
from pydub import AudioSegment
import numpy as np
import soundfile as sf
import io
import librosa
def load_audio(file_bytes):
    """Decode encoded audio bytes into a mono, 16 kHz, float32 sample array.

    The input is decoded with pydub, re-encoded as an in-memory WAV, and
    read back with soundfile so that any container pydub can open ends up
    as a plain NumPy array.

    Args:
        file_bytes: The encoded audio content as a bytes-like object.

    Returns:
        A NumPy ``float32`` array of samples. Multi-channel input is
        averaged across channels, and the signal is resampled to 16 kHz
        whenever the decoded sample rate differs from that.

    Raises:
        ValueError: If pydub fails to open the input for any reason.
    """
    target_rate = 16000

    # pydub handles format detection; any failure at this stage is reported
    # uniformly as an unsupported/unreadable file.
    try:
        segment = AudioSegment.from_file(io.BytesIO(file_bytes))
    except Exception as err:
        raise ValueError("Could not read audio file. Format might be unsupported.") from err

    # Round-trip through an in-memory WAV so soundfile can parse the samples.
    buffer = io.BytesIO()
    segment.export(buffer, format="wav")
    buffer.seek(0)
    samples, rate = sf.read(buffer)

    # Collapse multi-channel audio to a single channel by averaging.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Bring the signal to the target sample rate when it differs.
    if rate != target_rate:
        samples = librosa.resample(samples, orig_sr=rate, target_sr=target_rate)

    return samples.astype(np.float32)