File size: 1,413 Bytes
1526f72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e019578
 
 
1526f72
 
 
 
 
 
 
 
e019578
1526f72
 
 
 
e019578
1526f72
 
e019578
1526f72
e019578
 
1526f72
 
e019578
 
1526f72
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# import whisper
# import numpy as np
# import soundfile as sf
# import io
# from tempfile import NamedTemporaryFile
# import os



# def load_audio(file_bytes):
#     # Load audio and convert to Whisper's required format
#     audio, sr = sf.read(io.BytesIO(file_bytes))
    
#     # Convert to mono if stereo
#     if len(audio.shape) > 1:
#         audio = np.mean(audio, axis=1)
    
#     # Resample to 16kHz if needed
#     if sr != 16000:
#         import librosa
#         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    
#     return audio.astype(np.float32)
from pydub import AudioSegment
import numpy as np
import soundfile as sf
import io
import librosa

def load_audio(file_bytes: bytes, target_sr: int = 16000) -> np.ndarray:
    """Decode encoded audio bytes into a mono float32 waveform.

    The bytes are decoded with pydub, re-encoded as WAV in memory, read back
    with soundfile, averaged across channels when more than one is present,
    and resampled with librosa when the decoded rate differs from
    ``target_sr``.

    Args:
        file_bytes: Contents of an audio file.
        target_sr: Sample rate (Hz) of the returned waveform. Defaults to
            16000, which keeps the behaviour existing callers rely on.

    Returns:
        The waveform as a ``numpy.float32`` array.

    Raises:
        ValueError: If ``target_sr`` is not positive, or if pydub cannot
            decode ``file_bytes``.
    """
    if target_sr <= 0:
        raise ValueError("target_sr must be a positive sample rate.")

    # Decoding is the boundary with untrusted input: any failure raised by the
    # decoder is reported as a single ValueError, with the cause chained.
    try:
        segment = AudioSegment.from_file(io.BytesIO(file_bytes))
    except Exception as e:
        raise ValueError("Could not read audio file. Format might be unsupported.") from e

    # Round-trip through an in-memory WAV so soundfile can parse the samples.
    wav_buffer = io.BytesIO()
    segment.export(wav_buffer, format="wav")
    wav_buffer.seek(0)

    samples, source_sr = sf.read(wav_buffer)

    # Multi-channel data has a second axis; average it down to mono.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Only resample when the decoded rate differs from the requested one.
    if source_sr != target_sr:
        samples = librosa.resample(samples, orig_sr=source_sr, target_sr=target_sr)

    return samples.astype(np.float32)