csukuangfj commited on
Commit
33085cc
·
1 Parent(s): 5856dbb

minor fixes

Browse files
Files changed (2) hide show
  1. app.py +19 -7
  2. separate.py +27 -21
app.py CHANGED
@@ -107,11 +107,11 @@ def process(model_name, in_filename: str):
107
  logging.info(f"model_name: {model_name}")
108
  logging.info(f"in_filename: {in_filename}")
109
 
110
- waveform = load_audio(in_filename)
111
- waveform = np.transpose(waveform)
112
- waveform = np.ascontiguousarray(waveform)
113
 
114
- duration = waveform.shape[1] / 44100 # in seconds
115
 
116
  sp = load_model(model_name)
117
 
@@ -121,7 +121,7 @@ def process(model_name, in_filename: str):
121
 
122
  start = time.time()
123
 
124
- output = sp.process(sample_rate=44100, samples=waveform)
125
 
126
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
127
  end = time.time()
@@ -154,6 +154,17 @@ def process(model_name, in_filename: str):
154
 
155
 
156
  title = "# Source separation with Next-gen Kaldi"
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  # css style is copied from
159
  # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
@@ -172,9 +183,9 @@ with demo:
172
  gr.Markdown(title)
173
 
174
  model_dropdown = gr.Dropdown(
175
- choices=model_list[model_list[0]],
176
  label="Select a model",
177
- value=model_list[model_list[0]],
178
  )
179
 
180
  with gr.Tabs():
@@ -259,6 +270,7 @@ with demo:
259
  inputs=[model_dropdown, url_textbox],
260
  outputs=[url_vocals, url_non_vocals, url_html_info],
261
  )
 
262
 
263
  if __name__ == "__main__":
264
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
 
107
  logging.info(f"model_name: {model_name}")
108
  logging.info(f"in_filename: {in_filename}")
109
 
110
+ samples, sample_rate = load_audio(in_filename)
111
+ samples = np.transpose(samples)
112
+ samples = np.ascontiguousarray(samples)
113
 
114
+ duration = samples.shape[1] / sample_rate # in seconds
115
 
116
  sp = load_model(model_name)
117
 
 
121
 
122
  start = time.time()
123
 
124
+ output = sp.process(sample_rate=sample_rate, samples=samples)
125
 
126
  date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
127
  end = time.time()
 
154
 
155
 
156
  title = "# Source separation with Next-gen Kaldi"
157
+ description = """
158
+ This space shows how to do source separation with Next-gen Kaldi.
159
+
160
+ It is running on CPU within a docker container provided by Hugging Face.
161
+
162
+ See more information by visiting the following links:
163
+
164
+ - <https://github.com/k2-fsa/sherpa-onnx>
165
+
166
+ Everything is open-sourced.
167
+ """
168
 
169
  # css style is copied from
170
  # https://huggingface.co/spaces/alphacep/asr/blob/main/app.py#L113
 
183
  gr.Markdown(title)
184
 
185
  model_dropdown = gr.Dropdown(
186
+ choices=model_list[0],
187
  label="Select a model",
188
+ value=model_list[0],
189
  )
190
 
191
  with gr.Tabs():
 
270
  inputs=[model_dropdown, url_textbox],
271
  outputs=[url_vocals, url_non_vocals, url_html_info],
272
  )
273
+ gr.Markdown(description)
274
 
275
  if __name__ == "__main__":
276
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
separate.py CHANGED
@@ -1,39 +1,45 @@
1
  #!/usr/bin/env python3
2
  # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
3
 
 
 
4
  from functools import lru_cache
5
 
6
- import ffmpeg
7
  import numpy as np
8
- from huggingface_hub import hf_hub_download
9
  import sherpa_onnx
 
 
 
10
 
11
 
12
- sample_rate = 44100
 
 
 
13
 
 
 
 
 
14
 
15
- def load_audio(filename):
16
- probe = ffmpeg.probe(filename)
17
- if "streams" not in probe or len(probe["streams"]) == 0:
18
- raise ValueError("No stream was found with ffprobe")
19
 
20
- metadata = next(
21
- stream for stream in probe["streams"] if stream["codec_type"] == "audio"
22
- )
23
- n_channels = metadata["channels"]
24
 
25
- process = (
26
- ffmpeg.input(filename)
27
- .output("pipe:", format="f32le", ar=sample_rate)
28
- .run_async(pipe_stdout=True, pipe_stderr=True)
29
- )
30
- buffer, _ = process.communicate()
31
- waveform = np.frombuffer(buffer, dtype="<f4").reshape(-1, n_channels)
 
 
32
 
33
- if n_channels > 2:
34
- waveform = waveform[:, :2]
 
35
 
36
- return waveform
37
 
38
 
39
  @lru_cache(maxsize=10)
 
1
  #!/usr/bin/env python3
2
  # Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
3
 
4
+ import logging
5
+ import os
6
  from functools import lru_cache
7
 
 
8
  import numpy as np
 
9
  import sherpa_onnx
10
+ import soundfile as sf
11
+ from huggingface_hub import hf_hub_download
12
+ import uuid
13
 
14
 
15
def convert_to_wav(in_filename: str) -> str:
    """Convert the input audio file to a 44.1 kHz, 2-channel wave file.

    Args:
      in_filename:
        Path to the input audio file. Any format understood by ffmpeg works.

    Returns:
      Path of the generated wave file, which is the input path with
      ``.wav`` appended.
    """
    # Fix: removed a dead assignment (`out_filename = str(uuid.uuid4())`)
    # that was immediately overwritten by the line below.
    out_filename = f"{in_filename}.wav"

    logging.info(f"Converting '{in_filename}' to '{out_filename}'")

    # Fix: the command previously used "-ar 441000" (441 kHz), an obvious
    # typo for the intended 44100 Hz sample rate.
    # NOTE(review): a filename containing a single quote would break this
    # shell command; consider subprocess.run with a list of arguments.
    _ = os.system(
        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 44100 -ac 2 '{out_filename}' -y"
    )

    return out_filename
 
 
 
26
 
 
 
 
 
27
 
28
def load_audio(filename):
    """Load an audio file and return its samples with the sample rate.

    The file is first converted to a wave file (see convert_to_wav);
    the samples are returned as a float32 array of shape
    (num_channels, num_samples).
    """
    wav_filename = convert_to_wav(filename)

    data, rate = sf.read(wav_filename, dtype="float32", always_2d=True)

    # sf.read yields (num_samples, num_channels); transpose so that
    # channels come first.
    data = np.transpose(data)

    assert (
        data.shape[1] > data.shape[0]
    ), f"You should use (num_channels, num_samples). {data.shape}"

    assert (
        data.dtype == np.float32
    ), f"Expect np.float32 as dtype. Given: {data.dtype}"

    return data, rate
43
 
44
 
45
  @lru_cache(maxsize=10)