fffiloni committed
Commit 1f28ca8 · verified · 1 Parent(s): ba71b21
Files changed (1)
  1. app.py +19 -1
app.py CHANGED
@@ -184,6 +184,24 @@ def get_ezaudio(prompt):
         raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
 
 def infer(image_in, chosen_model):
+    """
+    Generate an audio clip (sound effect) from an input image using the selected generative model.
+
+    This function first generates a caption from the provided image using a vision-language model.
+    The caption is then used as a text prompt for various audio generation models.
+
+    Args:
+        image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
+        chosen_model (str): The name of the audio generation model to use. Supported options include: "AudioLDM-2", "Tango", "Stable Audio Open".
+
+    Returns:
+        str | dict: The path or result object of the generated audio clip, depending on the model used.
+            If the model returns a list or a URL, the function provides that as output.
+
+    Example usage:
+        >>> infer("cat.png", "AudioLDM-2")
+        "outputs/audio/cat_sfx.wav"
+    """
     caption = get_caption_from_kosmos(image_in)
     if chosen_model == "MAGNet" :
         magnet_result = get_magnet(caption)
@@ -252,4 +270,4 @@ with gr.Blocks(css=css) as demo:
         outputs=[audio_o],
     )
 
-demo.queue(max_size=10).launch(debug=True, show_error=True)
+demo.queue(max_size=10).launch(debug=True, show_error=True, ssr_mode=False, mcp_server=True)