fffiloni committed
Commit 1f28ca8 · verified · 1 Parent(s): ba71b21
Files changed (1)
  1. app.py +19 -1
app.py CHANGED
@@ -184,6 +184,24 @@ def get_ezaudio(prompt):
         raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
 
 def infer(image_in, chosen_model):
+    """
+    Generate an audio clip (sound effect) from an input image using the selected generative model.
+
+    This function first generates a caption from the provided image using a vision-language model.
+    The caption is then used as a text prompt for various audio generation models.
+
+    Args:
+        image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
+        chosen_model (str): The name of the audio generation model to use. Supported options include: "AudioLDM-2", "Tango", "Stable Audio Open".
+
+    Returns:
+        str | dict: The path or result object of the generated audio clip, depending on the model used.
+            If the model returns a list or a URL, the function provides that as output.
+
+    Example usage:
+        >>> infer("cat.png", "AudioLDM-2")
+        "outputs/audio/cat_sfx.wav"
+    """
     caption = get_caption_from_kosmos(image_in)
     if chosen_model == "MAGNet" :
         magnet_result = get_magnet(caption)
@@ -252,4 +270,4 @@ with gr.Blocks(css=css) as demo:
         outputs=[audio_o],
     )
 
-demo.queue(max_size=10).launch(debug=True, show_error=True)
+demo.queue(max_size=10).launch(debug=True, show_error=True, ssr_mode=False, mcp_server=True)