"""Gradio app: caption an uploaded image, then synthesize a sound effect from the caption.

The flow is two explicit steps driven by buttons:

1. ``update_caption`` -> ``analyze_image_with_free_model`` (BLIP image captioning).
2. ``generate_sound`` -> ``get_audioldm_from_caption`` (AudioLDM2 text-to-audio).
"""

# NOTE(review): `spaces` is kept as the first import, exactly as in the original;
# presumably it must be imported before torch on ZeroGPU hardware -- confirm.
import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pathlib import Path
from PIL import Image  # Decodes the uploaded image bytes
import io  # Wraps the uploaded bytes in a file-like object for Pillow

# --- Setup Models and Device ---
load_dotenv()
# Read from the environment; not referenced anywhere else in this file.
hf_token = os.getenv("HF_TKN")

# Use GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Sample rate (Hz) written into the WAV header.
# NOTE(review): presumably matches AudioLDM2's output rate -- confirm.
AUDIO_SAMPLE_RATE = 16000

# User-facing status messages that may end up in the caption textbox.
# They are named so that `generate_sound` can refuse to use them as prompts.
NO_IMAGE_MESSAGE = "Please upload an image first."
NO_CAPTION_MESSAGE = "No caption was generated."
_NON_CAPTION_MESSAGES = frozenset({NO_IMAGE_MESSAGE, NO_CAPTION_MESSAGE})

# Image captioning pipeline
captioning_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
print("Image captioning pipeline loaded.")

# Audio pipeline. Use float16 for less VRAM on GPU.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
print("Audio generation pipeline loaded.")


# --- Core Functions ---
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file_bytes):
    """Generate a caption for an image supplied as raw bytes.

    Args:
        image_file_bytes: Encoded image data (anything Pillow can open).

    Returns:
        A ``(text, is_error)`` tuple. On success ``text`` is the caption and
        ``is_error`` is ``False``. On failure ``text`` is a human-readable
        message and ``is_error`` is ``True``. This function never raises; any
        exception is logged and reported through the return value.
    """
    try:
        print("Received image bytes, opening with Pillow...")
        # Open the image data directly from memory using Pillow
        image = Image.open(io.BytesIO(image_file_bytes)).convert("RGB")

        print("Generating caption...")
        results = captioning_pipeline(image)

        if not results or not isinstance(results, list):
            print("ERROR: Caption generation returned invalid results.")
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            print("ERROR: Generated caption is empty.")
            return NO_CAPTION_MESSAGE, True

        print(f"Successfully generated caption: {caption}")
        return caption, False
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of crashing.
        print(f"!!!!!! EXCEPTION in analyze_image_with_free_model: {e}")
        return f"Error analyzing image: {e}", True


@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Synthesize audio for a text prompt and save it as a WAV file.

    Args:
        caption: Text prompt passed to the audio diffusion pipeline.

    Returns:
        Path of the generated temporary ``.wav`` file, or ``None`` if anything
        failed (the exception is logged). The file is created with
        ``delete=False``, so it is not removed by this function.
    """
    try:
        # Move the large audio pipeline to the GPU only when it's being used
        pipe.to(device)
        try:
            print(f"Generating audio for prompt: '{caption}'")
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=25,  # Fewer steps for faster generation
                guidance_scale=7.0,
            ).audios[0]
        finally:
            # Always move the pipeline back to CPU, even when inference fails,
            # so a failed request does not leave it on the GPU.
            pipe.to("cpu")

        # Reserve a path, then close the handle before writing: writing to a
        # path whose handle is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name

        print(f"Saving audio to temporary file: {wav_path}")
        # write(file, sample_rate, data)
        write(wav_path, AUDIO_SAMPLE_RATE, audio_output)
        return wav_path
    except Exception as e:
        print(f"!!!!!! EXCEPTION in get_audioldm_from_caption: {e}")
        return None


# --- Event Handlers ---
def update_caption(image_bytes):
    """Click handler: return the caption (or a status message) for the upload.

    Args:
        image_bytes: Content of the uploaded file, or ``None`` if nothing was
            uploaded.

    Returns:
        The text to show in the description textbox.
    """
    if image_bytes is None:
        return NO_IMAGE_MESSAGE
    description, _ = analyze_image_with_free_model(image_bytes)
    return description


def generate_sound(description):
    """Click handler: turn the current description into a sound effect.

    Args:
        description: Current content of the description textbox.

    Returns:
        Path to the generated WAV file, or ``None`` when the description is
        not a usable caption (a warning is shown in that case).

    Raises:
        gr.Error: If audio generation fails; Gradio shows it to the user.
    """
    text = (description or "").strip()
    # Reject empty text, error reports, and the app's own status messages:
    # none of them describe an image, so they must not be used as prompts.
    if not text or text.startswith("Error") or text in _NON_CAPTION_MESSAGES:
        gr.Warning("Cannot generate sound without a valid description!")
        return None

    audio_path = get_audioldm_from_caption(description)
    if audio_path is None:
        # gr.Error is an exception: it only reaches the user when raised.
        raise gr.Error("Failed to generate audio. Please check the logs.")
    return audio_path


# --- Gradio Interface ---
css = """
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # NOTE(review): only the text of this header is visible in the file;
        # any surrounding HTML markup appears to have been lost -- restore it
        # from version control if it existed.
        gr.HTML(
            "\n\n"
            "🎶 Generate Sound Effects from Image\n\n"
            "⚡ Powered by Bilsimaging\n\n"
        )

        gr.Markdown(
            "1. **Upload an image**.\n"
            "2. Click **Generate Description**.\n"
            "3. Click **Generate Sound Effect**.\n"
        )

        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", variant="primary")
        caption_display = gr.Textbox(label="Image Description", interactive=False)
        generate_sound_button = gr.Button("Generate Sound Effect")
        audio_output = gr.Audio(label="Generated Sound Effect")

        gr.Markdown(
            "## 👥 Contribute & Support\n\n"
            "For support, questions, or to contribute, please contact us at "
            "[contact@bilsimaging.com](mailto:contact@bilsimaging.com).\n\n"
            "Support our work and get involved by donating through "
            "[Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua\n"
        )

        generate_description_button.click(
            fn=update_caption,
            inputs=image_upload,
            outputs=caption_display,
        )

        generate_sound_button.click(
            fn=generate_sound,
            inputs=caption_display,
            outputs=audio_output,
        )

        # NOTE(review): the content of this HTML element is empty in the file
        # as seen here; presumably its markup was stripped -- confirm.
        gr.HTML("\n\n")

# Launch the app. `share=True` is not needed on Spaces.
demo.launch()