import torch
import gradio as gr
from diffusers import (
    StableDiffusionPipeline,
    StableDiffusionInstructPix2PixPipeline,
    StableVideoDiffusionPipeline,
    WanPipeline,
)
from diffusers.utils import export_to_video, load_image

# Detect device & dtype
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32


# Factory to load a pipeline and set up device placement / offloading
def make_pipe(cls, model_id, **kwargs):
    pipe = cls.from_pretrained(model_id, torch_dtype=dtype, **kwargs)
    if device == "cuda":
        # Offload sub-models to CPU while they are not in use to keep peak VRAM low.
        # Note: offloading manages device placement itself, so don't call .to("cuda") afterwards.
        pipe.enable_model_cpu_offload()
    else:
        pipe.to(device)
    return pipe


# Hold pipelines in globals but don't load yet (each one is created lazily on first use)
TXT2IMG_PIPE = None
IMG2IMG_PIPE = None
TXT2VID_PIPE = None
IMG2VID_PIPE = None


def generate_image_from_text(prompt):
    global TXT2IMG_PIPE
    if TXT2IMG_PIPE is None:
        TXT2IMG_PIPE = make_pipe(
            StableDiffusionPipeline, "stabilityai/stable-diffusion-2-1-base"
        )
    return TXT2IMG_PIPE(prompt, num_inference_steps=20).images[0]


def generate_image_from_image_and_prompt(image, prompt):
    global IMG2IMG_PIPE
    if IMG2IMG_PIPE is None:
        IMG2IMG_PIPE = make_pipe(
            StableDiffusionInstructPix2PixPipeline, "timbrooks/instruct-pix2pix"
        )
    out = IMG2IMG_PIPE(prompt=prompt, image=image, num_inference_steps=8)
    return out.images[0]


def generate_video_from_text(prompt):
    global TXT2VID_PIPE
    if TXT2VID_PIPE is None:
        TXT2VID_PIPE = make_pipe(
            WanPipeline, "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
        )
    frames = TXT2VID_PIPE(prompt=prompt, num_frames=12).frames[0]
    return export_to_video(frames, "wan_video.mp4", fps=8)


def generate_video_from_image(image):
    global IMG2VID_PIPE
    if IMG2VID_PIPE is None:
        IMG2VID_PIPE = make_pipe(
            StableVideoDiffusionPipeline,
            "stabilityai/stable-video-diffusion-img2vid-xt",
            variant="fp16" if dtype == torch.float16 else None,
        )
    # Downscale the input so SVD fits in modest VRAM
    image = load_image(image).resize((512, 288))
    frames = IMG2VID_PIPE(image, num_inference_steps=16).frames[0]
    return export_to_video(frames, "svd_video.mp4", fps=8)


with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Lightweight Any-to-Any AI Playground")

    with gr.Tab("Text → Image"):
        inp = gr.Textbox(label="Prompt")
        out = gr.Image()
        gr.Button("Generate").click(generate_image_from_text, inp, out)

    with gr.Tab("Image → Image"):
        # type="pil" so the pipelines receive a PIL image rather than a NumPy array
        img = gr.Image(type="pil", label="Input Image")
        prm = gr.Textbox(label="Edit Prompt")
        out2 = gr.Image()
        gr.Button("Generate").click(generate_image_from_image_and_prompt, [img, prm], out2)

    with gr.Tab("Text → Video"):
        inp2 = gr.Textbox(label="Prompt")
        out_vid = gr.Video()
        gr.Button("Generate").click(generate_video_from_text, inp2, out_vid)

    with gr.Tab("Image → Video"):
        img2 = gr.Image(type="pil", label="Input Image")
        out_vid2 = gr.Video()
        gr.Button("Animate").click(generate_video_from_image, img2, out_vid2)

demo.launch()