import time

import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

Image.MAX_IMAGE_PIXELS = None  # disable Pillow's decompression-bomb size limit

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")


def caption(img, min_new, max_new):
    raw_image = Image.open(img).convert("RGB")
    raw_image.thumbnail((1024, 1024))  # downscale in place to keep CPU inference fast
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(
        **inputs,
        min_new_tokens=int(min_new),  # sliders return floats; generate expects ints
        max_new_tokens=int(max_new),
    )
    return processor.decode(out[0], skip_special_tokens=True)


def greet(img, min_new, max_new):
    if img is None:
        return "❌ Please upload an image."
    if min_new > max_new:
        return "⚠️ Min New Tokens must not exceed Max New Tokens."
    start = time.time()
    try:
        result = caption(img, min_new, max_new)
    except Exception as e:
        return f"⚠️ Error: {e}"
    elapsed = time.time() - start
    return f"{result}\n⏱ Took {elapsed:.2f} seconds"


iface = gr.Interface(
    fn=greet,
    title="BLIP Image Captioning (large)",
    description="Uses Salesforce/blip-image-captioning-large on CPU.",
    inputs=[
        gr.Image(type="filepath", label="Image"),
        gr.Slider(label="Min New Tokens", minimum=1, maximum=50, value=5, step=1),
        gr.Slider(label="Max New Tokens", minimum=1, maximum=100, value=20, step=1),
    ],
    outputs=gr.Textbox(label="Caption"),
    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
)

iface.launch()