import gradio as gr
from PIL import Image
import torch
from transformers import Blip2ForConditionalGeneration, AutoProcessor

# Load your fine-tuned model and processor from local directories
processor = AutoProcessor.from_pretrained("./processor")
model = Blip2ForConditionalGeneration.from_pretrained(
    "./model", device_map="auto", torch_dtype=torch.float16
)

# Inference function
def generate_caption(image: Image.Image) -> str:
    # Convert image to RGB and process
    image = image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(model.device, torch.float16)

    # Generate caption
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=25)
    caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return caption

# Gradio UI
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="🖼️ Image Captioning with Fine-Tuned BLIP2",
    description="Upload an image to generate a caption using your custom fine-tuned BLIP2 model.",
)

if __name__ == "__main__":
    iface.launch()
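# Usage note (a sketch, assuming the standard save_pretrained layout; the
# file name app.py is illustrative, not from the original):
# the ./processor and ./model directories are expected to be the outputs of
#   processor.save_pretrained("./processor")
#   model.save_pretrained("./model")
# after fine-tuning. Run the app locally with:
#   python app.py
# Passing share=True to iface.launch() serves the demo at a temporary
# public Gradio URL in addition to localhost.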