"""Gradio demo that captions an uploaded image with the VLV captioner model."""

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model_name_or_path = "lyttt/VLV_captioner"

# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository; the revision is pinned to "master" here -- confirm that repo and
# revision are trusted before deploying.
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="master",
    trust_remote_code=True,
    low_cpu_mem_usage=False,
)


def greet(image):
    """Return the caption the model generates for *image*.

    Args:
        image: The image handed over by the Gradio ``"image"`` input, as a
            NumPy array, or ``None`` when nothing was uploaded. Arrays that
            are not ``uint8`` are clipped to ``[0, 1]`` and rescaled to
            ``[0, 255]`` (assumes such arrays are floats in the 0-1 range --
            TODO confirm against the callers).

    Returns:
        The first generated caption, or an empty string when no image was
        supplied.
    """
    # Gradio passes None when the user submits without selecting an image.
    if image is None:
        return ""

    if image.dtype != np.uint8:
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)

    # Let Pillow infer the mode from the array shape, then normalise to RGB so
    # grayscale or RGBA arrays are accepted as well.
    pil_image = Image.fromarray(image).convert("RGB")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        caption = model([pil_image]).generated_text[0]

    # The original returned the undefined name `output` (NameError on every
    # call); return the caption that was actually computed.
    return caption


demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()