"""Gradio demo that captions an uploaded image with the VLV captioner model."""

import re

import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model_name_or_path = "lyttt/VLV_captioner"

# NOTE(review): trust_remote_code=True executes Python code shipped in the
# model repository, and revision="master" follows a moving branch. Consider
# pinning a specific commit hash once a known-good revision is identified.
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="master",
    trust_remote_code=True,
    low_cpu_mem_usage=False,
)
model = model.to("cuda")

# A period only ends a sentence when it is followed by whitespace or the end
# of the text; this keeps decimals ("3.5") and ellipses ("...") intact.
_SENTENCE_END = re.compile(r"\.(?=\s|$)")


def drop_incomplete_tail(text):
    """Return *text* with a trailing unfinished sentence removed.

    The text is split at sentence-ending periods. If the text does not end
    with a period, the last fragment is treated as incomplete and dropped.
    The remaining sentences are re-joined with ". " and terminated with a
    single period.

    Args:
        text: Raw generated caption.

    Returns:
        The caption restricted to complete sentences, or an empty string
        when no complete sentence is present.

    Examples:
        >>> drop_incomplete_tail("A dog runs. It is")
        'A dog runs.'
        >>> drop_incomplete_tail("It weighs 3.5 kg. The")
        'It weighs 3.5 kg.'
        >>> drop_incomplete_tail("no period here")
        ''
    """
    fragments = (part.strip() for part in _SENTENCE_END.split(text))
    sentences = [fragment for fragment in fragments if fragment]
    if not text.strip().endswith("."):
        # The final fragment was cut off before reaching its period.
        sentences = sentences[:-1]
    if not sentences:
        return ""
    return ". ".join(sentences) + "."


# NOTE(review): presumably reserves a ZeroGPU device for up to 120 seconds
# per call - confirm against the `spaces` package documentation.
@spaces.GPU(duration=120)
def caption_image(image):
    """Run the captioning model on a single image and return its text.

    Args:
        image: The image to caption (``greet`` passes a PIL image).

    Returns:
        The first entry of the model output's ``generated_text``.
    """
    with torch.no_grad():
        # NOTE(review): 77 is passed positionally to the model's remote code;
        # presumably a maximum caption length - confirm against the model repo.
        outputs = model([image], 77).generated_text[0]
    return outputs


def greet(image):
    """Gradio handler: caption an uploaded image.

    Args:
        image: NumPy array supplied by the Gradio image input, or ``None``
            when the user submits without uploading an image.

    Returns:
        The generated caption with any unfinished trailing sentence removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Gradio passes None when the form is submitted without an image;
        # surface a readable message instead of an AttributeError.
        raise gr.Error("Please upload an image before submitting.")

    if image.dtype != np.uint8:
        # Non-uint8 arrays are clipped to [0, 1] and rescaled to 0-255.
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)

    # Let Pillow infer the mode from the array shape, then normalise to RGB so
    # grayscale or RGBA inputs are converted rather than misinterpreted.
    image = Image.fromarray(image).convert("RGB")

    raw_text = caption_image(image)
    return drop_incomplete_tail(raw_text)


demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()