# Spaces: Sleeping
import spaces | |
import gradio as gr | |
from transformers import AutoModel, AutoProcessor | |
from PIL import Image | |
import torch | |
import numpy as np | |
# Hub repository that provides the captioning model.
model_name_or_path = "lyttt/VLV_captioner"

# Load the checkpoint from the "master" revision (running the repository's
# own model code via trust_remote_code) and move it to the CUDA device.
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="master",
    trust_remote_code=True,
    low_cpu_mem_usage=False,
).to("cuda")
def drop_incomplete_tail(text):
    """Return *text* reduced to its period-terminated sentences.

    The text is split on ``'.'``; blank fragments are discarded and the
    remaining ones are stripped of surrounding whitespace. When the stripped
    text does not end with a period, the last fragment is treated as an
    unfinished sentence and removed. The surviving fragments are re-joined
    with ``'. '`` and closed with a final period.

    Returns an empty string when no complete sentence remains.
    """
    fragments = []
    for piece in text.split('.'):
        trimmed = piece.strip()
        if trimmed:
            fragments.append(trimmed)

    # No closing period means the final fragment was cut off mid-sentence.
    ends_cleanly = text.strip().endswith('.')
    if not ends_cleanly:
        fragments = fragments[:-1]

    if not fragments:
        return ''
    return '. '.join(fragments) + '.'
def caption_image(image):
    """Generate a caption for a single image with the module-level model.

    The image is passed to the model as a one-element list together with the
    constant 77 (presumably a length limit -- confirm against the model's
    remote code). Gradient tracking is disabled for the call.

    Returns the first entry of the model output's ``generated_text``.
    """
    with torch.no_grad():
        result = model([image], 77)
        captions = result.generated_text
        return captions[0]
def greet(image):
    """Caption an uploaded image and return only its complete sentences.

    Args:
        image: Array supplied by the Gradio ``"image"`` input (anything with
            a ``dtype`` that ``Image.fromarray`` accepts as RGB data), or
            ``None`` when the request is submitted without an image.

    Returns:
        The generated caption with any unfinished trailing sentence removed,
        or an empty string when no image was supplied.
    """
    # Submitting without an image delivers None; return an empty caption
    # instead of failing on ``None.dtype``.
    if image is None:
        return ""

    if image.dtype != np.uint8:
        # Non-uint8 data is treated as values in [0, 1] and rescaled to 8-bit.
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)

    pil_image = Image.fromarray(image, mode='RGB')
    raw_text = caption_image(pil_image)
    return drop_incomplete_tail(raw_text)
# Web UI: one image input wired to `greet`, with the caption shown as text.
demo = gr.Interface(
    fn=greet,
    inputs="image",
    outputs="text",
)

# Start serving the interface.
demo.launch()