# VLV_Caption / app.py
import spaces
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load the VLV captioning model; trust_remote_code is required because the
# model class is defined in the Hub repository itself.
model_name_or_path = "lyttt/VLV_captioner"
model = AutoModel.from_pretrained(model_name_or_path, revision="master", trust_remote_code=True, low_cpu_mem_usage=False)

def drop_incomplete_tail(text):
    """Trim a trailing, unfinished sentence from the generated text."""
    sentences = text.split('.')
    complete_sentences = [s.strip() for s in sentences if s.strip()]
    # If the text does not end with a period, the last sentence is incomplete.
    if not text.strip().endswith('.'):
        complete_sentences = complete_sentences[:-1]
    return '. '.join(complete_sentences) + ('.' if complete_sentences else '')

@spaces.GPU(duration=120)
def caption_image(image):
    # Run inference without gradients; the model's remote code returns a
    # `generated_text` list for the single-image batch.
    with torch.no_grad():
        outputs = model([image], 300).generated_text[0]
    return outputs

def greet(image):
    # Gradio provides the image as a NumPy array; float arrays are assumed to
    # be in [0, 1] and are converted to 8-bit RGB before captioning.
    if image.dtype != np.uint8:
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
    image = Image.fromarray(image, mode='RGB')
    raw_text = caption_image(image)
    return drop_incomplete_tail(raw_text)

demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()