File size: 1,098 Bytes
ddebdd0
d1b0574
ddebdd0
c65a1f7
ddebdd0
 
d1b0574
ea1edb1
ddebdd0
5e14c45
ea1edb1
ddebdd0
 
 
c50c7e8
ddebdd0
 
c50c7e8
ddebdd0
 
c65a1f7
bf55b92
ddebdd0
816d008
ddebdd0
 
 
 
 
 
c65a1f7
 
ddebdd0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import spaces
import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import numpy as np

model_name_or_path = "lyttt/VLV_captioner"
# Load the VLV captioning model once at module import. trust_remote_code executes
# the custom model code shipped with the hub repo (trusted source assumed);
# low_cpu_mem_usage=False forces a full eager load of the weights.
model = AutoModel.from_pretrained(model_name_or_path, revision="master", trust_remote_code=True,low_cpu_mem_usage=False)
# Keep the model resident on the GPU; caption_image below runs under @spaces.GPU.
model = model.to("cuda")

def drop_incomplete_tail(text):
    """Remove a trailing unfinished sentence from *text*.

    The text is split on '.'; when it does not already end with a period,
    the final fragment is considered incomplete and dropped.  The remaining
    sentences are re-joined as 'S1. S2.' and so on.  Returns '' when no
    complete sentence survives.
    """
    fragments = [piece.strip() for piece in text.split('.') if piece.strip()]
    if not text.strip().endswith('.'):
        # Last fragment never reached its period — discard it.
        fragments = fragments[:-1]
    if not fragments:
        return ''
    return '. '.join(fragments) + '.'

@spaces.GPU(duration=120)
def caption_image(image):
    """Caption a single PIL image with the module-level VLV model.

    Runs under a ZeroGPU allocation (up to 120 s); gradients are disabled
    since this is pure inference.  The 77 is the generation length cap
    passed straight through to the model.
    """
    with torch.no_grad():
        result = model([image], 77)
    return result.generated_text[0]

def greet(image):
    """Gradio handler: normalize a numpy image, caption it, tidy the text.

    Parameters
    ----------
    image : numpy.ndarray or None
        Image from the Gradio "image" input.  None when the user submits
        without uploading anything.  Non-uint8 data is assumed to be float
        pixels in [0, 1] — TODO confirm against the Gradio input component.

    Returns
    -------
    str
        The caption with any trailing incomplete sentence removed, or ""
        when no image was provided.
    """
    # Fix: Gradio passes None on an empty submission; the original crashed
    # with AttributeError on image.dtype.
    if image is None:
        return ""
    if image.dtype != np.uint8:
        # Rescale assumed-[0, 1] float pixels into the uint8 range.
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
    # NOTE(review): mode='RGB' requires an HxWx3 array; grayscale or RGBA
    # input would raise here — confirm the input component always yields RGB.
    image = Image.fromarray(image, mode='RGB')
    raw_text = caption_image(image)
    return drop_incomplete_tail(raw_text)

# Minimal Gradio UI: one image input wired to greet(), one text output.
demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()