File size: 1,098 Bytes
ddebdd0
d1b0574
ddebdd0
c65a1f7
ddebdd0
 
d1b0574
ea1edb1
ddebdd0
5e14c45
ea1edb1
ddebdd0
 
 
c50c7e8
ddebdd0
 
c50c7e8
ddebdd0
 
c65a1f7
bf55b92
ddebdd0
816d008
ddebdd0
 
 
 
 
 
c65a1f7
 
ddebdd0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import spaces
import gradio as gr
from transformers import AutoModel, AutoProcessor
from PIL import Image
import torch
import numpy as np

model_name_or_path = "lyttt/VLV_captioner"
# Load the VLV captioning model once at module import. trust_remote_code executes
# the custom model code shipped with the hub repo (trusted source assumed);
# low_cpu_mem_usage=False forces a full eager load of the weights.
model = AutoModel.from_pretrained(model_name_or_path, revision="master", trust_remote_code=True,low_cpu_mem_usage=False)
# Keep the model resident on the GPU; caption_image below runs under @spaces.GPU.
model = model.to("cuda")

def drop_incomplete_tail(text):
    """Remove a trailing unfinished sentence from *text*.

    The text is split on '.'; when it does not already end with a period,
    the final fragment is considered incomplete and dropped.  The remaining
    sentences are re-joined as 'S1. S2.' and so on.  Returns '' when no
    complete sentence survives.
    """
    fragments = [piece.strip() for piece in text.split('.') if piece.strip()]
    if not text.strip().endswith('.'):
        # Last fragment never reached its period — discard it.
        fragments = fragments[:-1]
    if not fragments:
        return ''
    return '. '.join(fragments) + '.'

@spaces.GPU(duration=120)
def caption_image(image):
    """Caption a single PIL image with the module-level VLV model.

    Runs under a ZeroGPU allocation (up to 120 s); gradients are disabled
    since this is pure inference.  The 77 is the generation length cap
    passed straight through to the model.
    """
    with torch.no_grad():
        result = model([image], 77)
    return result.generated_text[0]

def greet(image):
    """Gradio handler: normalize a numpy image, caption it, tidy the text.

    Parameters
    ----------
    image : numpy.ndarray or None
        Image from the Gradio "image" input.  None when the user submits
        without uploading anything.  Non-uint8 data is assumed to be float
        pixels in [0, 1] — TODO confirm against the Gradio input component.

    Returns
    -------
    str
        The caption with any trailing incomplete sentence removed, or ""
        when no image was provided.
    """
    # Fix: Gradio passes None on an empty submission; the original crashed
    # with AttributeError on image.dtype.
    if image is None:
        return ""
    if image.dtype != np.uint8:
        # Rescale assumed-[0, 1] float pixels into the uint8 range.
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
    # NOTE(review): mode='RGB' requires an HxWx3 array; grayscale or RGBA
    # input would raise here — confirm the input component always yields RGB.
    image = Image.fromarray(image, mode='RGB')
    raw_text = caption_image(image)
    return drop_incomplete_tail(raw_text)

# Minimal Gradio UI: one image input wired to greet(), one text output.
demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()