lambertxiao committed · verified
Commit ddebdd0 · 1 Parent(s): c50c7e8

Update app.py

Files changed (1)
  1. app.py +21 -33
app.py CHANGED
@@ -1,44 +1,32 @@
- # If this really is a HF Space, keep the next import;
- # otherwise comment it out and delete the decorator line below.
- import spaces  # <─ ONLY needed in a Space
  import gradio as gr
- from transformers import AutoModel
  from PIL import Image
- import torch, numpy as np

  model_name_or_path = "lyttt/VLV_captioner"
- device = "cuda" if torch.cuda.is_available() else "cpu"

- model = AutoModel.from_pretrained(
-     model_name_or_path,
-     revision="master",
-     trust_remote_code=True,
-     low_cpu_mem_usage=False
- ).to(device)
-
- def drop_incomplete_tail(text: str) -> str:
-     """Remove any unfinished sentence fragment at the end of `text`."""
-     sentences = [s.strip() for s in text.split('.') if s.strip()]
      if not text.strip().endswith('.'):
-         sentences = sentences[:-1]
-     return '. '.join(sentences) + ('.' if sentences else '')
-
- # ───────────────────────────────────────────────────────────────
- @spaces.GPU(duration=120)  # ← delete this line if **not** in a Space
- def greet(image):
-     if image.dtype != np.uint8:  # Gradio gives a float array in [0, 1]
-         image = (np.clip(image, 0, 1) * 255).astype(np.uint8)

-     image_pil = Image.fromarray(image, mode="RGB")
-
-     # The VLV-captioner accepts a list of PIL images directly.
      with torch.no_grad():
-         # Second arg is max-new-tokens (kept from the original code).
-         raw = model([image_pil], 300)
-         text = raw.generated_text[0] if hasattr(raw, "generated_text") else raw[0]

-     return drop_incomplete_tail(text)
- # ───────────────────────────────────────────────────────────────

  demo = gr.Interface(fn=greet, inputs="image", outputs="text")
- demo.launch()
 
+ import spaces
  import gradio as gr
+ from transformers import AutoModel, AutoProcessor
  from PIL import Image
+ import torch
+ import numpy as np

  model_name_or_path = "lyttt/VLV_captioner"
+ model = AutoModel.from_pretrained(model_name_or_path, revision="master", trust_remote_code=True, low_cpu_mem_usage=False)

+ def drop_incomplete_tail(text):
+     sentences = text.split('.')
+     complete_sentences = [s.strip() for s in sentences if s.strip()]
      if not text.strip().endswith('.'):
+         complete_sentences = complete_sentences[:-1]
+     return '. '.join(complete_sentences) + ('.' if complete_sentences else '')

+ @spaces.GPU(duration=120)
+ def caption_image(image):
      with torch.no_grad():
+         outputs = model([image], 300).generated_text[0]
+         return outputs

+ def greet(image):
+     if image.dtype != np.uint8:
+         image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
+     image = Image.fromarray(image, mode='RGB')
+     raw_text = caption_image(image)
+     return drop_incomplete_tail(raw_text)

  demo = gr.Interface(fn=greet, inputs="image", outputs="text")
+ demo.launch()
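The comments removed by this commit note that the spaces package only exists inside a HF Space. If the app also needs to run locally, the decorator could be made optional; a minimal sketch, using a hypothetical alias gpu that is not part of this commit:

    # Fall back to a no-op decorator when `spaces` is unavailable
    # (e.g. running app.py outside a Hugging Face Space).
    try:
        import spaces
        gpu = spaces.GPU(duration=120)  # same duration budget as the commit
    except ImportError:
        gpu = lambda f: f  # hypothetical no-op stand-in for local runs

    @gpu
    def caption_image(image):
        ...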
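drop_incomplete_tail keeps only sentences that end with a period, dropping the trailing fragment that the 300-token generation cap can leave behind. A standalone sanity check of that behavior (the function is copied verbatim from app.py; the expected strings follow from its logic):

    def drop_incomplete_tail(text):
        sentences = text.split('.')
        complete_sentences = [s.strip() for s in sentences if s.strip()]
        if not text.strip().endswith('.'):
            complete_sentences = complete_sentences[:-1]
        return '. '.join(complete_sentences) + ('.' if complete_sentences else '')

    assert drop_incomplete_tail("A dog runs. A cat sle") == "A dog runs."  # fragment dropped
    assert drop_incomplete_tail("A dog runs. A cat sleeps.") == "A dog runs. A cat sleeps."
    assert drop_incomplete_tail("") == ""  # empty input stays empty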
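greet turns whatever array Gradio hands it into the uint8 RGB PIL image that caption_image expects: per the old code's comment, Gradio may deliver a float array in [0, 1], which the dtype guard rescales before conversion. A self-contained sketch of that preprocessing path, with a random array standing in for a real upload:

    import numpy as np
    from PIL import Image

    arr = np.random.rand(224, 224, 3)  # float64 in [0, 1], mimicking a float input
    if arr.dtype != np.uint8:
        arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)
    img = Image.fromarray(arr, mode="RGB")  # the PIL image passed to the model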