lambertxiao commited on
Commit
c50c7e8
Β·
verified Β·
1 Parent(s): 816d008

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -1,42 +1,44 @@
1
- import spaces # NEW ─ brings the decorator into scope
 
 
2
  import gradio as gr
3
- from transformers import AutoModel, AutoProcessor
4
  from PIL import Image
5
  import torch, numpy as np
6
 
7
  model_name_or_path = "lyttt/VLV_captioner"
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
 
10
- model = AutoModel.from_pretrained(
11
  model_name_or_path,
12
  revision="master",
13
  trust_remote_code=True,
14
  low_cpu_mem_usage=False
15
  ).to(device)
16
 
17
- processor = AutoProcessor.from_pretrained(
18
- model_name_or_path,
19
- revision="master",
20
- trust_remote_code=True,
21
- )
 
22
 
23
- @spaces.GPU(duration=120) # now the decorator resolves
 
24
  def greet(image):
25
- if image.dtype != np.uint8:
26
  image = (np.clip(image, 0, 1) * 255).astype(np.uint8)
 
27
  image_pil = Image.fromarray(image, mode="RGB")
28
 
29
- # preprocess + generate
30
- inputs = processor(images=[image_pil], return_tensors="pt").to(device)
31
  with torch.no_grad():
32
- ids = model.generate(**inputs, max_new_tokens=64)
33
- text = processor.decode(ids[0], skip_special_tokens=True)
 
34
 
35
- # drop unfinished last sentence
36
- sentences = [s.strip() for s in text.split('.') if s.strip()]
37
- if not text.strip().endswith('.'):
38
- sentences = sentences[:-1]
39
- return '. '.join(sentences) + ('.' if sentences else '')
40
 
41
  demo = gr.Interface(fn=greet, inputs="image", outputs="text")
42
  demo.launch()
 
1
+ # If this really is a HF Space, keep the next import;
2
+ # otherwise comment it out and delete the decorator line below.
3
+ import spaces # <─ ONLY needed in a Space
4
  import gradio as gr
5
+ from transformers import AutoModel
6
  from PIL import Image
7
  import torch, numpy as np
8
 
9
# Model checkpoint on the Hugging Face Hub, and the device to run it on
# (GPU when available, CPU otherwise).
model_name_or_path = "lyttt/VLV_captioner"
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code=True is required: the checkpoint ships its own modelling
# code. low_cpu_mem_usage=False loads the weights eagerly rather than via the
# meta-device path.
# NOTE(review): revision="master" tracks a moving branch — consider pinning a
# commit hash for reproducibility.
model = AutoModel.from_pretrained(
    model_name_or_path,
    revision="master",
    trust_remote_code=True,
    low_cpu_mem_usage=False
).to(device)
18
 
19
def drop_incomplete_tail(text: str) -> str:
    """Strip a trailing, unterminated sentence fragment from *text*.

    The text is split on ``'.'`` and whitespace-only pieces are discarded.
    When the input does not itself end with a period, the final piece is
    treated as an unfinished sentence and dropped. Surviving sentences are
    re-joined with ``'. '`` and given a closing period; an empty string is
    returned when nothing survives.
    """
    sentences = []
    for chunk in text.split('.'):
        chunk = chunk.strip()
        if chunk:
            sentences.append(chunk)

    # No terminal period => the last fragment is an unfinished sentence.
    if sentences and not text.strip().endswith('.'):
        sentences.pop()

    if not sentences:
        return ''
    return '. '.join(sentences) + '.'
25
 
26
# ───────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)  # ← delete this line if **not** in a Space
def greet(image):
    """Caption an image.

    Converts the array Gradio hands over into a PIL image, runs the
    VLV captioner, and returns the caption with any unfinished trailing
    sentence removed.
    """
    # Gradio may deliver a float array in [0, 1]; normalise to uint8 first.
    if image.dtype != np.uint8:
        image = (np.clip(image, 0, 1) * 255).astype(np.uint8)

    pil_img = Image.fromarray(image, mode="RGB")

    # The VLV-captioner accepts a list of PIL images directly; the second
    # argument is the max-new-tokens budget.
    with torch.no_grad():
        output = model([pil_img], 300)

    # Some revisions return an object with .generated_text, others a list.
    caption = output.generated_text[0] if hasattr(output, "generated_text") else output[0]
    return drop_incomplete_tail(caption)
# ───────────────────────────────────────────────────────────────
 
 
 
42
 
43
# Minimal Gradio UI: one image input, one text output, backed by greet().
demo = gr.Interface(fn=greet, inputs="image", outputs="text")
demo.launch()