Loads and samples video frames with accurate timestamps

#2
by juvix - opened
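The updated `load_video` derives each sampled frame's timestamp from the source frame rate, so the prompt can state exactly when every frame occurs. Below is a minimal standalone sketch of that sampling logic, assuming only `decord` and `numpy` are installed; `sample_frames` is an illustrative name and `example.mp4` a placeholder path, neither is part of the diff.

from decord import VideoReader, cpu
import numpy as np

def sample_frames(video_path, max_frames_num=64, fps=1, force_sample=True):
    # Open the clip and compute its duration from the container's average fps.
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    avg_fps = vr.get_avg_fps()
    video_time = len(vr) / avg_fps
    # Take roughly one frame per second first...
    frame_idx = list(range(0, len(vr), round(avg_fps / fps)))
    # ...then fall back to exactly max_frames_num uniformly spaced frames.
    if len(frame_idx) > max_frames_num or force_sample:
        frame_idx = np.linspace(0, len(vr) - 1, max_frames_num, dtype=int).tolist()
    # Timestamps are derived from the source fps, so they match playback time.
    frame_times = [i / avg_fps for i in frame_idx]
    return vr.get_batch(frame_idx).asnumpy(), frame_times, video_time

frames, times, duration = sample_frames("example.mp4")
print(frames.shape, f"{duration:.2f}s", [f"{t:.2f}s" for t in times[:3]])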
Files changed (1)
  1. app.py +65 -80
app.py CHANGED
@@ -1,105 +1,90 @@
 import spaces
 import gradio as gr
+import subprocess
 
-import subprocess # 🥲
+# Install Flash-Attention safely
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
-# subprocess.run(
-#     "pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git",
-#     shell=True,
-# )
 
 import torch
 from llava.model.builder import load_pretrained_model
 from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
-from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
-from llava.conversation import conv_templates, SeparatorStyle
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+from llava.conversation import conv_templates
 import copy
-import warnings
 from decord import VideoReader, cpu
 import numpy as np
-import tempfile
-import os
-import shutil
-#warnings.filterwarnings("ignore")
+
+# App info
 title = "# 🙋🏻‍♂️Welcome to 🌟Tonic's 🌋📹LLaVA-Video!"
-description1 ="""The **🌋📹LLaVA-Video-7B-Qwen2** is a 7B parameter model trained on the 🌋📹LLaVA-Video-178K dataset and the LLaVA-OneVision dataset. It is [based on the **Qwen2 language model**](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f), supporting a context window of up to 32K tokens. The model can process and interact with images, multi-images, and videos, with specific optimizations for video analysis.
-This model leverages the **SO400M vision backbone** for visual input and Qwen2 for language processing, making it highly efficient in multi-modal reasoning, including visual and video-based tasks.
-🌋📹LLaVA-Video has larger variants of [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) and [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) and with a [variant](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) only trained on the new synthetic data
-For further details, please visit the [Project Page](https://github.com/LLaVA-VL/LLaVA-NeXT) or check out the corresponding [research paper](https://arxiv.org/abs/2410.02713).
-- **Architecture**: `LlavaQwenForCausalLM`
-- **Attention Heads**: 28
-- **Hidden Layers**: 28
-- **Hidden Size**: 3584
-"""
-description2 ="""
-- **Intermediate Size**: 18944
-- **Max Frames Supported**: 64
-- **Languages Supported**: English, Chinese
-- **Image Aspect Ratio**: `anyres_max_9`
-- **Image Resolution**: Various grid resolutions
-- **Max Position Embeddings**: 32,768
-- **Vocab Size**: 152,064
-- **Model Precision**: bfloat16
-- **Hardware Used for Training**: 256 * Nvidia Tesla A100 GPUs
-"""
+description1 ="""**🌋📹LLaVA-Video-7B-Qwen2** analyzes visual content and transcribes speech from videos. It supports fine-grained reasoning over video frames using 64 sampled keyframes."""
+description2 ="""**Max Frames**: 64 · **Languages**: English, Chinese · **Aspect Ratio**: any · **Precision**: bfloat16"""
 
 join_us = """
 ## Join us :
-🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
+🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)
 """
 
-def load_video(video_path, max_frames_num, fps=1, force_sample=False):
-    if max_frames_num == 0:
-        return np.zeros((1, 336, 336, 3))
+# ---------- Load & Sample Video ----------
+def load_video(video_path, max_frames_num=64, fps=1, force_sample=True):
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     total_frame_num = len(vr)
-    video_time = total_frame_num / vr.get_avg_fps()
-    fps = round(vr.get_avg_fps()/fps)
-    frame_idx = [i for i in range(0, len(vr), fps)]
-    frame_time = [i/fps for i in frame_idx]
+    avg_fps = vr.get_avg_fps()
+    video_time = total_frame_num / avg_fps
+    step = round(avg_fps / fps)
+
+    frame_idx = list(range(0, len(vr), step))
     if len(frame_idx) > max_frames_num or force_sample:
-        sample_fps = max_frames_num
-        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
-        frame_idx = uniform_sampled_frames.tolist()
-        frame_time = [i/vr.get_avg_fps() for i in frame_idx]
-    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
-    spare_frames = vr.get_batch(frame_idx).asnumpy()
-    return spare_frames, frame_time, video_time
-
-# Load the model
+        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()
+
+    frame_time = [i / avg_fps for i in frame_idx]
+    frame_time_str = ", ".join([f"{t:.2f}s" for t in frame_time])
+
+    frames = vr.get_batch(frame_idx).asnumpy()
+    return frames, frame_time_str, video_time
+
+# ---------- Load LLaVA-Video Model ----------
 pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
 model_name = "llava_qwen"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 device_map = "auto"
 
 print("Loading model...")
-tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
+tokenizer, model, image_processor, _ = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
 model.eval()
 print("Model loaded successfully!")
 
+# ---------- Response Formatter ----------
+import re
+def format_response(response: str):
+    actions = re.findall(r"(\d+\.\d+s\s*-\s*\d+\.\d+s:\s*.+)", response)
+    speech = re.findall(r"(\d+\.\d+s:\s*.+)", response)
+    formatted = "**🟢 Visual Events:**\n" + "\n".join(actions) + "\n\n**🗣️ Speech Transcript:**\n" + "\n".join(speech)
+    return formatted if actions or speech else response
+
+# ---------- Core Inference ----------
 @spaces.GPU
 def process_video(video_path, question):
     max_frames_num = 64
     video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
-    video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
-    video = [video]
+    video_tensor = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
+    video = [video_tensor]
 
+    # Add timing metadata to prompt
     conv_template = "qwen_1_5"
-    time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
-
+    time_instruction = f"The video is {video_time:.2f} seconds long, and {max_frames_num} frames were uniformly sampled at these times: {frame_time}. Analyze them."
+
     full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question}"
-
     conv = copy.deepcopy(conv_templates[conv_template])
     conv.append_message(conv.roles[0], full_question)
     conv.append_message(conv.roles[1], None)
-    prompt_question = conv.get_prompt()
-
-    input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
-
+    prompt = conv.get_prompt()
+
+    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+
     with torch.no_grad():
         output = model.generate(
             input_ids,
@@ -109,37 +94,37 @@ def process_video(video_path, question):
             temperature=0,
            max_new_tokens=4096,
         )
-
-    response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
-    return response
+
+    raw_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
+    return format_response(raw_output)
+
+# ---------- Gradio UI ----------
+default_prompt = (
+    "Analyze the video frame by frame. For each visible action or change (e.g., motion, expression, object, movement), "
+    "output the timestamp and what happens, like '0.0s - 0.1s: man lifts arm'. Also transcribe any spoken dialogue with timestamps in the format '0.0s: speech...'."
+)
 
 def gradio_interface(video_file, question):
     if video_file is None:
-        return "Please upload a video file."
-    response = process_video(video_file, question)
-    return response
+        return "❗ Please upload a video."
+    return process_video(video_file, question or default_prompt)
 
 with gr.Blocks() as demo:
     gr.Markdown(title)
     with gr.Row():
-        with gr.Group():
-            gr.Markdown(description1)
-        with gr.Group():
-            gr.Markdown(description2)
+        gr.Markdown(description1)
+        gr.Markdown(description2)
     with gr.Accordion("Join Us", open=False):
         gr.Markdown(join_us)
+
     with gr.Row():
         with gr.Column():
-            video_input = gr.Video()
-            question_input = gr.Textbox(label="🙋🏻‍♂️User Question", placeholder="Ask a question about the video...")
-            submit_button = gr.Button("Ask🌋📹LLaVA-Video")
-            output = gr.Textbox(label="🌋📹LLaVA-Video")
-
-    submit_button.click(
-        fn=gradio_interface,
-        inputs=[video_input, question_input],
-        outputs=output
-    )
+            video_input = gr.Video(label="📹 Upload Your Video")
+            question_input = gr.Textbox(label="🙋🏻‍♂️ Your Prompt", value=default_prompt, lines=4)
+            submit_button = gr.Button("Analyze with 🌋📹LLaVA-Video")
+            output = gr.Textbox(label="🧠 Result", lines=20)
+
+    submit_button.click(fn=gradio_interface, inputs=[video_input, question_input], outputs=output)
 
 if __name__ == "__main__":
-    demo.launch(show_error=True, ssr_mode = False)
+    demo.launch(show_error=True, ssr_mode=False)
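For quick reference, this is how the new `format_response` groups a reply that follows `default_prompt`'s timestamp format. The regexes are copied from the diff above; the sample reply is hypothetical.

import re

def format_response(response: str):
    # Same grouping logic as in the new app.py above.
    actions = re.findall(r"(\d+\.\d+s\s*-\s*\d+\.\d+s:\s*.+)", response)
    speech = re.findall(r"(\d+\.\d+s:\s*.+)", response)
    formatted = "**🟢 Visual Events:**\n" + "\n".join(actions) + "\n\n**🗣️ Speech Transcript:**\n" + "\n".join(speech)
    return formatted if actions or speech else response

# Hypothetical model reply in the format requested by default_prompt.
sample = "0.0s - 1.5s: man lifts arm\n2.0s: hello there"
print(format_response(sample))

Note that the speech pattern also matches the tail of a range line (here it captures "1.5s: man lifts arm" in addition to "2.0s: hello there"), so transcript entries can duplicate visual events.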