Spaces:

ankandrew
/

MiMo-VL-7B

Running on Zero

App Files Files Community

ankandrew committed 17 days ago

Commit

fa60b30

verified ·

1 Parent(s): d5e6127

Upload 2 files

Browse files

Files changed (2) hide show

app.py +277 -155
infer.py +40 -32

app.py CHANGED Viewed

@@ -1,167 +1,289 @@
 import gradio as gr
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
-from transformers.image_utils import load_image
-from threading import Thread
-import time
-import torch
 import spaces
-import cv2
-import numpy as np
-from PIL import Image
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The progress bar is styled as a dark animated bar.
-    """
-    return f'''
-<div style="display: flex; align-items: center;">
-    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
-    </div>
-</div>
-<style>
-@keyframes loading {{
-    0% {{ transform: translateX(-100%); }}
-    100% {{ transform: translateX(100%); }}
-}}
-</style>
-    '''
-def downsample_video(video_path):
-    """
-    Downsamples the video to 10 evenly spaced frames.
-    Each frame is converted to a PIL Image along with its timestamp.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    if total_frames <= 0 or fps <= 0:
-        vidcap.release()
-        return frames
-    # Sample 10 evenly spaced frames.
-    frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
-# MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL"
-MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16
-).to("cuda").eval()
-@spaces.GPU
-def model_inference(input_dict, history):
-    text = input_dict["text"]
-    files = input_dict["files"]
-    if text.strip().lower().startswith("@video-infer"):
-        # Remove the tag from the query.
-        text = text[len("@video-infer"):].strip()
-        if not files:
-            yield "⚠️ Please upload a video file along with your `@video-infer` query."
-            return
-        # Assume the first file is a video.
-        video_path = files[0]
-        frames = downsample_video(video_path)
-        if not frames:
-            yield "⚠️ Could not process the video (no frames were read)."
-            return
-        # Build messages: start with the text prompt.
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": text}]
-            }
-        ]
-        # Append each frame with a timestamp label.
-        for image, timestamp in frames:
-            messages[0]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-            messages[0]["content"].append({"type": "image", "image": image})
-        # Collect only the images from the frames.
-        video_images = [image for image, _ in frames]
-        # Prepare the prompt.
-        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(
-            text=[prompt],
-            images=video_images,
-            return_tensors="pt",
-            padding=True,
-        ).to("cuda")
-        # Set up streaming generation.
-        streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing video with MiMo-VL-7B-RL Model")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
-    if text == "" and not images:
-        yield "⚠️ Please enter a question and/or upload image(s)."
-        return
-    if text == "" and images:
-        yield "⚠️ Please enter a text prompt along with the image(s)."
-        return
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ],
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt],
-        images=images if images else None,
-        return_tensors="pt",
-        padding=True,
-    ).to("cuda")
-    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    yield progress_bar_html("Processing with MiMo-VL-7B-RL Model")
-    for new_text in streamer:
-        buffer += new_text
-        time.sleep(0.01)
-        yield buffer
-demo = gr.ChatInterface(
-    fn=model_inference,
-    description="# **MiMo-VL-7B-RL (2508) `@video-infer for video understanding`**",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-)
-demo.launch(debug=True)

+# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
+import os
 import gradio as gr
+from infer import MiMoVLInfer
 import spaces
+# Single inference wrapper shared by every request. The checkpoint location is
+# read from the CKPT_PATH environment variable (os.environ.get returns None
+# when it is unset).
+infer = MiMoVLInfer(checkpoint_path=os.environ.get('CKPT_PATH'))
+# UI strings, keyed first by component name and then by interface language.
+# Only an "English" entry is defined for each key.
+label_translations = {
+    "gr_chatinterface_ofl": {
+        "English": "Chatbot",
+    },
+    "gr_chatinterface_ol": {
+        "English": "Chatbot",
+    },
+    "gr_tab_ol": {
+        "English": "Online",
+    },
+    "gr_tab_ofl": {
+        "English": "Offline",
+    },
+    "gr_temperature": {
+        "English": "Temperature",
+    },
+    "gr_webcam_image": {
+        "English": "🤳 Open Webcam",
+    },
+    "gr_webcam_images": {
+        "English": "📹 Recorded Frames",
+    },
+    "gr_chatinterface_ofl.textbox.placeholder": {
+        "English":
+        "Ask me anything. You can also drop in images and .mp4 videos.",
+    },
+    "gr_chatinterface_ol.textbox.placeholder": {
+        "English": "Ask me anything...",
+    }
+}
+def _strip_think_prefix(text: str) -> str:
+    """Return ``text`` without a single leading ``<think>`` tag, if it has one."""
+    # str.lstrip('<think>') treats its argument as a character set and would
+    # also eat reasoning that begins with any of "<", "t", "h", "i", "n", "k"
+    # or ">" (e.g. "<think>think twice" -> " twice"), so slice the tag off.
+    tag = '<think>'
+    return text[len(tag):] if text.startswith(tag) else text
+@spaces.GPU(duration=120)   # bump if your requests take >120s
+def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
+    """Stream the assistant reply for one chat turn.
+    The shared ``infer`` model is moved to CUDA for the duration of the call
+    and moved back to the CPU afterwards, even when the generator is closed
+    early or an error is raised.
+    Args:
+        gr_inputs: Dict passed straight to ``infer`` as ``inputs``.
+        gr_history: Chat history supplied by the UI; not used by this function.
+        infer_history: Model-side history passed to ``infer`` as ``history``.
+        temperature: Sampling temperature passed to ``infer``.
+    Yields:
+        ``(messages, infer_history)`` pairs, where ``messages`` is a list of
+        assistant message dicts and ``infer_history`` is the history returned
+        by ``infer`` for the text generated so far.
+    """
+    infer.to_device("cuda")
+    try:
+        # Immediate placeholder so the UI shows something before generation starts.
+        yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history
+        for response_text, infer_history in infer(inputs=gr_inputs,
+                                                  history=infer_history,
+                                                  temperature=temperature):
+            if response_text.startswith('<think>') and '</think>' not in response_text:
+                # Reasoning block is still open: show everything as "thinking".
+                reasoning_text = _strip_think_prefix(response_text)
+                response_message = [{
+                    "role": "assistant",
+                    "content": reasoning_text,
+                    'metadata': {'title': '🤔 Thinking'}
+                }]
+                yield response_message, infer_history
+            elif '<think>' in response_text and '</think>' in response_text:
+                # Reasoning block is closed: split it from the final answer.
+                reasoning_text, response_text2 = response_text.split('</think>', 1)
+                reasoning_text = _strip_think_prefix(reasoning_text)
+                response_message = [{
+                    "role": "assistant",
+                    "content": reasoning_text,
+                    'metadata': {'title': '🤔 Thinking'}
+                }, {
+                    "role": "assistant",
+                    "content": response_text2
+                }]
+                yield response_message, infer_history
+            else:
+                yield [{"role": "assistant", "content": response_text}], infer_history
+    finally:
+        infer.to_device("cpu")
+@spaces.GPU(duration=120)
+def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
+                       infer_history: list, temperature: float):
+    """Answer a text query using the webcam frames recorded since the last turn.
+    Yields ``(message, counter, infer_history)`` triples: first a status string
+    together with the advanced frame counter, then every message produced by
+    ``offline_chat`` with ``gr.skip()`` in the counter slot.
+    """
+    infer.to_device("cuda")
+    try:
+        if not gr_webcam_images:
+            gr_webcam_images = []
+        # Keep only the frames after the first ``gr_counter`` entries.
+        gr_webcam_images = gr_webcam_images[gr_counter:]
+        # Each gallery entry is unpacked as a pair; only its first item is sent on.
+        inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
+        # send an immediate chunk
+        yield f'received {len(gr_webcam_images)} new frames, processing…', gr_counter + len(gr_webcam_images), infer_history
+        # NOTE(review): offline_chat is itself wrapped in @spaces.GPU and also
+        # moves the model to CUDA and back to CPU — confirm that nesting the
+        # decorated call is intended.
+        for response_message, infer_history in offline_chat(
+                inputs, gr_history, infer_history, temperature):
+            yield response_message, gr.skip(), infer_history
+    finally:
+        infer.to_device("cpu")
+# Build the whole UI inside a single Blocks context named ``demo``.
+with gr.Blocks() as demo:
+    gr.Markdown("""<center><font size=8>MiMo-7b-VL</center>""")
+    with gr.Column():
+        # gr_title = gr.Markdown('# MiMo-VL')
+        with gr.Row():
+            # Interface-language selector; "English" is the only choice offered.
+            gr_lang_selector = gr.Dropdown(choices=["English"],
+                                           value="English",
+                                           label="🌐 Interface",
+                                           interactive=True,
+                                           min_width=250,
+                                           scale=0)
+    with gr.Tabs():
+        # "Offline" tab: chat over uploaded images / .mp4 files.
+        with gr.Tab("Offline") as gr_tab_ofl:
+            # Per-session model-side history for the offline chat.
+            gr_infer_history = gr.State([])
+            # Invisible slider whose value is what the chat function receives as
+            # its temperature. It starts at 0.4, the value shown by the visible
+            # temperature slider of this tab: that slider only copies its value
+            # here when it changes, so any other start value would be used
+            # silently until the user first moves the visible slider.
+            gr_temperature_hidden = gr.Slider(minimum=0.0,
+                                              maximum=2.0,
+                                              step=0.1,
+                                              value=0.4,
+                                              interactive=True,
+                                              visible=False)
+            # Multimodal chat driven by offline_chat. The textbox accepts several
+            # uploaded files per message, restricted to images and .mp4 files.
+            gr_chatinterface_ofl = gr.ChatInterface(
+                fn=offline_chat,
+                type="messages",
+                multimodal=True,
+                chatbot=gr.Chatbot(height=800),
+                textbox=gr.MultimodalTextbox(
+                    file_count="multiple",
+                    file_types=["image", ".mp4"],
+                    sources=["upload"],
+                    stop_btn=True,
+                    placeholder=label_translations[
+                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
+                ),
+                # Passed to offline_chat after (message, history), in this order.
+                additional_inputs=[
+                    gr_infer_history, gr_temperature_hidden
+                ],
+                # Receives the second element of every pair offline_chat yields.
+                additional_outputs=[gr_infer_history],
+            )
+            # Clearing the chatbot also resets the stored model-side history.
+            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
+                  fn=lambda: [],
+                  outputs=[gr_infer_history])
+            with gr.Row():
+                with gr.Column(scale=1, min_width=200):
+                    # Visible temperature control for the offline tab.
+                    gr_temperature_ofl = gr.Slider(
+                        minimum=0.0,
+                        maximum=2.0,
+                        step=0.1,
+                        value=0.4,
+                        label=label_translations['gr_temperature']['English'],
+                        interactive=True)
+                    # Copy every change into gr_temperature_hidden, which is the
+                    # slider wired into the chat interface's inputs.
+                    gr_temperature_ofl.change(lambda x: x,
+                                              inputs=gr_temperature_ofl,
+                                              outputs=gr_temperature_hidden)
+                with gr.Column(scale=8):
+                    # Example prompts; each one fills the offline chat textbox
+                    # with a text and a list of example files.
+                    with gr.Column(visible=True) as gr_examples_en:
+                        gr.Examples(
+                            examples=[
+                                {
+                                    "text": "Who are you?",
+                                    "files": []
+                                },
+                                {
+                                    "text": "OCR and return markdown",
+                                    "files": ["examples/24-25-pl.png"]
+                                },
+                                {
+                                    "text":
+                                    """describe the video""",
+                                    "files":
+                                    ["examples/hitting_baseball.mp4"]
+                                },
+                                {
+                                    "text":
+                                    "For the model ranked first on WebSRC, what is its score on MathVision?",
+                                    "files": [
+                                        "examples/mimovl_gui.png",
+                                        "examples/mimovl_reason.png"
+                                    ]
+                                },
+                            ],
+                            inputs=[gr_chatinterface_ofl.textbox],
+                        )
+        # "Online" tab: chat over frames recorded from the webcam.
+        with gr.Tab("Online") as gr_tab_ol:
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # Per-session model-side history for the online chat. This
+                    # rebinds the names used in the offline tab to new components.
+                    gr_infer_history = gr.State([])
+                    # Invisible slider whose value is what the chat function
+                    # receives as its temperature. It starts at 0.4, the value
+                    # shown by the visible temperature slider of this tab: that
+                    # slider only copies its value here when it changes, so any
+                    # other start value would be used silently until the user
+                    # first moves the visible slider.
+                    gr_temperature_hidden = gr.Slider(minimum=0.0,
+                                                      maximum=2.0,
+                                                      step=0.1,
+                                                      value=0.4,
+                                                      interactive=True,
+                                                      visible=False)
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            # Webcam input; frames are handed over as file paths.
+                            gr_webcam_image = gr.Image(
+                                label=label_translations['gr_webcam_image']
+                                ['English'],
+                                sources="webcam",
+                                height=250,
+                                type='filepath')
+                            # Read-only gallery that accumulates the recorded frames.
+                            gr_webcam_images = gr.Gallery(
+                                label=label_translations['gr_webcam_images']
+                                ['English'],
+                                show_label=True,
+                                format='webp',
+                                columns=1,
+                                height=250,
+                                preview=True,
+                                interactive=False)
+                            # Hidden counter passed to and updated by the online
+                            # chat function (number of frames already consumed).
+                            gr_counter = gr.Number(value=0, visible=False)
+                        with gr.Column(scale=3):
+                            # Text-only chat driven by online_record_chat; the
+                            # images come from the gallery, not from the textbox.
+                            gr_chatinterface_ol = gr.ChatInterface(
+                                fn=online_record_chat,
+                                type="messages",
+                                multimodal=False,
+                                chatbot=gr.Chatbot(height=800),
+                                textbox=gr.
+                                Textbox(placeholder=label_translations[
+                                    'gr_chatinterface_ol.textbox.placeholder']
+                                        ['English'],
+                                        submit_btn=True,
+                                        stop_btn=True),
+                                # Passed to online_record_chat after (text,
+                                # history), in this order.
+                                additional_inputs=[
+                                    gr_webcam_images, gr_counter,
+                                    gr_infer_history, gr_temperature_hidden
+                                ],
+                                # Receive the 2nd and 3rd element of every triple
+                                # online_record_chat yields.
+                                additional_outputs=[
+                                    gr_counter, gr_infer_history
+                                ],
+                            )
+                            def cache_webcam(recorded_image: str,
+                                             recorded_images: list):
+                                """Return the recorded frames with ``recorded_image`` appended.
+                                An empty or missing ``recorded_images`` is treated
+                                as an empty list; the input list is not modified.
+                                """
+                                earlier_frames = recorded_images or []
+                                return earlier_frames + [recorded_image]
+                            # Every streamed webcam frame is appended to the
+                            # gallery through cache_webcam.
+                            # NOTE(review): stream_every=1 is presumably a period
+                            # in seconds — confirm against the Gradio version in use.
+                            gr_webcam_image.stream(
+                                fn=cache_webcam,
+                                inputs=[gr_webcam_image, gr_webcam_images],
+                                outputs=[gr_webcam_images],
+                                stream_every=1,
+                                concurrency_limit=30,
+                            )
+                            with gr.Row():
+                                # Visible temperature control for the online tab.
+                                gr_temperature_ol = gr.Slider(
+                                    minimum=0.0,
+                                    maximum=2.0,
+                                    step=0.1,
+                                    value=0.4,
+                                    label=label_translations['gr_temperature']
+                                    ['English'],
+                                    interactive=True)
+                                # Copy every change into gr_temperature_hidden,
+                                # the slider wired into the chat interface's inputs.
+                                gr_temperature_ol.change(
+                                    lambda x: x,
+                                    inputs=gr_temperature_ol,
+                                    outputs=gr_temperature_hidden)
+    def update_lang(lang: str):
+        """Return the component updates that switch the UI to ``lang``.
+        The tuple holds exactly one update per component listed in the
+        ``outputs`` of the ``gr_lang_selector.change`` listener, in the same
+        order (11 entries). The original returned a 12th update,
+        ``visible=lang != 'English'``, for which no output component exists.
+        Raises:
+            KeyError: if ``lang`` has no entry in ``label_translations``.
+        """
+        return (
+            # gr_chatinterface_ofl.chatbot / gr_chatinterface_ol.chatbot
+            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
+            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
+            # gr_chatinterface_ofl.textbox / gr_chatinterface_ol.textbox
+            gr.update(placeholder=label_translations[
+                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
+            gr.update(placeholder=label_translations[
+                'gr_chatinterface_ol.textbox.placeholder'][lang]),
+            # gr_tab_ofl / gr_tab_ol
+            gr.update(label=label_translations['gr_tab_ofl'][lang]),
+            gr.update(label=label_translations['gr_tab_ol'][lang]),
+            # gr_temperature_ofl / gr_temperature_ol
+            gr.update(label=label_translations['gr_temperature'][lang]),
+            gr.update(label=label_translations['gr_temperature'][lang]),
+            # gr_examples_en: only shown for the English interface
+            gr.update(visible=lang == 'English'),
+            # gr_webcam_image / gr_webcam_images
+            gr.update(label=label_translations['gr_webcam_image'][lang]),
+            gr.update(label=label_translations['gr_webcam_images'][lang]),
+        )
+    # Re-label the UI whenever the interface language changes. The order of
+    # these outputs must match the order of the updates update_lang returns.
+    gr_lang_selector.change(fn=update_lang,
+                            inputs=[gr_lang_selector],
+                            outputs=[
+                                gr_chatinterface_ofl.chatbot,
+                                gr_chatinterface_ol.chatbot,
+                                gr_chatinterface_ofl.textbox,
+                                gr_chatinterface_ol.textbox,
+                                gr_tab_ofl,
+                                gr_tab_ol,
+                                gr_temperature_ofl,
+                                gr_temperature_ol,
+                                gr_examples_en,
+                                gr_webcam_image,
+                                gr_webcam_images,
+                            ])
+# Configure the request queue at import time: default concurrency limit of 2
+# per event and at most 50 queued requests.
+demo.queue(default_concurrency_limit=2, max_size=50)
+# Start the server only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

infer.py CHANGED Viewed

@@ -1,4 +1,6 @@
-# modified from https://github.com/ByteDance-Seed/Seed1.5-VL/blob/main/GradioDemo/infer.py
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.generation.stopping_criteria import EosTokenCriteria, StoppingCriteriaList
 from qwen_vl_utils import process_vision_info
@@ -6,67 +8,73 @@ from threading import Thread
 class MiMoVLInfer:
-    def __init__(self, checkpoint_path, device='cuda', **kwargs):
         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            checkpoint_path, torch_dtype='auto', device_map=device, attn_implementation='flash_attention_2',
-        )
-        self.processor = AutoProcessor.from_pretrained(checkpoint_path)
     def __call__(self, inputs: dict, history: list = [], temperature: float = 1.0):
         messages = self.construct_messages(inputs)
         updated_history = history + messages
         text = self.processor.apply_chat_template(updated_history, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(updated_history)
         model_inputs = self.processor(
             text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt'
         ).to(self.model.device)
         tokenizer = self.processor.tokenizer
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
         gen_kwargs = {
-            'max_new_tokens': 16000,
             'streamer': streamer,
             'stopping_criteria': StoppingCriteriaList([EosTokenCriteria(eos_token_id=self.model.config.eos_token_id)]),
             'pad_token_id': self.model.config.eos_token_id,
             **model_inputs
         }
-        thread = Thread(target=self.model.generate, kwargs=gen_kwargs)
         thread.start()
         partial_response = ""
         for new_text in streamer:
             partial_response += new_text
             yield partial_response, updated_history + [{
                 'role': 'assistant',
-                'content': [{
-                    'type': 'text',
-                    'text': partial_response
-                }]
             }]
     def _is_video_file(self, filename):
-        video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg']
-        return any(filename.lower().endswith(ext) for ext in video_extensions)
     def construct_messages(self, inputs: dict) -> list:
         content = []
-        for i, path in enumerate(inputs.get('files', [])):
             if self._is_video_file(path):
-                content.append({
-                    "type": "video",
-                    "video": f'file://{path}'
-                })
             else:
-                content.append({
-                    "type": "image",
-                    "image": f'file://{path}'
-                })
         query = inputs.get('text', '')
         if query:
-            content.append({
-                "type": "text",
-                "text": query,
-            })
-        messages = [{
-            "role": "user",
-            "content": content,
-        }]
-        return messages

+# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/infer.py
+import os
+import torch
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.generation.stopping_criteria import EosTokenCriteria, StoppingCriteriaList
 from qwen_vl_utils import process_vision_info
 class MiMoVLInfer:
+    """Streaming chat wrapper around a Qwen2.5-VL checkpoint and its processor."""
+    def __init__(self, checkpoint_path, **kwargs):
+        """Load model and processor from ``checkpoint_path``; extra ``kwargs`` are ignored."""
+        dtype = torch.float16
         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            checkpoint_path,
+            torch_dtype=dtype,
+            # Weights start on the CPU; to_device() moves them on demand.
+            device_map={"": "cpu"},
+            attn_implementation="eager",
+            trust_remote_code=True,
+        ).eval()
+        self.processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True)
+        # Remembers where the weights live so to_device() can skip no-op moves.
+        self._on_cuda = False
+    def to_device(self, device: str):
+        """Move the model to ``"cuda"`` or ``"cpu"``.
+        Nothing happens when the model is already there or when ``device`` is
+        any other string.
+        """
+        if device == "cuda" and not self._on_cuda:
+            self.model.to("cuda")
+            self._on_cuda = True
+        elif device == "cpu" and self._on_cuda:
+            self.model.to("cpu")
+            self._on_cuda = False
+    @staticmethod
+    def _sampling_kwargs(temperature: float) -> dict:
+        """Return the sampling-related ``generate`` kwargs for ``temperature``."""
+        # generate() rejects do_sample=True combined with a temperature that is
+        # not strictly positive, and the UI slider allows 0.0. Treat any
+        # non-positive temperature as a request for greedy decoding instead.
+        temperature = float(temperature)
+        if temperature <= 0.0:
+            return {'do_sample': False}
+        return {'do_sample': True, 'temperature': temperature, 'top_p': 0.95}
     def __call__(self, inputs: dict, history: list = [], temperature: float = 1.0):
+        """Generate a reply to ``inputs`` and stream it.
+        Args:
+            inputs: Dict with optional ``'files'`` and ``'text'`` entries, as
+                accepted by ``construct_messages``.
+            history: Earlier messages; it is not modified (a new list is built),
+                so the shared default list is never mutated.
+            temperature: Sampling temperature; values <= 0 select greedy decoding.
+        Yields:
+            ``(partial_response, new_history)`` where ``partial_response`` is
+            the text generated so far and ``new_history`` is ``history`` plus
+            the new user message plus an assistant message holding that text.
+        The token limit is read from the ``MAX_NEW_TOKENS`` environment
+        variable (default 1024).
+        """
         messages = self.construct_messages(inputs)
         updated_history = history + messages
         text = self.processor.apply_chat_template(updated_history, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(updated_history)
         model_inputs = self.processor(
             text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt'
         ).to(self.model.device)
         tokenizer = self.processor.tokenizer
+        streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+        max_new = int(os.getenv("MAX_NEW_TOKENS", "1024"))
         gen_kwargs = {
+            'max_new_tokens': max_new,
+            **self._sampling_kwargs(temperature),
             'streamer': streamer,
             'stopping_criteria': StoppingCriteriaList([EosTokenCriteria(eos_token_id=self.model.config.eos_token_id)]),
             'pad_token_id': self.model.config.eos_token_id,
             **model_inputs
         }
+        # Generation runs in a background daemon thread; this generator consumes
+        # the text pieces it pushes into the streamer.
+        thread = Thread(target=self.model.generate, kwargs=gen_kwargs, daemon=True)
         thread.start()
         partial_response = ""
         for new_text in streamer:
             partial_response += new_text
             yield partial_response, updated_history + [{
                 'role': 'assistant',
+                'content': [{'type': 'text', 'text': partial_response}]
             }]
     def _is_video_file(self, filename):
+        """Return True when ``filename`` ends with a known video extension (case-insensitive)."""
+        return filename.lower().endswith(
+            ('.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'))
     def construct_messages(self, inputs: dict) -> list:
+        """Build a one-element list holding the user message for ``inputs``.
+        Every path in ``inputs['files']`` becomes a ``video`` or ``image``
+        entry with a ``file://`` URI, followed by a ``text`` entry when
+        ``inputs['text']`` is non-empty.
+        """
         content = []
+        for path in inputs.get('files', []):
             if self._is_video_file(path):
+                content.append({"type": "video", "video": f'file://{path}'})
             else:
+                content.append({"type": "image", "image": f'file://{path}'})
         query = inputs.get('text', '')
         if query:
+            content.append({"type": "text", "text": query})
+        return [{"role": "user", "content": content}]