Spaces: Running

Commit 3ed4749 · "add files" · Parent(s): none

Files changed:
- .gitattributes    +44  -0
- .gitignore        +165 -0
- README.md         +14  -0
- app.py            +533 -0
- infer.py          +299 -0
- requirements.txt  +7   -0
- visualizer.py     +126 -0
.gitattributes
ADDED
@@ -0,0 +1,44 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/I7pTpMjqNRM_1080p_small.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/interface.jpg filter=lfs diff=lfs merge=lfs -text
+examples/newyork.jpg filter=lfs diff=lfs merge=lfs -text
+examples/puzzle.png filter=lfs diff=lfs merge=lfs -text
+examples/000000001000.jpeg filter=lfs diff=lfs merge=lfs -text
+examples/000000018380.jpeg filter=lfs diff=lfs merge=lfs -text
+examples/bancopy.jpg filter=lfs diff=lfs merge=lfs -text
+examples/beijing.jpg filter=lfs diff=lfs merge=lfs -text
+simhei.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,165 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+.DS_Store
+video_frames
+examples
+simhei.ttf
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Seed1.5 VL
+emoji: 🚀
+colorFrom: green
+colorTo: pink
+sdk: gradio
+sdk_version: 5.29.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Seed1.5-VL API Demo
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,533 @@
+# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
+import os
+import re
+import cv2
+import json
+import time
+import numpy as np
+import gradio as gr
+from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
+from visualizer import draw_boxes_points_with_labels
+
+infer = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
+
+label_translations = {
+    "gr_chatinterface_ofl": {
+        "English": "Chatbot",
+        "中文": "对话界面"
+    },
+    "gr_chatinterface_ol": {
+        "English": "Chatbot",
+        "中文": "对话界面"
+    },
+    "gr_tab_ol": {
+        "English": "Online",
+        "中文": "在线模式"
+    },
+    "gr_tab_ofl": {
+        "English": "Offline",
+        "中文": "离线模式"
+    },
+    "gr_thinking": {
+        "English": ConversationModeI18N.D,
+        "中文": ConversationModeCN.D,
+    },
+    "gr_temperature": {
+        "English": "Temperature",
+        "中文": "温度系数"
+    },
+    "gr_webcam_image": {
+        "English": "🤳 Open Webcam",
+        "中文": "🤳 打开摄像头"
+    },
+    "gr_webcam_images": {
+        "English": "📹 Recorded Frames",
+        "中文": "📹 录制的视频帧"
+    },
+    "gr_chatinterface_ofl.textbox.placeholder": {
+        "English":
+        "Ask me anything. You can also drop in images and .mp4 videos.",
+        "中文": "有什么想问的?支持上传图片和.mp4视频。"
+    },
+    "gr_chatinterface_ol.textbox.placeholder": {
+        "English": "Ask me anything...",
+        "中文": "有什么想问的?"
+    },
+    "gr_clear_button": {
+        "English": "🧹 Clear History",
+        "中文": "🧹 清除历史对话"
+    }
+}
+
+def add_escape(text: str):
+    return text.replace('<', '\<').replace('>', '\>')
+
+def remove_escape(text: str):
+    return text.replace('\<', '<').replace('\>', '>')
+
+def plot_boxes_points_detections(image_path, message):
+    detection_pattern = r'\[\s*{.*?}\s*\]'
+    detection_matches = re.finditer(detection_pattern, message, flags=re.DOTALL)
+    bboxes, categories = [], []
+    for match in detection_matches:
+        matched_str = match.group(0)
+        detections = json.loads(matched_str)
+        for detection in detections:
+            cat, bbox_str = detection['category'], detection['bbox']
+            bbox_str = bbox_str.replace('<bbox>', '').replace('</bbox>', '').replace('</bbox', '')
+            bbox = list(map(float, bbox_str.split(' ')))
+            bboxes.append(bbox)
+            categories.append(cat)
+    if not bboxes:
+        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
+        box_matches = re.finditer(box_pattern, message)
+        bboxes = [
+            [float(match.group(1)), float(match.group(2)),
+             float(match.group(3)), float(match.group(4))]
+            for match in box_matches
+        ]
+
+    points = []
+    if not bboxes:
+        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
+        point_matches = re.finditer(point_pattern, message)
+        points = [
+            [float(match.group(1)), float(match.group(2))]
+            for match in point_matches
+        ]
+
+    if not bboxes and not points:
+        return
+
+    bboxes = np.array(bboxes, dtype='float') / 1000
+    points = np.array(points, dtype='float') / 1000
+
+    image = cv2.imread(image_path)
+    h, w, c = image.shape
+    if bboxes.size:
+        bboxes[:, 0::2] *= w
+        bboxes[:, 1::2] *= h
+    if points.size:
+        points[:, 0] *= w
+        points[:, 1] *= h
+    output_image = draw_boxes_points_with_labels(image, bboxes, points, categories)
+    return output_image
+
+def general_chat(inputs: dict, gr_history: list, infer_history: list,
+                 if_thinking: bool, temperature: float, online: bool = False):
+    if 'text' in inputs:
+        inputs['text'] = remove_escape(inputs['text'])
+    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
+    for response_text, infer_history, finished in infer(inputs=inputs,
+                                                        history=infer_history,
+                                                        mode=mode,
+                                                        temperature=temperature,
+                                                        online=online):
+        if if_thinking:
+            reasoning_text, response_text = response_text.split('</think>')
+            reasoning_text = reasoning_text.lstrip('<think>')
+            response_message = [{
+                "role": "assistant",
+                "content": add_escape(reasoning_text),
+                'metadata': {
+                    'title': '🤔 Thinking'
+                }
+            }, {
+                "role": "assistant",
+                "content": add_escape(response_text)
+            }]
+        else:
+            response_message = [{
+                "role": "assistant",
+                "content": add_escape(response_text)
+            }]
+        if finished and len(inputs.get('files', [])) == 1 and not inputs['files'][0].endswith('.mp4'):
+            image_path = inputs['files'][0]
+            response_text = infer_history[-1]['content']
+            try:
+                if if_thinking:
+                    reasoning_text, response_text = response_text.split('</think>')
+                output_image = plot_boxes_points_detections(image_path, response_text)
+                if output_image is not None:
+                    response_message.append({
+                        "role": "assistant",
+                        "content": gr.Image(output_image),
+                    })
+            except Exception as e:
+                print(e)
+        yield response_message, infer_history
+
+def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
+                       gr_counter: int, infer_history: list, if_thinking: bool,
+                       temperature: float):
+    if not gr_webcam_images:
+        gr_webcam_images = []
+    gr_webcam_images = gr_webcam_images[gr_counter:]
+    inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
+    yield f'received {len(gr_webcam_images)} new frames, processing...', gr_counter + len(
+        gr_webcam_images), infer_history
+    for response_message, infer_history in general_chat(
+            inputs, gr_history, infer_history, if_thinking, temperature, online=True):
+        yield response_message, gr.skip(), infer_history
+
+
+with gr.Blocks() as demo:
+    with gr.Column():
+        gr_title = gr.Markdown('# Seed1.5-VL')
+        with gr.Row():
+            gr.Markdown(
+                """
+                <div style="display:flex; flex-direction:column; gap:10px;">
+                <a
+                    href="https://github.com/ByteDance-Seed/Seed1.5-VL"
+                    target="_blank"
+                    style="
+                    display: inline-flex;
+                    align-items: center;
+                    gap: 8px;
+                    white-space: nowrap;
+                    text-decoration: none;
+                    "
+                >
+                    <img
+                    src="https://cdn.jsdelivr.net/gh/devicons/devicon/icons/github/github-original.svg"
+                    alt="GitHub"
+                    width="24"
+                    >
+                    Seed1.5-VL Cookbook
+                </a>
+                </div>
+                """
+            )
+            gr.Markdown(
+                """
+                <div style="display:flex; flex-direction:column; gap:10px;">
+                <a
+                    href="https://huggingface.co/papers/2505.07062"
+                    target="_blank"
+                    style="
+                    display: inline-flex;
+                    align-items: center;
+                    gap: 8px;
+                    white-space: nowrap;
+                    text-decoration: none;
+                    "
+                >
+                    <img
+                    src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
+                    alt="Paper"
+                    width="24"
+                    >
+                    Seed1.5-VL Paper
+                </a>
+                </div>
+                """,
+            )
+            gr.Markdown(' ')
+            gr.Markdown(' ')
+            gr.Markdown(' ')
+            gr.Markdown(' ')
+        with gr.Row():
+            gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
+                                           value="English",
+                                           label="🌐 English Interface/中文界面",
+                                           interactive=True,
+                                           min_width=400,
+                                           scale=0)
+
+    with gr.Tabs():
+        with gr.Tab("Offline") as gr_tab_ofl:
+            gr_infer_history = gr.State([])
+            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
+            gr_temperature_hidden = gr.Slider(minimum=0.0,
+                                              maximum=2.0,
+                                              step=0.1,
+                                              value=0.0,
+                                              interactive=True,
+                                              visible=False)
+            gr_chatinterface_ofl = gr.ChatInterface(
+                fn=general_chat,
+                type="messages",
+                multimodal=True,
+                textbox=gr.MultimodalTextbox(
+                    file_count="multiple",
+                    file_types=["image", ".mp4"],
+                    sources=["upload"],
+                    stop_btn=True,
+                    placeholder=label_translations[
+                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
+                ),
+                additional_inputs=[
+                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
+                ],
+                additional_outputs=[gr_infer_history],
+            )
+            def add_escape_fn(inputs: dict):
+                if inputs and 'text' in inputs:
+                    inputs['text'] = add_escape(inputs['text'])
+                return inputs
+            gr_chatinterface_ofl.textbox.submit(
+                fn=add_escape_fn,
+                inputs=[gr_chatinterface_ofl.saved_input],
+                outputs=[gr_chatinterface_ofl.saved_input]
+            )
+            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
+                  fn=lambda: [],
+                  outputs=[gr_infer_history])
+            with gr.Row():
+                gr_thinking_ofl = gr.Checkbox(
+                    value=True,
+                    label=label_translations['gr_thinking']['English'],
+                )
+                gr_thinking_ofl.change(lambda x: x,
+                                       inputs=gr_thinking_ofl,
+                                       outputs=gr_thinking_hidden)
+                gr_temperature_ofl = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    step=0.1,
+                    value=0.0,
+                    label=label_translations['gr_temperature']['English'],
+                    interactive=True)
+                gr_temperature_ofl.change(lambda x: x,
+                                          inputs=gr_temperature_ofl,
+                                          outputs=gr_temperature_hidden)
+                gr_clear_button_ofl = gr.Button(value=label_translations['gr_clear_button']['English'])
+                def clear_history_fn():
+                    return None, [], [], [], []
+                gr_clear_button_ofl.click(
+                    fn=clear_history_fn,
+                    outputs=[
+                        gr_chatinterface_ofl.conversation_id,
+                        gr_chatinterface_ofl.saved_conversations,
+                        gr_chatinterface_ofl.chatbot,
+                        gr_chatinterface_ofl.chatbot_state,
+                        gr_infer_history
+                    ]
+                )
+            with gr.Column(visible=True) as gr_examples_en:
+                gr.Examples(
+                    label='7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
+                    examples=[
+                        {
+                            "text": "Who are you?",
+                            "files": []
+                        },
+                        {
+                            "text": "Introduce this.",
+                            "files": ["examples/bancopy.jpg"]
+                        },
+                        {
+                            "text":
+                            """Find Curry's "Good Night" celebration time.""",
+                            "files":
+                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
+                        },
+                        {
+                            "text":
+                            "Share your feelings.",
+                            "files": [
+                                "examples/newyork.jpg",
+                                "examples/beijing.jpg"
+                            ]
+                        },
+                        {
+                            "text": "Look and answer.",
+                            "files": ["examples/puzzle.png"]
+                        },
+                        {
+                            "text": "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
+                            "files": ["examples/000000001000.jpeg"]
+                        },
+                        {
+                            "text": """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
+                            "files": ["examples/000000018380.jpeg"]
+                        }
+                    ],
+                    inputs=[gr_chatinterface_ofl.textbox],
+                )
+            with gr.Column(visible=False) as gr_examples_cn:
+                gr.Examples(
+                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
+                    examples=[
+                        {
+                            "text": "你是谁?",
+                            "files": []
+                        },
+                        {
+                            "text": "介绍一下。",
+                            "files": ["examples/bancopy.jpg"]
+                        },
+                        {
+                            "text":
+                            "找到库里的“晚安”庆祝时间段。",
+                            "files":
+                            ["examples/I7pTpMjqNRM_1080p_small.mp4"]
+                        },
+                        {
+                            "text":
+                            "你有什么感想?",
+                            "files": [
+                                "examples/newyork.jpg",
+                                "examples/beijing.jpg"
+                            ]
+                        },
+                        {
+                            "text": "看图回答。",
+                            "files": ["examples/puzzle.png"]
+                        },
+                        {
+                            "text": "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
+                            "files": ["examples/000000001000.jpeg"]
+                        },
+                        {
+                            "text": """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
+                            "files": ["examples/000000018380.jpeg"]
+                        }
+                    ],
+                    inputs=[gr_chatinterface_ofl.textbox],
+                )
+        with gr.Tab("Online") as gr_tab_ol:
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr_infer_history_ol = gr.State([])
+                    gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
+                    gr_temperature_hidden = gr.Slider(minimum=0.0,
+                                                      maximum=2.0,
+                                                      step=0.1,
+                                                      value=1.0,
+                                                      interactive=True,
+                                                      visible=False)
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            gr_webcam_image = gr.Image(
+                                label=label_translations['gr_webcam_image']
+                                ['English'],
+                                sources="webcam",
+                                height=250,
+                                type='filepath')
+                            gr_webcam_images = gr.Gallery(
+                                label=label_translations['gr_webcam_images']
+                                ['English'],
+                                show_label=True,
+                                format='webp',
+                                columns=1,
+                                height=250,
+                                preview=True,
+                                interactive=False)
+                            gr_counter = gr.Number(value=0, visible=False)
+                with gr.Column(scale=3):
+                    gr_chatinterface_ol = gr.ChatInterface(
+                        fn=online_record_chat,
+                        type="messages",
+                        multimodal=False,
+                        textbox=gr.
+                        Textbox(placeholder=label_translations[
+                            'gr_chatinterface_ol.textbox.placeholder']
+                                ['English'],
+                                submit_btn=True,
+                                stop_btn=True),
+                        additional_inputs=[
+                            gr_webcam_images, gr_counter,
+                            gr_infer_history_ol, gr_thinking_hidden,
+                            gr_temperature_hidden
+                        ],
+                        additional_outputs=[
+                            gr_counter, gr_infer_history_ol
+                        ],
+                    )
+
+                    def cache_webcam(recorded_image: str,
+                                     recorded_images: list):
+                        if not recorded_images:
+                            recorded_images = []
+                        return recorded_images + [recorded_image]
+
+                    gr_webcam_image.stream(
+                        fn=cache_webcam,
+                        inputs=[gr_webcam_image, gr_webcam_images],
+                        outputs=[gr_webcam_images],
+                        stream_every=1,
+                        concurrency_limit=30,
+                    )
+                    with gr.Row():
+                        gr_thinking_ol = gr.Checkbox(
+                            value=True,
+                            label=label_translations['gr_thinking']
+                            ['English'],
+                        )
+                        gr_thinking_ol.change(
+                            lambda x: x,
+                            inputs=gr_thinking_ol,
+                            outputs=gr_thinking_hidden)
+                        gr_temperature_ol = gr.Slider(
+                            minimum=0.0,
+                            maximum=2.0,
+                            step=0.1,
+                            value=1.0,
+                            label=label_translations['gr_temperature']
+                            ['English'],
+                            interactive=True)
+                        gr_temperature_ol.change(
+                            lambda x: x,
+                            inputs=gr_temperature_ol,
+                            outputs=gr_temperature_hidden)
+                        gr_clear_button_ol = gr.Button(value=label_translations['gr_clear_button']['English'])
+                        def clear_history_fn():
+                            return None, [], [], [], []
+                        gr_clear_button_ol.click(
+                            fn=clear_history_fn,
+                            outputs=[
+                                gr_chatinterface_ol.conversation_id,
+                                gr_chatinterface_ol.saved_conversations,
+                                gr_chatinterface_ol.chatbot,
+                                gr_chatinterface_ol.chatbot_state,
+                                gr_infer_history_ol
+                            ]
+                        )
+
+    def update_lang(lang: str):
+        return (
+            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
+            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
+            gr.update(placeholder=label_translations[
+                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
+            gr.update(placeholder=label_translations[
+                'gr_chatinterface_ol.textbox.placeholder'][lang]),
+            gr.update(label=label_translations['gr_tab_ofl'][lang]),
+            gr.update(label=label_translations['gr_tab_ol'][lang]),
+            gr.update(label=label_translations['gr_thinking'][lang]),
+            gr.update(label=label_translations['gr_thinking'][lang]),
+            gr.update(label=label_translations['gr_temperature'][lang]),
+            gr.update(label=label_translations['gr_temperature'][lang]),
+            gr.update(visible=lang == 'English'),
+            gr.update(visible=lang != 'English'),
+            gr.update(label=label_translations['gr_webcam_image'][lang]),
+            gr.update(label=label_translations['gr_webcam_images'][lang]),
+            gr.update(value=label_translations['gr_clear_button'][lang]),
+            gr.update(value=label_translations['gr_clear_button'][lang]),
+        )
+
+    gr_lang_selector.change(fn=update_lang,
+                            inputs=[gr_lang_selector],
+                            outputs=[
+                                gr_chatinterface_ofl.chatbot,
+                                gr_chatinterface_ol.chatbot,
+                                gr_chatinterface_ofl.textbox,
+                                gr_chatinterface_ol.textbox,
+                                gr_tab_ofl,
+                                gr_tab_ol,
+                                gr_thinking_ofl,
+                                gr_thinking_ol,
+                                gr_temperature_ofl,
+                                gr_temperature_ol,
+                                gr_examples_en,
+                                gr_examples_cn,
+                                gr_webcam_image,
+                                gr_webcam_images,
+                                gr_clear_button_ofl,
+                                gr_clear_button_ol,
+                            ])
+demo.queue(default_concurrency_limit=100, max_size=100).launch(share=True,
+                                                               max_threads=100,
+                                                               ssr_mode=False)
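A quick worked note on the coordinate convention used by plot_boxes_points_detections above: the <bbox> and <point> values in a model reply are interpreted as thousandths of the image width and height. As a minimal sketch (not part of the committed files), assuming a hypothetical 1920x1080 image and the reply "<bbox>100 200 500 800</bbox>", the scaling to pixels works out as follows:

# Sketch of the bbox scaling in plot_boxes_points_detections (assumed 1920x1080 image).
import numpy as np

w, h = 1920, 1080
bboxes = np.array([[100.0, 200.0, 500.0, 800.0]]) / 1000  # thousandths -> fraction of image
bboxes[:, 0::2] *= w  # x1, x2 -> pixels
bboxes[:, 1::2] *= h  # y1, y2 -> pixels
print(bboxes)  # [[192. 216. 960. 864.]]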
infer.py
ADDED
@@ -0,0 +1,299 @@
+# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
+import json
+import time
+import math
+import base64
+import requests
+
+import torch
+import decord
+import numpy as np
+from PIL import Image, ImageSequence
+from torchvision.io import read_image, encode_jpeg
+from torchvision.transforms.functional import resize
+from torchvision.transforms import InterpolationMode
+
+
+class ConversationModeI18N:
+    G = "General"
+    D = "Deep Thinking"
+
+
+class ConversationModeCN:
+    G = "常规"
+    D = "深度思考"
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+def get_resized_hw_for_Navit(
+    height: int,
+    width: int,
+    min_pixels: int,
+    max_pixels: int,
+    max_ratio: int = 200,
+    factor: int = 28,
+):
+    if max(height, width) / min(height, width) > max_ratio:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return int(h_bar), int(w_bar)
+
+
+class SeedVLInfer:
+    def __init__(
+        self,
+        model_id: str,
+        api_key: str,
+        base_url: str = 'https://ark.cn-beijing.volces.com/api/v3/chat/completions',
+        min_pixels: int = 4 * 28 * 28,
+        max_pixels: int = 5120 * 28 * 28,
+        video_sampling_strategy: dict = {
+            'sampling_fps':
+            1,
+            'min_n_frames':
+            16,
+            'max_video_length':
+            81920,
+            'max_pixels_choices': [
+                640 * 28 * 28, 512 * 28 * 28, 384 * 28 * 28, 256 * 28 * 28,
+                160 * 28 * 28, 128 * 28 * 28
+            ],
+            'use_timestamp':
+            True,
+        },
+    ):
+        self.base_url = base_url
+        self.api_key = api_key
+        self.model_id = model_id
+        self.min_pixels = min_pixels
+        self.max_pixels = max_pixels
+        self.sampling_fps = video_sampling_strategy.get('sampling_fps', 1)
+        self.min_n_frames = video_sampling_strategy.get('min_n_frames', 16)
+        self.max_video_length = video_sampling_strategy.get(
+            'max_video_length', 81920)
+        self.max_pixels_choices = video_sampling_strategy.get(
+            'max_pixels_choices', [
+                640 * 28 * 28, 512 * 28 * 28, 384 * 28 * 28, 256 * 28 * 28,
+                160 * 28 * 28, 128 * 28 * 28
+            ])
+        self.use_timestamp = video_sampling_strategy.get('use_timestamp', True)
+
+    def preprocess_video(self, video_path: str):
+        try:
+            video_reader = decord.VideoReader(video_path, num_threads=2)
+            fps = video_reader.get_avg_fps()
+        except decord._ffi.base.DECORDError:
+            video_reader = [
+                frame.convert('RGB')
+                for frame in ImageSequence.Iterator(Image.open(video_path))
+            ]
+            fps = 1
+
+        length = len(video_reader)
+        n_frames = min(
+            max(math.ceil(length / fps * self.sampling_fps),
+                self.min_n_frames), length)
+        frame_indices = np.linspace(0, length - 1,
+                                    n_frames).round().astype(int).tolist()
+        max_pixels = self.max_pixels
+        for round_idx, max_pixels in enumerate(self.max_pixels_choices):
+            is_last_round = round_idx == len(self.max_pixels_choices) - 1
+            if len(frame_indices
+                   ) * max_pixels / 28 / 28 > self.max_video_length:
+                if is_last_round:
+                    max_frame_num = int(self.max_video_length / max_pixels *
+                                        28 * 28)
+                    select_ids = np.linspace(
+                        0,
+                        len(frame_indices) - 1,
+                        max_frame_num).round().astype(int).tolist()
+                    frame_indices = [
+                        frame_indices[select_id] for select_id in select_ids
+                    ]
+                else:
+                    continue
+            else:
+                break
+
+        if hasattr(video_reader, "get_batch"):
+            video_clip = torch.from_numpy(
+                video_reader.get_batch(frame_indices).asnumpy()).permute(
+                    0, 3, 1, 2)
+        else:
+
video_clip_array = torch.stack(
|
149 |
+
[np.array(video_reader[i]) for i in frame_indices], dim=0)
|
150 |
+
video_clip = torch.from_numpy(video_clip_array).permute(0, 3, 1, 2)
|
151 |
+
|
152 |
+
height, width = video_clip.shape[-2:]
|
153 |
+
resized_height, resized_width = get_resized_hw_for_Navit(
|
154 |
+
height,
|
155 |
+
width,
|
156 |
+
min_pixels=self.min_pixels,
|
157 |
+
max_pixels=max_pixels,
|
158 |
+
)
|
159 |
+
resized_video_clip = resize(video_clip,
|
160 |
+
(resized_height, resized_width),
|
161 |
+
interpolation=InterpolationMode.BICUBIC,
|
162 |
+
antialias=True)
|
163 |
+
if self.use_timestamp:
|
164 |
+
resized_video_clip = [
|
165 |
+
(round(i / fps, 1), f)
|
166 |
+
for i, f in zip(frame_indices, resized_video_clip)
|
167 |
+
]
|
168 |
+
return resized_video_clip
|
169 |
+
|
170 |
+
def preprocess_streaming_frame(self, frame: torch.Tensor):
|
171 |
+
height, width = frame.shape[-2:]
|
172 |
+
resized_height, resized_width = get_resized_hw_for_Navit(
|
173 |
+
height,
|
174 |
+
width,
|
175 |
+
min_pixels=self.min_pixels,
|
176 |
+
max_pixels=self.max_pixels_choices[0],
|
177 |
+
)
|
178 |
+
resized_frame = resize(frame[None], (resized_height, resized_width),
|
179 |
+
interpolation=InterpolationMode.BICUBIC,
|
180 |
+
antialias=True)[0]
|
181 |
+
return resized_frame
|
182 |
+
|
183 |
+
def encode_image(self, image: torch.Tensor) -> str:
|
184 |
+
if image.shape[0] == 4:
|
185 |
+
image = image[:3]
|
186 |
+
encoded = encode_jpeg(image)
|
187 |
+
return base64.b64encode(encoded.numpy()).decode('utf-8')
|
188 |
+
|
189 |
+
def construct_messages(self,
|
190 |
+
inputs: dict,
|
191 |
+
streaming_timestamp: int = None,
|
192 |
+
online: bool = False) -> list[dict]:
|
193 |
+
content = []
|
194 |
+
for i, path in enumerate(inputs.get('files', [])):
|
195 |
+
if path.endswith('.mp4'):
|
196 |
+
video = self.preprocess_video(video_path=path)
|
197 |
+
for frame in video:
|
198 |
+
if self.use_timestamp:
|
199 |
+
timestamp, frame = frame
|
200 |
+
content.append({
|
201 |
+
"type": "text",
|
202 |
+
"text": f'[{timestamp} second]',
|
203 |
+
})
|
204 |
+
content.append({
|
205 |
+
"type": "image_url",
|
206 |
+
"image_url": {
|
207 |
+
"url":
|
208 |
+
f"data:image/jpeg;base64,{self.encode_image(frame)}",
|
209 |
+
"detail": "high"
|
210 |
+
},
|
211 |
+
})
|
212 |
+
else:
|
213 |
+
image = read_image(path)
|
214 |
+
if online and path.endswith('.webp'):
|
215 |
+
streaming_timestamp = i
|
216 |
+
if streaming_timestamp is not None:
|
217 |
+
image = self.preprocess_streaming_frame(frame=image)
|
218 |
+
content.append({
|
219 |
+
"type": "image_url",
|
220 |
+
"image_url": {
|
221 |
+
"url":
|
222 |
+
f"data:image/jpeg;base64,{self.encode_image(image)}",
|
223 |
+
"detail": "high"
|
224 |
+
},
|
225 |
+
})
|
226 |
+
if streaming_timestamp is not None:
|
227 |
+
content.insert(-1, {
|
228 |
+
"type": "text",
|
229 |
+
"text": f'[{streaming_timestamp} second]',
|
230 |
+
})
|
231 |
+
query = inputs.get('text', '')
|
232 |
+
if query:
|
233 |
+
content.append({
|
234 |
+
"type": "text",
|
235 |
+
"text": query,
|
236 |
+
})
|
237 |
+
messages = [{
|
238 |
+
"role": "user",
|
239 |
+
"content": content,
|
240 |
+
}]
|
241 |
+
return messages
|
242 |
+
|
243 |
+
def request(self,
|
244 |
+
messages,
|
245 |
+
thinking: bool = True,
|
246 |
+
temperature: float = 1.0):
|
247 |
+
headers = {
|
248 |
+
"Authorization": f"Bearer {self.api_key}",
|
249 |
+
"Content-Type": "application/json"
|
250 |
+
}
|
251 |
+
payload = {
|
252 |
+
"model": self.model_id,
|
253 |
+
"messages": messages,
|
254 |
+
"stream": True,
|
255 |
+
"thinking": {
|
256 |
+
"type": "enabled" if thinking else "disabled",
|
257 |
+
},
|
258 |
+
"temperature": temperature,
|
259 |
+
}
|
260 |
+
for _ in range(10):
|
261 |
+
try:
|
262 |
+
requested = requests.post(self.base_url,
|
263 |
+
headers=headers,
|
264 |
+
json=payload,
|
265 |
+
stream=True,
|
266 |
+
timeout=600)
|
267 |
+
break
|
268 |
+
except Exception as e:
|
269 |
+
time.sleep(0.1)
|
270 |
+
print(e)
|
271 |
+
content, reasoning_content = '', ''
|
272 |
+
for line in requested.iter_lines():
|
273 |
+
if not line:
|
274 |
+
continue
|
275 |
+
if line.startswith(b'data:'):
|
276 |
+
data = line[len("data: "):]
|
277 |
+
if data == b"[DONE]":
|
278 |
+
yield content, reasoning_content, True
|
279 |
+
break
|
280 |
+
delta = json.loads(data)['choices'][0]['delta']
|
281 |
+
content += delta['content']
|
282 |
+
reasoning_content += delta.get('reasoning_content', '')
|
283 |
+
yield content, reasoning_content, False
|
284 |
+
|
285 |
+
def __call__(self,
|
286 |
+
inputs: dict,
|
287 |
+
history: list[dict] = [],
|
288 |
+
mode: str = ConversationModeI18N.D,
|
289 |
+
temperature: float = 1.0,
|
290 |
+
online: bool = False):
|
291 |
+
messages = self.construct_messages(inputs=inputs, online=online)
|
292 |
+
updated_history = history + messages
|
293 |
+
for response, reasoning, finished in self.request(
|
294 |
+
messages=updated_history,
|
295 |
+
thinking=mode == ConversationModeI18N.D,
|
296 |
+
temperature=temperature):
|
297 |
+
if mode == ConversationModeI18N.D:
|
298 |
+
response = '<think>' + reasoning + '</think>' + response
|
299 |
+
yield response, updated_history + [{'role': 'assistant', 'content': response}], finished
|
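For orientation, a minimal sketch of driving the SeedVLInfer client above directly, outside the Gradio app. It assumes MODEL_ID and API_KEY are environment variables holding valid Volcengine Ark credentials and that a local test image exists at the hypothetical path example.jpg:

# Usage sketch only; MODEL_ID, API_KEY and example.jpg are assumptions, not part of the commit.
import os
from infer import SeedVLInfer, ConversationModeI18N

client = SeedVLInfer(model_id=os.getenv('MODEL_ID'), api_key=os.getenv('API_KEY'))
inputs = {'text': 'Describe this image.', 'files': ['example.jpg']}
for response, history, finished in client(inputs=inputs,
                                          history=[],
                                          mode=ConversationModeI18N.D,
                                          temperature=0.0):
    if finished:
        # In Deep Thinking mode the streamed reply is '<think>...</think>' + answer.
        print(response.split('</think>')[-1])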
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+supervision==0.25.1
+openai==1.76.0
+opencv-python==4.10.0.84
+numpy==1.26.2
+pillow==11.0.0
+matplotlib==3.10.0
+decord==0.6.0
visualizer.py
ADDED
@@ -0,0 +1,126 @@
+import os
+import cv2
+import numpy as np
+import supervision as sv
+from PIL import Image, ImageDraw, ImageFont
+from supervision.annotators.utils import resolve_color
+# visualization tools based on supervision
+BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=2)
+
+class LabelAnnotator(sv.LabelAnnotator):
+
+    @staticmethod
+    def resolve_text_background_xyxy(
+        center_coordinates,
+        text_wh,
+        position,
+    ):
+        center_x, center_y = center_coordinates
+        text_w, text_h = text_wh
+        return center_x, center_y, center_x + text_w, center_y + text_h
+
+    def _draw_labels(
+        self,
+        scene: np.ndarray,
+        labels: list[str],
+        label_properties: np.ndarray,
+        detections,
+        custom_color_lookup,
+    ) -> None:
+        assert len(labels) == len(label_properties) == len(detections), (
+            f"Number of label properties ({len(label_properties)}), "
+            f"labels ({len(labels)}) and detections ({len(detections)}) "
+            "do not match."
+        )
+
+        color_lookup = (
+            custom_color_lookup
+            if custom_color_lookup is not None
+            else self.color_lookup
+        )
+
+        font = ImageFont.truetype("simhei.ttf", int(30 * self.text_scale))
+
+        for idx, label_property in enumerate(label_properties):
+            background_color = resolve_color(
+                color=self.color,
+                detections=detections,
+                detection_idx=idx,
+                color_lookup=color_lookup,
+            )
+            text_color = resolve_color(
+                color=self.text_color,
+                detections=detections,
+                detection_idx=idx,
+                color_lookup=color_lookup,
+            )
+
+            box_xyxy = label_property[:4]
+            text_height_padded = label_property[4]
+            self.draw_rounded_rectangle(
+                scene=scene,
+                xyxy=box_xyxy,
+                color=background_color.as_bgr(),
+                border_radius=self.border_radius,
+            )
+
+            text_x = box_xyxy[0] + self.text_padding
+            text_y = box_xyxy[1]
+
+            scene_pil = Image.fromarray(cv2.cvtColor(scene, cv2.COLOR_BGR2RGB))
+            draw = ImageDraw.Draw(scene_pil)
+            draw.text(
+                (text_x, text_y),
+                labels[idx],
+                font=font,
+                fill=(text_color.r, text_color.g, text_color.b),
+            )
+            scene[:] = cv2.cvtColor(np.array(scene_pil), cv2.COLOR_RGB2BGR)
+
+
+LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
+                                 text_scale=0.5,
+                                 text_thickness=1)
+
+
+POINT_ANNOTATOR = sv.DotAnnotator(radius=6)
+
+def draw_boxes_points_with_labels(
+    cv2_image,
+    boxes=None,
+    points=None,
+    classes=None,
+    output_path=None,
+):
+    annotated_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
+
+    if boxes is not None and boxes.size:
+        detections = sv.Detections(
+            xyxy=boxes,
+            class_id=np.arange(len(boxes)),
+            confidence=np.ones(len(boxes))
+        )
+        annotated_image = BOUNDING_BOX_ANNOTATOR.annotate(
+            annotated_image, detections)
+    if points is not None and points.size:
+        points = np.concatenate([points, points], axis=1)
+        detections = sv.Detections(
+            xyxy=points,
+            class_id=np.arange(len(points)),
+            confidence=np.ones(len(points))
+        )
+        annotated_image = POINT_ANNOTATOR.annotate(
+            annotated_image, detections,
+        )
+    if classes:
+        annotated_image = LABEL_ANNOTATOR.annotate(
+            annotated_image, detections, labels=classes
+        )
+
+    if output_path:
+        cv2.imwrite(
+            output_path,
+            cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
+        )
+
+    return annotated_image
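To try the helper above in isolation, a small usage sketch. It assumes a hypothetical example.jpg in the working directory, pixel-space box coordinates (app.py scales the model's 0-1000 outputs to pixels before calling this), and the simhei.ttf font shipped with the Space for label rendering:

# Usage sketch only; example.jpg and the 'hat' label are assumptions, coordinates are in pixels.
import cv2
import numpy as np
from visualizer import draw_boxes_points_with_labels

image = cv2.imread('example.jpg')               # BGR image, as cv2 loads it
boxes = np.array([[50.0, 60.0, 200.0, 180.0]])  # one box: x1 y1 x2 y2 in pixels
annotated = draw_boxes_points_with_labels(image, boxes, classes=['hat'],
                                          output_path='annotated.jpg')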