warshanks committed on
Commit b60fb62 · 1 Parent(s): cb74b60
Files changed (5)
  1. README.md +10 -6
  2. app.py +210 -44
  3. requirements.txt +251 -1
  4. style.css +11 -0
  5. uv.lock +0 -0
README.md CHANGED
@@ -1,12 +1,16 @@
 ---
-title: Medgemma 4b
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
+title: MedGemma 4B IT
+models: [google/medgemma-4b-it]
+preload_from_hub: google/medgemma-4b-it
+emoji: 🩻
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.0.1
+sdk_version: 5.21.0
 app_file: app.py
 pinned: false
+thumbnail: >-
+  https://cdn-uploads.huggingface.co/production/uploads/67340377534ff3213928481b/f2kd9Zs0G-chH0ZwfDSOT.png
 ---

-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
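The updated front matter pins Gradio 5.21.0 and preloads google/medgemma-4b-it. For orientation, here is a minimal sketch of querying that checkpoint directly with transformers; it is not part of this commit and assumes a bfloat16-capable GPU, a transformers build with Gemma 3 support, and a placeholder image path (`chest_xray.png`):

```python
import torch
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

model_id = "google/medgemma-4b-it"
processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)

# One user turn with a single local image; "chest_xray.png" is a placeholder path.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "chest_xray.png"},
            {"type": "text", "text": "Describe any notable findings in this X-ray."},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(device=model.device, dtype=torch.bfloat16)

with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=256)

# Drop the prompt tokens so only the generated answer is decoded.
answer = processor.decode(output_ids[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(answer)
```

The new app.py below does essentially the same thing, with streaming output, chat history, and video support layered on top.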
app.py CHANGED
@@ -1,64 +1,230 @@
 import gradio as gr
-from huggingface_hub import InferenceClient

-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]

-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})

-    messages.append({"role": "user", "content": message})

-    response = ""

-    for message in client.chat_completion(
         messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content

-        response += token
-        yield response


 """
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
-    respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )

-
 if __name__ == "__main__":
     demo.launch()

+#!/usr/bin/env python
+
+import os
+import re
+import tempfile
+from collections.abc import Iterator
+from threading import Thread
+
+import cv2
 import gradio as gr
+import spaces
+import torch
+from loguru import logger
+from PIL import Image
+from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
+
+model_id = os.getenv("MODEL_ID", "google/medgemma-4b-it")
+processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
+model = Gemma3ForConditionalGeneration.from_pretrained(
+    model_id, device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="eager"
+)
+
+MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
+
+
+def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
+    image_count = 0
+    video_count = 0
+    for path in paths:
+        if path.endswith(".mp4"):
+            video_count += 1
+        else:
+            image_count += 1
+    return image_count, video_count
+
+
+def count_files_in_history(history: list[dict]) -> tuple[int, int]:
+    image_count = 0
+    video_count = 0
+    for item in history:
+        if item["role"] != "user" or isinstance(item["content"], str):
+            continue
+        if item["content"][0].endswith(".mp4"):
+            video_count += 1
+        else:
+            image_count += 1
+    return image_count, video_count
+
+
+def validate_media_constraints(message: dict, history: list[dict]) -> bool:
+    new_image_count, new_video_count = count_files_in_new_message(message["files"])
+    history_image_count, history_video_count = count_files_in_history(history)
+    image_count = history_image_count + new_image_count
+    video_count = history_video_count + new_video_count
+    if video_count > 1:
+        gr.Warning("Only one video is supported.")
+        return False
+    if video_count == 1:
+        if image_count > 0:
+            gr.Warning("Mixing images and videos is not allowed.")
+            return False
+        if "<image>" in message["text"]:
+            gr.Warning("Using <image> tags with video files is not supported.")
+            return False
+    if video_count == 0 and image_count > MAX_NUM_IMAGES:
+        gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
+        return False
+    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
+        gr.Warning("The number of <image> tags in the text does not match the number of images.")
+        return False
+    return True
+
+
+def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
+    vidcap = cv2.VideoCapture(video_path)
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    frame_interval = max(total_frames // MAX_NUM_IMAGES, 1)
+    frames: list[tuple[Image.Image, float]] = []
+
+    for i in range(0, min(total_frames, MAX_NUM_IMAGES * frame_interval), frame_interval):
+        if len(frames) >= MAX_NUM_IMAGES:
+            break

+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))

+    vidcap.release()
+    return frames


+def process_video(video_path: str) -> list[dict]:
+    content = []
+    frames = downsample_video(video_path)
+    for frame in frames:
+        pil_image, timestamp = frame
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
+            pil_image.save(temp_file.name)
+            content.append({"type": "text", "text": f"Frame {timestamp}:"})
+            content.append({"type": "image", "url": temp_file.name})
+    logger.debug(f"{content=}")
+    return content


+def process_interleaved_images(message: dict) -> list[dict]:
+    logger.debug(f"{message['files']=}")
+    parts = re.split(r"(<image>)", message["text"])
+    logger.debug(f"{parts=}")
+
+    content = []
+    image_index = 0
+    for part in parts:
+        logger.debug(f"{part=}")
+        if part == "<image>":
+            content.append({"type": "image", "url": message["files"][image_index]})
+            logger.debug(f"file: {message['files'][image_index]}")
+            image_index += 1
+        elif part.strip():
+            content.append({"type": "text", "text": part.strip()})
+        elif isinstance(part, str) and part != "<image>":
+            content.append({"type": "text", "text": part})
+    logger.debug(f"{content=}")
+    return content
+
+
+def process_new_user_message(message: dict) -> list[dict]:
+    if not message["files"]:
+        return [{"type": "text", "text": message["text"]}]
+
+    if message["files"][0].endswith(".mp4"):
+        return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
+
+    if "<image>" in message["text"]:
+        return process_interleaved_images(message)
+
+    return [
+        {"type": "text", "text": message["text"]},
+        *[{"type": "image", "url": path} for path in message["files"]],
+    ]
+
+
+def process_history(history: list[dict]) -> list[dict]:
+    messages = []
+    current_user_content: list[dict] = []
+    for item in history:
+        if item["role"] == "assistant":
+            if current_user_content:
+                messages.append({"role": "user", "content": current_user_content})
+                current_user_content = []
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
+        else:
+            content = item["content"]
+            if isinstance(content, str):
+                current_user_content.append({"type": "text", "text": content})
+            else:
+                current_user_content.append({"type": "image", "url": content[0]})
+    return messages
+
+
+@spaces.GPU(duration=120)
+def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 2048) -> Iterator[str]:
+    if not validate_media_constraints(message, history):
+        yield ""
+        return
+
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
+    messages.extend(process_history(history))
+    messages.append({"role": "user", "content": process_new_user_message(message)})
+
+    inputs = processor.apply_chat_template(
         messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(device=model.device, dtype=torch.bfloat16)

+    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        inputs,
+        max_new_tokens=max_new_tokens,
+        streamer=streamer,
+        temperature=1.0,
+        top_p=0.95,
+        top_k=64,
+        min_p=0.0,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()

+    output = ""
+    for delta in streamer:
+        output += delta
+        yield output

+
+DESCRIPTION = """\
+This is a demo of MedGemma, a Gemma 3 variant trained for performance on medical text and image comprehension.
+You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
 """
+
 demo = gr.ChatInterface(
+    fn=run,
+    type="messages",
+    chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
+    textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
+    multimodal=True,
     additional_inputs=[
+        gr.Textbox(label="System Prompt", value=""),
+        gr.Slider(label="Max New Tokens", minimum=100, maximum=8192, step=10, value=2048),
     ],
+    stop_btn=False,
+    title="MedGemma 4B IT",
+    description=DESCRIPTION,
+    run_examples_on_click=False,
+    cache_examples=False,
+    css_paths="style.css",
+    delete_cache=(1800, 1800),
 )

 if __name__ == "__main__":
     demo.launch()
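Since `run` is wired into `gr.ChatInterface(type="messages", multimodal=True)`, it receives a multimodal message dict plus a flat history list from Gradio. A hedged sketch of the shapes the helper functions above assume (all paths and texts are made-up placeholders, not values from this commit):

```python
# Illustrative shapes only; the real values are supplied by Gradio at runtime.

# New user turn: the MultimodalTextbox submits the typed text plus uploaded file paths.
message = {
    "text": "Compare these two scans. <image> <image>",
    "files": ["/tmp/gradio/scan_a.png", "/tmp/gradio/scan_b.png"],  # placeholder paths
}

# Prior turns, as count_files_in_history/process_history expect them: each uploaded
# file appears as its own user entry whose content indexes to a path, text turns are
# plain strings, and assistant replies are strings. process_history() folds the user
# entries back into one multi-part chat message.
history = [
    {"role": "user", "content": ("/tmp/gradio/scan_a.png",)},
    {"role": "user", "content": "What does this scan show?"},
    {"role": "assistant", "content": "It appears to show ..."},
]

# validate_media_constraints(message, history) then checks that the number of <image>
# tags matches the newly uploaded images and that at most MAX_NUM_IMAGES images
# (or a single .mp4, alone) are in play before run() builds the prompt.
```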
requirements.txt CHANGED
@@ -1 +1,251 @@
-huggingface_hub==0.25.2
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+accelerate==1.4.0
+    # via gemma-3-12b-it (pyproject.toml)
+aiofiles==23.2.1
+    # via gradio
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.8.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+certifi==2025.1.31
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.1
+    # via requests
+click==8.1.8
+    # via
+    #   typer
+    #   uvicorn
+exceptiongroup==1.2.2
+    # via anyio
+fastapi==0.115.11
+    # via gradio
+ffmpy==0.5.0
+    # via gradio
+filelock==3.17.0
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+    #   triton
+fsspec==2025.3.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==5.21.0
+    # via
+    #   gemma-3-12b-it (pyproject.toml)
+    #   spaces
+gradio-client==1.7.2
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-transfer==0.1.9
+    # via gemma-3-12b-it (pyproject.toml)
+httpcore==1.0.7
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   safehttpx
+    #   spaces
+huggingface-hub==0.29.2
+    # via
+    #   accelerate
+    #   gradio
+    #   gradio-client
+    #   tokenizers
+    #   transformers
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+loguru==0.7.3
+    # via gemma-3-12b-it (pyproject.toml)
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+networkx==3.4.2
+    # via torch
+numpy==2.2.3
+    # via
+    #   accelerate
+    #   gradio
+    #   opencv-python-headless
+    #   pandas
+    #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==9.1.0.70
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.20.5
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
+opencv-python-headless==4.11.0.86
+    # via gemma-3-12b-it (pyproject.toml)
+orjson==3.10.15
+    # via gradio
+packaging==24.2
+    # via
+    #   accelerate
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+pandas==2.2.3
+    # via gradio
+pillow==11.1.0
+    # via gradio
+protobuf==6.30.0
+    # via gemma-3-12b-it (pyproject.toml)
+psutil==5.9.8
+    # via
+    #   accelerate
+    #   spaces
+pydantic==2.10.6
+    # via
+    #   fastapi
+    #   gradio
+    #   spaces
+pydantic-core==2.27.2
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.19.1
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-multipart==0.0.20
+    # via gradio
+pytz==2025.1
+    # via pandas
+pyyaml==6.0.2
+    # via
+    #   accelerate
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+regex==2024.11.6
+    # via transformers
+requests==2.32.3
+    # via
+    #   huggingface-hub
+    #   spaces
+    #   transformers
+rich==13.9.4
+    # via typer
+ruff==0.9.10
+    # via gradio
+safehttpx==0.1.6
+    # via gradio
+safetensors==0.5.3
+    # via
+    #   accelerate
+    #   transformers
+semantic-version==2.10.0
+    # via gradio
+sentencepiece==0.2.0
+    # via gemma-3-12b-it (pyproject.toml)
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+spaces==0.32.0
+    # via gemma-3-12b-it (pyproject.toml)
+starlette==0.46.1
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.13.3
+    # via torch
+tokenizers==0.21.0
+    # via transformers
+tomlkit==0.13.2
+    # via gradio
+torch==2.4.0
+    # via
+    #   gemma-3-12b-it (pyproject.toml)
+    #   accelerate
+tqdm==4.67.1
+    # via
+    #   huggingface-hub
+    #   transformers
+transformers @ git+https://github.com/huggingface/transformers@2829013d2d00e63d75a1f6f7a3f003bc60cc69af
+    # via gemma-3-12b-it (pyproject.toml)
+triton==3.0.0
+    # via torch
+typer==0.15.2
+    # via gradio
+typing-extensions==4.12.2
+    # via
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+    #   rich
+    #   spaces
+    #   torch
+    #   typer
+    #   uvicorn
+tzdata==2025.1
+    # via pandas
+urllib3==2.3.0
+    # via requests
+uvicorn==0.34.0
+    # via gradio
+websockets==15.0.1
+    # via gradio-client
style.css ADDED
@@ -0,0 +1,11 @@
+h1 {
+  text-align: center;
+  display: block;
+}
+
+#logo {
+  display: block;
+  margin: 0 auto;
+  width: 40%;
+  object-fit: contain;
+}
uv.lock ADDED
The diff for this file is too large to render. See raw diff