"""
    PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API)
"""
import os
import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils
import transformers, accelerate, numpy as np

print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

# ---- Config ----
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"  # your LoRA
ADAPTER_REV  = "master"
OFFLOAD_DIR  = "offload"
DTYPE        = torch.float16

# ---- Processor (no GPU) ----
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

# ---- Base model ON CPU (do NOT touch CUDA here) ----
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)

# ---- Attach LoRA ON CPU ----
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()

_model_on_gpu = False  # once-per-session move

# ---- Inference on GPU (ZeroGPU pattern) ----
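# On ZeroGPU Spaces, CUDA is only available inside functions decorated with
# @spaces.GPU, which is why the weights stay on CPU until the first request
# moves them. `duration=120` is the expected upper bound (seconds) of GPU time per call.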
@spaces.GPU(duration=120)
def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
    """
    Uses Qwen2.5-VL chat template + qwen_vl_utils to prepare image+text, then generate.
    """
    global _model_on_gpu
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True

    # Build chat messages in Qwen format
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text": question or "Describe this image."},
        ],
    }]

    # Processor helpers
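    # apply_chat_template renders the Qwen chat prompt string (with the generation
    # cue appended); process_vision_info extracts the PIL image(s)/videos from
    # `messages` so the processor can preprocess pixels alongside the tokenized text.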
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    # Pack tensors on GPU
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
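    # Move only tensor values to the GPU; any non-tensor entries pass through unchanged.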
    inputs = {k: (v.to("cuda") if hasattr(v, "to") else v) for k, v in inputs.items()}

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
            temperature=temperature,
            top_p=top_p,
        )

    # Trim prompt tokens before decode (Qwen style)
    trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
    return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

# ---- UI ----
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo:
    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer)

demo.queue().launch(show_api=False)