"""
    PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces                # <-- NEW: import spaces for ZeroGPU
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel

# 0. Environment tweaks for Accelerate (unchanged)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

# 1. Config
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV  = "master"
OFFLOAD_DIR  = "offload"

dtype = torch.float16        # <-- use float16 for GPU
# 2. Load processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

# 3. Load base model on CPU; stream shards to save RAM
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)

# 4. Attach LoRA adapter on CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
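# Note: model.merge_and_unload() could fold the LoRA weights into the base model
# for slightly faster inference, at the cost of extra CPU RAM during the merge;
# the adapter is kept separate here.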

# Keep track of whether the model has been moved to GPU
_model_on_gpu = False

# 5. Inference helper – run on GPU when called
@spaces.GPU                   # <-- NEW: request GPU for this function
def generate_answer(image, question: str,
                    temperature: float = 0.7,
                    top_p: float = 0.95,
                    max_tokens: int = 256):
    global _model_on_gpu
    # provide a placeholder image if none was uploaded
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # move model to GPU once
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True
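    # (ZeroGPU attaches the GPU only while a @spaces.GPU-decorated call is
    # running, so all CUDA work has to happen inside this function.)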

    # prepare inputs on GPU; route the question through the chat template so the
    # processor inserts the image placeholder tokens Qwen2.5-VL expects
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image],
                       return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(**inputs,
                                    max_new_tokens=int(max_tokens),
                                    do_sample=True,   # honour the temperature/top-p sliders
                                    temperature=temperature,
                                    top_p=top_p)
    # decode on CPU, dropping the prompt tokens so only the new answer remains
    generated = output_ids[:, inputs["input_ids"].shape[1]:].to("cpu")
    return processor.batch_decode(generated, skip_special_tokens=True)[0]

# 6. Gradio UI (unchanged except for title)
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown(
        "## PetBull‑7B‑VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer  = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)

demo.queue().launch()
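
# Local smoke test (assuming this file is saved as the Space's app.py and a CUDA
# GPU is available): run `python app.py`, open the printed local URL, upload a
# photo and ask a question.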