"""
    PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
import transformers, accelerate, numpy as np

print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)

# 0) Safer streaming for model shards
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

# 1) Config
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV  = "master"
OFFLOAD_DIR  = "offload"
DTYPE        = torch.float16
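# fp16 keeps the 7B-class base at roughly 2 bytes per parameter (on the order of
# 16 GB of weights); OFFLOAD_DIR is only used if shards ever spill to disk.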

# 2) Processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
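# AutoProcessor bundles the tokenizer, the image preprocessor and the chat
# template that inserts the vision placeholder tokens into the prompt.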

# 3) Load base model ON CPU (use the model's native VL class; AutoModelForCausalLM
#    cannot dispatch a Qwen2.5-VL checkpoint)
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)
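# device_map={"": "cpu"} keeps every shard on the CPU at import time; under
# ZeroGPU no GPU is attached while the Space is starting up.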

# 4) Attach LoRA ON CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
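# The LoRA weights stay separate from the frozen base and are applied at forward
# time; model.merge_and_unload() could bake them in if adapter overhead mattered.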

_model_on_gpu = False  # track once-per-session transfer
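# On ZeroGPU Spaces, CUDA only becomes available inside @spaces.GPU-decorated
# calls, so the weights are moved to the GPU lazily on the first request.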

# 5) Inference (request GPU only for this function)
@spaces.GPU(duration=120)
def generate_answer(
    image,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,
) -> str:
    global _model_on_gpu

    if image is None:
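        # Text-only questions still need an image for the VL processor, so fall
        # back to a blank placeholder when nothing is uploaded.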
        image = Image.new("RGB", (224, 224), color="white")

    # Move model to GPU once (inside GPU-decorated function)
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True

    # Prepare inputs on GPU (Qwen2.5-VL expects a chat-templated prompt that
    # carries an image placeholder rather than raw text)
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,  # so the temperature/top_p sliders actually take effect
            temperature=temperature,
            top_p=top_p,
        )

    # Decode only the newly generated tokens, not the echoed prompt
    prompt_len = inputs["input_ids"].shape[1]
    outputs = output_ids[:, prompt_len:].to("cpu")
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

# 6) UI
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()
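
# Local smoke test (illustrative sketch only): assumes a CUDA GPU, a sample image
# "pet.jpg" next to this file, and that demo.queue().launch() above is commented
# out so the script does not block before reaching this point.
#
#   from PIL import Image
#   print(generate_answer(Image.open("pet.jpg"), "What might cause this itching?"))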