"""
PetBull-7B-VL demo – CPU-only, 16 GB-friendly
--------------------------------------------

• Base model  : Qwen/Qwen2.5-VL-7B-Instruct
• LoRA adapter: ColdSlim/PetBull-7B   (master branch)

This script:
  ✓ loads in bfloat16 (≈50 % less RAM than FP32; same size as FP16)
  ✓ streams weights shard-by-shard to avoid peak memory spikes
  ✓ offloads large tensors to disk when RAM is tight
"""

import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel

# ---------------------------------------------------------------------
# 0 Env tweaks for Hugging Face Accelerate
# ---------------------------------------------------------------------
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"     # safer streaming

# ---------------------------------------------------------------------
# 1 Config
# ---------------------------------------------------------------------
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV  = "master"          # your model repo branch
OFFLOAD_DIR  = "offload"         # folder on disk for big tensors

device = "cpu"                   # force CPU
dtype  = torch.bfloat16          # same size as FP16, but numerically sturdier on CPU
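
# The offload folder is only touched if weights actually spill to disk;
# creating it up front just avoids a surprise at load time (optional):
os.makedirs(OFFLOAD_DIR, exist_ok=True)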

# ---------------------------------------------------------------------
# 2 Load processor (tiny)
# ---------------------------------------------------------------------
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

# ---------------------------------------------------------------------
# 3 Load base model with memory-savvy flags
# ---------------------------------------------------------------------
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,          # stream shards
    device_map={"": "cpu"},          # everything on CPU
    offload_folder=OFFLOAD_DIR,      # used only if any weights are mapped to disk
    trust_remote_code=True
)

# ---------------------------------------------------------------------
# 4 Attach LoRA
# ---------------------------------------------------------------------
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"}
).eval()
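
# Optional: merging the LoRA weights into the base model removes the adapter
# indirection and can speed up CPU inference slightly, at the cost of a brief
# memory spike while the merge runs. A sketch, assuming enough free RAM:
# model = model.merge_and_unload()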

# ---------------------------------------------------------------------
# 5 Inference helper
# ---------------------------------------------------------------------
def generate_answer(
    image: Image.Image | None,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,           # keep small for RAM headroom
) -> str:
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # Qwen2.5-VL expects the image placeholder tokens that its chat template
    # inserts; passing the raw question would make text/image counts mismatch.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs, max_new_tokens=max_tokens,
            do_sample=True,              # required for temperature/top_p to take effect
            temperature=temperature, top_p=top_p
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
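
# Quick sanity check before wiring up the UI (hypothetical file "cat.jpg";
# point it at any local image). Uncomment to try once:
# print(generate_answer(Image.open("cat.jpg"), "What breed is this cat?"))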

# ---------------------------------------------------------------------
# 6 Gradio UI
# ---------------------------------------------------------------------
with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
    gr.Markdown(
        "## 🐾 PetBull-7B-VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )

    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer  = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)

demo.queue().launch()
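
# On a headless box or inside a container you may need to bind all interfaces
# instead (deployment-specific; not needed for a local run):
# demo.queue().launch(server_name="0.0.0.0", server_port=7860)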