"""
PetBull-7B-VL demo – CPU-only, 16 GB-friendly
--------------------------------------------
• Base model : Qwen/Qwen2.5-VL-7B-Instruct
• LoRA adapter: ColdSlim/PetBull-7B (master branch)
This script:
✓ loads in bfloat16 (half the RAM of FP32, well supported on modern CPUs)
✓ streams weights to avoid peak memory spikes
✓ off-loads large tensors to disk when RAM is tight
"""
import os, torch, gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel
# ---------------------------------------------------------------------
# 0 Env tweaks for Hugging Face Accelerate
# ---------------------------------------------------------------------
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true" # safer streaming
# ---------------------------------------------------------------------
# 1 Config
# ---------------------------------------------------------------------
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV = "master" # your model repo branch
OFFLOAD_DIR = "offload" # folder on disk for big tensors
device = "cpu" # force CPU
dtype = torch.bfloat16 # same size as FP16, but more robust for CPU inference
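# Make sure the offload target exists before loading; harmless if Accelerate
# would create it anyway (a small defensive addition, not in the original script).
os.makedirs(OFFLOAD_DIR, exist_ok=True)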
# ---------------------------------------------------------------------
# 2 Load processor (tiny)
# ---------------------------------------------------------------------
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
# ---------------------------------------------------------------------
# 3 Load base model with memory-savvy flags
# ---------------------------------------------------------------------
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,       # stream shards instead of materialising everything at once
    device_map={"": "cpu"},       # everything on CPU
    offload_folder=OFFLOAD_DIR,   # mmap big tensors to disk
    trust_remote_code=True,
)
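# Optional sanity check (a minimal sketch, not part of the original demo):
# confirm the shards landed on the CPU in bfloat16 before attaching the adapter.
# n_params = sum(p.numel() for p in base.parameters())
# print(f"Base model: {n_params / 1e9:.1f}B params, dtype={next(base.parameters()).dtype}")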
# ---------------------------------------------------------------------
# 4 Attach LoRA
# ---------------------------------------------------------------------
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
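# Optional tweak (not in the original demo): if you never swap adapters at
# runtime, merging the LoRA weights into the base model removes the PEFT
# indirection and can speed up CPU inference slightly.
# model = model.merge_and_unload()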
# ---------------------------------------------------------------------
# 5 Inference helper
# ---------------------------------------------------------------------
def generate_answer(
    image: Image.Image | None,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,        # keep small for RAM headroom
) -> str:
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")
    # Qwen2.5-VL expects image placeholder tokens in the prompt, so build the
    # text through the processor's chat template instead of the raw question.
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": question}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),   # sliders deliver floats
            do_sample=True,                   # temperature/top_p only apply when sampling
            temperature=temperature,
            top_p=top_p,
        )
    # Strip the prompt tokens so only the newly generated answer is returned.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
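# Quick smoke test (optional sketch): run one generation from the command line
# before wiring up the UI. "sample_pet.jpg" is a placeholder path, not a file
# shipped with this Space.
# if __name__ == "__main__":
#     print(generate_answer(Image.open("sample_pet.jpg"), "My dog keeps scratching his ear."))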
# ---------------------------------------------------------------------
# 6 Gradio UI
# ---------------------------------------------------------------------
with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
    gr.Markdown(
        "## 🐾 PetBull-7B-VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask = gr.Button("Ask PetBull")
            temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()