"""
PetBull-7B-VL demo – CPU-only, 16 GB-friendly
--------------------------------------------
• Base model : Qwen/Qwen2.5-VL-7B-Instruct
• LoRA adapter: ColdSlim/PetBull-7B (master branch)
This script:
✓ loads in bfloat16 (saves ~25 % RAM vs FP16)
✓ streams weights to avoid peak memory spikes
✓ off-loads large tensors to disk when RAM is tight
"""
import os, torch, gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel
# ---------------------------------------------------------------------
# 0 Env tweaks for Hugging Face Accelerate
# ---------------------------------------------------------------------
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"  # safer streaming (a harmless no-op if this Accelerate version doesn't recognise the flag)
# ---------------------------------------------------------------------
# 1 Config
# ---------------------------------------------------------------------
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV = "master" # your model repo branch
OFFLOAD_DIR = "offload" # folder on disk for big tensors
device = "cpu" # force CPU
dtype = torch.bfloat16                  # same size as FP16, but numerically more robust on modern CPUs
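# Defensive: create the spill-over folder up front. Accelerate usually makes
# it on demand, so this line is cheap insurance, not a required step.
os.makedirs(OFFLOAD_DIR, exist_ok=True)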
# ---------------------------------------------------------------------
# 2 Load processor (tiny)
# ---------------------------------------------------------------------
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
# ---------------------------------------------------------------------
# 3 Load base model with memory-savvy flags
# ---------------------------------------------------------------------
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,        # stream shards instead of materialising everything at once
    device_map={"": "cpu"},        # everything on CPU
    offload_folder=OFFLOAD_DIR,    # spill-over location if tensors must go to disk
    trust_remote_code=True,
)
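# If 16 GB is still not enough, from_pretrained's max_memory kwarg can cap CPU
# usage and push the remainder to the offload folder. A hedged sketch only;
# the "12GiB" figure is an arbitrary example, not a tuned value:
# base = AutoModelForVision2Seq.from_pretrained(
#     BASE_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True,
#     device_map="auto", max_memory={"cpu": "12GiB"},
#     offload_folder=OFFLOAD_DIR, trust_remote_code=True,
# )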
# ---------------------------------------------------------------------
# 4 Attach LoRA
# ---------------------------------------------------------------------
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
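# Optional: PEFT's merge_and_unload() folds the LoRA deltas into the base
# weights, removing per-layer adapter overhead at inference time. Left
# commented out because the merge itself briefly needs extra RAM:
# model = model.merge_and_unload()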
# ---------------------------------------------------------------------
# 5 Inference helper
# ---------------------------------------------------------------------
def generate_answer(
    image: Image.Image | None,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,         # keep small for RAM headroom
) -> str:
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")
    # Qwen2.5-VL needs the image placeholder injected via its chat template;
    # passing the raw question alone would leave the image unreferenced.
    messages = [{
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": question}],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,        # without this, temperature/top_p are ignored
            temperature=temperature,
            top_p=top_p,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
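# Quick manual smoke test before launching the UI (assumption: "sample.jpg"
# is a hypothetical placeholder path, not a file shipped with this Space):
# print(generate_answer(Image.open("sample.jpg"), "What breed is this dog?"))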
# ---------------------------------------------------------------------
# 6 Gradio UI
# ---------------------------------------------------------------------
with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo:
    gr.Markdown(
        "## 🐾 PetBull-7B-VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask = gr.Button("Ask PetBull")
            temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)
demo.queue().launch()