""" PetBull-7B-VL demo – CPU-only, 16 GB-friendly -------------------------------------------- • Base model : Qwen/Qwen2.5-VL-7B-Instruct • LoRA adapter: ColdSlim/PetBull-7B (master branch) This script: ✓ loads in bfloat16 (saves ~25 % RAM vs FP16) ✓ streams weights to avoid peak memory spikes ✓ off-loads large tensors to disk when RAM is tight """ import os, torch, gradio as gr from PIL import Image from transformers import AutoProcessor, AutoModelForVision2Seq from peft import PeftModel # --------------------------------------------------------------------- # 0 Env tweaks for Hugging Face Accelerate # --------------------------------------------------------------------- os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true" # safer streaming # --------------------------------------------------------------------- # 1 Config # --------------------------------------------------------------------- BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" ADAPTER_REPO = "ColdSlim/PetBull-7B" ADAPTER_REV = "master" # your model repo branch OFFLOAD_DIR = "offload" # folder on disk for big tensors device = "cpu" # force CPU dtype = torch.bfloat16 # lighter than FP16 on modern CPUs # --------------------------------------------------------------------- # 2 Load processor (tiny) # --------------------------------------------------------------------- processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True) # --------------------------------------------------------------------- # 3 Load base model with memory-savvy flags # --------------------------------------------------------------------- base = AutoModelForVision2Seq.from_pretrained( BASE_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True, # stream shards device_map={"": "cpu"}, # everything on CPU offload_folder=OFFLOAD_DIR, # mmap big tensors to disk trust_remote_code=True ) # --------------------------------------------------------------------- # 4 Attach LoRA # --------------------------------------------------------------------- model = PeftModel.from_pretrained( base, ADAPTER_REPO, revision=ADAPTER_REV, device_map={"": "cpu"} ).eval() # --------------------------------------------------------------------- # 5 Inference helper # --------------------------------------------------------------------- def generate_answer( image: Image.Image | None, question: str, temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256, # keep small for RAM headroom ) -> str: if image is None: image = Image.new("RGB", (224, 224), color="white") inputs = processor(text=[question], images=[image], return_tensors="pt") with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p ) return processor.batch_decode(output_ids, skip_special_tokens=True)[0] # --------------------------------------------------------------------- # 6 Gradio UI # --------------------------------------------------------------------- with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo: gr.Markdown( "## 🐾 PetBull-7B-VL – Ask a Vet\n" "Upload a photo and/or type a question." 
    )
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()
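
# ---------------------------------------------------------------------
# 7 Optional: quick smoke test without the UI
# ---------------------------------------------------------------------
# A minimal sketch for sanity-checking the inference helper before
# serving the Gradio app. "sample.jpg" is a hypothetical local test
# image, not part of the repo. Uncomment and run with the
# demo.queue().launch() line above commented out.
#
# if __name__ == "__main__":
#     test_img = Image.open("sample.jpg")
#     print(generate_answer(test_img, "What breed is this dog?", max_tokens=64))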