""" PetBull‑7B‑VL demo – ZeroGPU‑ready """ import os import torch import spaces import gradio as gr from PIL import Image from transformers import AutoProcessor, AutoModelForCausalLM from peft import PeftModel import transformers, accelerate, numpy as np print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__) # 0) Safer streaming for model shards os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true" # 1) Config BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" ADAPTER_REPO = "ColdSlim/PetBull-7B" ADAPTER_REV = "master" OFFLOAD_DIR = "offload" DTYPE = torch.float16 # 2) Processor processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True) # 3) Load base model ON CPU (no AutoConfig; rely on remote code) base = AutoModelForCausalLM.from_pretrained( BASE_MODEL, torch_dtype=DTYPE, low_cpu_mem_usage=True, device_map={"": "cpu"}, offload_folder=OFFLOAD_DIR, trust_remote_code=True, ) # 4) Attach LoRA ON CPU model = PeftModel.from_pretrained( base, ADAPTER_REPO, revision=ADAPTER_REV, device_map={"": "cpu"}, ).eval() _model_on_gpu = False # track once-per-session transfer # 5) Inference (request GPU only for this function) @spaces.GPU(duration=120) def generate_answer( image, question: str, temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256, ) -> str: global _model_on_gpu if image is None: image = Image.new("RGB", (224, 224), color="white") # Move model to GPU once (inside GPU-decorated function) if not _model_on_gpu: model.to("cuda") _model_on_gpu = True # Prepare inputs on GPU inputs = processor(text=[question], images=[image], return_tensors="pt") inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()} with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, ) outputs = output_ids.to("cpu") return processor.batch_decode(outputs, skip_special_tokens=True)[0] # 6) UI with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo: gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.") with gr.Row(): with gr.Column(): img_in = gr.Image(type="pil", label="Pet photo (optional)") txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…") ask = gr.Button("Ask PetBull") temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature") topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p") max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens") with gr.Column(): answer = gr.Textbox(lines=12, label="Assistant", interactive=False) ask.click( generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer, ) demo.queue().launch()