""" PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API) """ import os import spaces import torch import gradio as gr from PIL import Image from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration from peft import PeftModel from qwen_vl_utils import process_vision_info # pip install qwen-vl-utils import transformers, accelerate, numpy as np print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__) os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true" # ---- Config ---- BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" ADAPTER_REPO = "ColdSlim/PetBull-7B" # your LoRA ADAPTER_REV = "master" OFFLOAD_DIR = "offload" DTYPE = torch.float16 # ---- Processor (no GPU) ---- processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True) # ---- Base model ON CPU (do NOT touch CUDA here) ---- base = Qwen2_5_VLForConditionalGeneration.from_pretrained( BASE_MODEL, torch_dtype=DTYPE, low_cpu_mem_usage=True, device_map={"": "cpu"}, offload_folder=OFFLOAD_DIR, trust_remote_code=True, ) # ---- Attach LoRA ON CPU ---- model = PeftModel.from_pretrained( base, ADAPTER_REPO, revision=ADAPTER_REV, device_map={"": "cpu"}, ).eval() _model_on_gpu = False # once-per-session move # ---- Inference on GPU (ZeroGPU pattern) ---- @spaces.GPU(duration=120) def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256): """ Uses Qwen2.5-VL chat template + qwen_vl_utils to prepare image+text, then generate. """ global _model_on_gpu if image is None: image = Image.new("RGB", (224, 224), color="white") if not _model_on_gpu: model.to("cuda") _model_on_gpu = True # Build chat messages in Qwen format messages = [{ "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": question or "Describe this image."}, ], }] # Processor helpers text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) image_inputs, video_inputs = process_vision_info(messages) # Pack tensors on GPU inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ) inputs = {k: (v.to("cuda") if hasattr(v, "to") else v) for k, v in inputs.items()} with torch.no_grad(): out = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, ) # Trim prompt tokens before decode (Qwen style) trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)] return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] # ---- UI ---- with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo: gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.") with gr.Row(): with gr.Column(): img_in = gr.Image(type="pil", label="Pet photo (optional)") txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…") ask = gr.Button("Ask PetBull") temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature") topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p") max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens") with gr.Column(): answer = gr.Textbox(lines=12, label="Assistant", interactive=False) ask.click(generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer) demo.queue().launch(show_api=False, share=True)