Spaces: Running on Zero
File size: 3,757 Bytes
"""
PetBull‑7B‑VL demo – ZeroGPU‑ready (Qwen2.5‑VL API)
"""
import os
import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
from qwen_vl_utils import process_vision_info # pip install qwen-vl-utils
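# Log library versions at startup so build/runtime mismatches are easy to spot in the Space logs.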
import transformers, accelerate, numpy as np
print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
# ---- Config ----
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B" # your LoRA
ADAPTER_REV = "master"
OFFLOAD_DIR = "offload"
DTYPE = torch.float16
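# float16 halves the memory footprint of the 7B weights; bfloat16 is a reasonable
# alternative on GPUs that support it.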
# ---- Processor (no GPU) ----
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
# ---- Base model ON CPU (do NOT touch CUDA here) ----
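# ZeroGPU Spaces start without a GPU attached: CUDA only becomes available inside
# functions decorated with @spaces.GPU, so everything below must load on the CPU.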
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
BASE_MODEL,
torch_dtype=DTYPE,
low_cpu_mem_usage=True,
device_map={"": "cpu"},
offload_folder=OFFLOAD_DIR,
trust_remote_code=True,
)
# ---- Attach LoRA ON CPU ----
model = PeftModel.from_pretrained(
base,
ADAPTER_REPO,
revision=ADAPTER_REV,
device_map={"": "cpu"},
).eval()
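# Note: the LoRA weights stay separate here; model.merge_and_unload() could fold them
# into the base weights (assuming the adapter targets standard linear layers) for
# slightly faster generation, at the cost of a one-off merge on load.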
_model_on_gpu = False # once-per-session move
# ---- Inference on GPU (ZeroGPU pattern) ----
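# @spaces.GPU reserves a GPU only while the decorated function runs; duration=120
# requests a window of roughly two minutes per call.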
@spaces.GPU(duration=120)
def generate_answer(image, question, temperature=0.7, top_p=0.95, max_tokens=256):
"""
Uses Qwen2.5-VL chat template + qwen_vl_utils to prepare image+text, then generate.
"""
global _model_on_gpu
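    # Text-only questions are allowed: substitute a blank image so the vision inputs stay valid.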
if image is None:
image = Image.new("RGB", (224, 224), color="white")
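    # Move the adapter-wrapped model onto the GPU the first time a request arrives;
    # later calls in the same process skip the copy.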
if not _model_on_gpu:
model.to("cuda")
_model_on_gpu = True
# Build chat messages in Qwen format
messages = [{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": question or "Describe this image."},
],
}]
# Processor helpers
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
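    # process_vision_info extracts the PIL images (and any video frames) from the
    # messages list in the layout the processor expects.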
image_inputs, video_inputs = process_vision_info(messages)
# Pack tensors on GPU
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = {k: (v.to("cuda") if hasattr(v, "to") else v) for k, v in inputs.items()}
with torch.no_grad():
out = model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
)
# Trim prompt tokens before decode (Qwen style)
trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
return processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
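# Quick sanity check (hypothetical, not part of the Space's normal flow; needs a CUDA machine):
#     from PIL import Image
#     print(generate_answer(Image.open("dog.jpg"), "Does this ear look infected?"))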
# ---- UI ----
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU, Qwen2.5‑VL)") as demo:
gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
with gr.Row():
with gr.Column():
img_in = gr.Image(type="pil", label="Pet photo (optional)")
txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
ask = gr.Button("Ask PetBull")
temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
with gr.Column():
answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
ask.click(generate_answer, inputs=[img_in, txt_in, temp, topp, max_tok], outputs=answer)
demo.queue().launch(show_api=False)
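# Assumed Space dependencies (illustrative; this file does not pin versions):
#   torch, transformers (recent enough to ship Qwen2_5_VLForConditionalGeneration),
#   accelerate, peft, qwen-vl-utils, gradio, spaces, pillow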