"""
PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel
import transformers, accelerate, numpy as np
print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
# 0) Safer streaming for model shards
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
# 1) Config
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV = "master"
OFFLOAD_DIR = "offload"
DTYPE = torch.float16
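# float16 halves memory versus float32; OFFLOAD_DIR gives accelerate a place to
# spill shards that do not fit in CPU RAM while loading.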
# 2) Processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
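# The processor bundles the tokenizer and the image preprocessor, so a single call
# below builds the combined text+vision inputs for the model.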
# 3) Load base model ON CPU (Qwen2.5-VL is a vision-language model, so use its
#    dedicated class rather than AutoModelForCausalLM, which does not cover it)
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)
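# Weights stay on the CPU at startup: ZeroGPU Spaces only attach a GPU while a
# @spaces.GPU-decorated function is running, so nothing should touch CUDA here.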
# 4) Attach LoRA ON CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
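# The LoRA weights remain in a separate PEFT wrapper; merge_and_unload() could
# optionally fold them into the base model to remove the adapter overhead at inference.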
_model_on_gpu = False # track once-per-session transfer
# 5) Inference (request GPU only for this function)
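# The decorator asks ZeroGPU for a device only while this call runs; duration=120
# allows up to roughly 120 seconds of GPU time per request.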
@spaces.GPU(duration=120)
def generate_answer(
    image,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,
) -> str:
    global _model_on_gpu
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # Move model to GPU once (inside GPU-decorated function)
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True

    # Build a chat-formatted prompt so the processor inserts the vision placeholder
    # tokens the model expects alongside the question text
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Prepare inputs on GPU
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,  # required for temperature / top_p to take effect
            temperature=temperature,
            top_p=top_p,
        )

    # Drop the prompt tokens so only the newly generated answer is decoded
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:].to("cpu")
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
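# Hypothetical smoke test (not part of the UI): generate_answer(None, "Is pumpkin safe for dogs?")
# exercises the full pipeline, assuming a GPU has been granted by the ZeroGPU runtime.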
# 6) UI
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask = gr.Button("Ask PetBull")
            temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()
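# queue() makes concurrent requests wait their turn for the GPU slice; for local
# debugging outside Spaces, launch(share=True) would expose a temporary public URL.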