"""
PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces # <-- NEW: import spaces for ZeroGPU
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel
# 0. Environment tweaks for Accelerate (unchanged)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
# 1. Config
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV = "master"
OFFLOAD_DIR = "offload"
dtype = torch.float16 # <-- use float16 for GPU
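# float16 halves memory vs. float32; bfloat16 would also work on ZeroGPU's
# hardware and is more robust to overflow, if preferred.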
# 2. Load processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
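# The processor bundles the tokenizer and the image preprocessor, so a single
# call can handle both the text prompt and the uploaded photo.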
# 3. Load base model on CPU; stream shards to save RAM
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)
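# device_map={"": "cpu"} pins every module to CPU at startup: ZeroGPU Spaces
# boot without a GPU, so CUDA only becomes available inside @spaces.GPU functions.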
# 4. Attach LoRA adapter on CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()
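# Optional: if the adapter weights never change at runtime, PEFT's
# `model.merge_and_unload()` would fold the LoRA weights into the base model
# and remove the adapter overhead at inference time.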
# Keep track of whether the model has been moved to GPU
_model_on_gpu = False
# 5. Inference helper – run on GPU when called
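# On ZeroGPU hardware a GPU is attached only while a @spaces.GPU-decorated
# function runs; a longer window can be requested with e.g. @spaces.GPU(duration=120).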
@spaces.GPU  # <-- NEW: request GPU for this function
def generate_answer(image, question: str,
                    temperature: float = 0.7,
                    top_p: float = 0.95,
                    max_tokens: int = 256):
    global _model_on_gpu
    # provide a placeholder image if none was uploaded
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")
    # move model to GPU once
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True
    # Qwen2.5-VL expects image placeholder tokens in the prompt, so build the
    # text with the processor's chat template instead of passing the raw question
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # prepare inputs on GPU
    inputs = processor(text=[prompt], images=[image],
                       return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(**inputs,
                                    max_new_tokens=max_tokens,
                                    do_sample=True,  # required for temperature/top_p to apply
                                    temperature=temperature,
                                    top_p=top_p)
    # drop the prompt tokens from the output, then decode on CPU
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:].to("cpu")
    return processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
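# Example call (hypothetical file path), useful as a local smoke test:
#   print(generate_answer(Image.open("dog.jpg"), "Why is my dog scratching?"))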
# 6. Gradio UI (unchanged except for title)
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown(
        "## PetBull‑7B‑VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask = gr.Button("Ask PetBull")
            temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)
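# queue() processes requests sequentially, so concurrent users never hit the
# shared model instance at the same time, which also plays well with ZeroGPU's
# per-call GPU allocation.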
demo.queue().launch()