"""
PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces # <-- NEW: import spaces for ZeroGPU
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoConfig
from peft import PeftModel
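# Print library versions at startup so dependency mismatches are easy to spot in the Space logs.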
import transformers, accelerate, numpy as np  # torch is already imported above
print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)
# 0. Environment tweaks for Accelerate (unchanged)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"
# 1. Config
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV = "master"
OFFLOAD_DIR = "offload"
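# Weights that don't fit in CPU RAM are spilled to this folder on disk by Accelerate.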
dtype = torch.float16 # <-- use float16 for GPU
# 2. Load processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
cfg = AutoConfig.from_pretrained(BASE_MODEL, trust_remote_code=True)
# 3. Load base model on CPU; stream shards to save RAM
base = AutoModelForVision2Seq.from_pretrained(
BASE_MODEL,
config=cfg,
torch_dtype=dtype,
low_cpu_mem_usage=True,
device_map={"": "cpu"},
offload_folder=OFFLOAD_DIR,
trust_remote_code=True,
)
# 4. Attach LoRA adapter on CPU
model = PeftModel.from_pretrained(
base,
ADAPTER_REPO,
revision=ADAPTER_REV,
device_map={"": "cpu"},
).eval()
# Keep track of whether the model has been moved to GPU
_model_on_gpu = False
# 5. Inference helper – run on GPU when called
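# ZeroGPU only attaches a GPU while a @spaces.GPU-decorated function is running,
# so the weights are moved to CUDA lazily on the first request (tracked by the flag above).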
@spaces.GPU  # <-- NEW: request GPU for this function
def generate_answer(image, question: str,
                    temperature: float = 0.7,
                    top_p: float = 0.95,
                    max_tokens: int = 256):
    global _model_on_gpu
    # provide a placeholder image if none was uploaded
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")
    # move model to GPU once
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True
    # prepare inputs on GPU
    inputs = processor(text=[question], images=[image],
                       return_tensors="pt").to("cuda")
    with torch.no_grad():
        output_ids = model.generate(**inputs,
                                    max_new_tokens=max_tokens,
                                    temperature=temperature,
                                    top_p=top_p)
    # decode on CPU
    outputs = output_ids.to("cpu")
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]
# 6. Gradio UI (unchanged except for title)
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown(
        "## PetBull‑7B‑VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask = gr.Button("Ask PetBull")
            temp = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)
    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)
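# Queue requests so the GPU-backed handler processes them one at a time (Gradio's default concurrency limit is 1).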
demo.queue().launch()