"""
    PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces                # <-- NEW: import spaces for ZeroGPU
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel

# 0. Environment tweaks for Accelerate (unchanged)
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

# 1. Config
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV  = "master"
OFFLOAD_DIR  = "offload"

dtype = torch.float16        # <-- use float16 for GPU
# 2. Load processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

# 3. Load base model on CPU; stream shards to save RAM
base = AutoModelForVision2Seq.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)

# 4. Attach LoRA adapter on CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()

# Keep track of whether the model has been moved to GPU
_model_on_gpu = False

# 5. Inference helper – run on GPU when called
@spaces.GPU                   # <-- NEW: request GPU for this function:contentReference[oaicite:3]{index=3}
def generate_answer(image, question: str,
                    temperature: float = 0.7,
                    top_p: float = 0.95,
                    max_tokens: int = 256):
    global _model_on_gpu
    # provide a placeholder image if none was uploaded
    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # move model to GPU once
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True

    # prepare inputs on GPU
    inputs = processor(text=[question], images=[image],
                       return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(**inputs,
                                    max_new_tokens=max_tokens,
                                    temperature=temperature,
                                    top_p=top_p)
    # decode on CPU
    outputs = output_ids.to("cpu")
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

# 6. Gradio UI (unchanged except for title)
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown(
        "## PetBull‑7B‑VL – Ask a Vet\n"
        "Upload a photo and/or type a question."
    )
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer  = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(generate_answer,
              inputs=[img_in, txt_in, temp, topp, max_tok],
              outputs=answer)

demo.queue().launch()