"""
    PetBull‑7B‑VL demo – ZeroGPU‑ready
"""
import os
import torch
import spaces
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from peft import PeftModel
import transformers, accelerate, numpy as np

print("VERSIONS:", transformers.__version__, accelerate.__version__, torch.__version__, np.__version__)

# 0) Safer streaming for model shards
os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true"

# 1) Config
BASE_MODEL   = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_REPO = "ColdSlim/PetBull-7B"
ADAPTER_REV  = "master"
OFFLOAD_DIR  = "offload"
DTYPE        = torch.float16

# 2) Processor
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

# 3) Load base model ON CPU (no AutoConfig; rely on remote code)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    offload_folder=OFFLOAD_DIR,
    trust_remote_code=True,
)

# 4) Attach LoRA ON CPU
model = PeftModel.from_pretrained(
    base,
    ADAPTER_REPO,
    revision=ADAPTER_REV,
    device_map={"": "cpu"},
).eval()

_model_on_gpu = False  # track once-per-session transfer

# 5) Inference (request GPU only for this function)
@spaces.GPU(duration=120)
def generate_answer(
    image,
    question: str,
    temperature: float = 0.7,
    top_p: float = 0.95,
    max_tokens: int = 256,
) -> str:
    global _model_on_gpu

    if image is None:
        image = Image.new("RGB", (224, 224), color="white")

    # Move model to GPU once (inside GPU-decorated function)
    if not _model_on_gpu:
        model.to("cuda")
        _model_on_gpu = True

    # Prepare inputs on GPU
    inputs = processor(text=[question], images=[image], return_tensors="pt")
    inputs = {k: v.to("cuda") if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

    outputs = output_ids.to("cpu")
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

# 6) UI
with gr.Blocks(title="PetBull‑7B‑VL (ZeroGPU)") as demo:
    gr.Markdown("## PetBull‑7B‑VL – Ask a Vet\nUpload a photo and/or type a question.")
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top‑p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()