""" PetBull-7B-VL demo – CPU-only, 16 GB-friendly -------------------------------------------- • Base model : Qwen/Qwen2.5-VL-7B-Instruct • LoRA adapter: ColdSlim/PetBull-7B (master branch) This script: ✓ loads in bfloat16 (saves ~25 % RAM vs FP16) ✓ streams weights to avoid peak memory spikes ✓ off-loads large tensors to disk when RAM is tight """ import os, torch, gradio as gr from PIL import Image from transformers import AutoProcessor, AutoModelForVision2Seq from peft import PeftModel # --------------------------------------------------------------------- # 0 Env tweaks for Hugging Face Accelerate # --------------------------------------------------------------------- os.environ["ACCELERATE_USE_SLOW_RETRIEVAL"] = "true" # safer streaming # --------------------------------------------------------------------- # 1 Config # --------------------------------------------------------------------- BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" ADAPTER_REPO = "ColdSlim/PetBull-7B" ADAPTER_REV = "master" # your model repo branch OFFLOAD_DIR = "offload" # folder on disk for big tensors device = "cpu" # force CPU dtype = torch.bfloat16 # lighter than FP16 on modern CPUs # --------------------------------------------------------------------- # 2 Load processor (tiny) # --------------------------------------------------------------------- processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True) # --------------------------------------------------------------------- # 3 Load base model with memory-savvy flags # --------------------------------------------------------------------- base = AutoModelForVision2Seq.from_pretrained( BASE_MODEL, torch_dtype=dtype, low_cpu_mem_usage=True, # stream shards device_map={"": "cpu"}, # everything on CPU offload_folder=OFFLOAD_DIR, # mmap big tensors to disk trust_remote_code=True ) # --------------------------------------------------------------------- # 4 Attach LoRA # --------------------------------------------------------------------- model = PeftModel.from_pretrained( base, ADAPTER_REPO, revision=ADAPTER_REV, device_map={"": "cpu"} ).eval() # --------------------------------------------------------------------- # 5 Inference helper # --------------------------------------------------------------------- def generate_answer( image: Image.Image | None, question: str, temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256, # keep small for RAM headroom ) -> str: if image is None: image = Image.new("RGB", (224, 224), color="white") inputs = processor(text=[question], images=[image], return_tensors="pt") with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p ) return processor.batch_decode(output_ids, skip_special_tokens=True)[0] # --------------------------------------------------------------------- # 6 Gradio UI # --------------------------------------------------------------------- with gr.Blocks(title="PetBull-7B-VL (CPU)") as demo: gr.Markdown( "## 🐾 PetBull-7B-VL – Ask a Vet\n" "Upload a photo and/or type a question." 
    )
    with gr.Row():
        with gr.Column():
            img_in  = gr.Image(type="pil", label="Pet photo (optional)")
            txt_in  = gr.Textbox(lines=3, placeholder="Describe the issue…")
            ask     = gr.Button("Ask PetBull")
            temp    = gr.Slider(0.1, 1.5, 0.7, label="Temperature")
            topp    = gr.Slider(0.1, 1.0, 0.95, label="Top-p")
            max_tok = gr.Slider(32, 512, 256, step=8, label="Max tokens")
        with gr.Column():
            answer = gr.Textbox(lines=12, label="Assistant", interactive=False)

    ask.click(
        generate_answer,
        inputs=[img_in, txt_in, temp, topp, max_tok],
        outputs=answer,
    )

demo.queue().launch()
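
# ---------------------------------------------------------------------
# 7 Optional: quick smoke test without the UI
# ---------------------------------------------------------------------
# A minimal sketch for sanity-checking the inference helper before
# serving the Gradio app. "sample.jpg" is a hypothetical local test
# image, not part of the repo. Uncomment and run with the
# demo.queue().launch() line above commented out.
#
# if __name__ == "__main__":
#     test_img = Image.open("sample.jpg")
#     print(generate_answer(test_img, "What breed is this dog?", max_tokens=64))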