"""Gradio demo: analyze a floor-plan image with the FloorPlanVisionAI adaptor.

Loads a 4-bit quantized vision-language model once at import time, then serves
a single-image -> text description interface.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import TextStreamer
from unsloth import FastVisionModel

# Load the pre-trained model and tokenizer once at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = FastVisionModel.from_pretrained(
    "sabaridsnfuji/FloorPlanVisionAIAdaptor",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)
FastVisionModel.for_inference(model)
# NOTE(review): moving a load_in_4bit (bitsandbytes) model with .to() is
# usually a no-op or discouraged — confirm this is needed on multi-GPU hosts.
model.to(device)

# System instruction prepended to every request.
instruction = """You are an expert in architecture and interior design. Analyze the floor plan image and describe accurately the key features, room count, layout, and any other important details you observe."""


def analyze_floorplan(image):
    """Describe a floor-plan image using the vision-language model.

    Args:
        image: A PIL image from the Gradio input, or None if nothing was
            uploaded.

    Returns:
        The model's textual analysis, or an error message when no image is
        provided.
    """
    if image is None:
        return "Invalid image provided. Please upload a valid image."

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ]}
    ]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Streams new tokens to stdout as they are generated (console feedback);
    # the Gradio response is the decoded full result below.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # NOTE: temperature/min_p only take effect if the model's generation
    # config enables sampling (do_sample=True); otherwise transformers
    # ignores them with a warning.
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=1024,
            use_cache=True,
            temperature=1.0,
            min_p=0.1,
        )

    # Fix: decode only the newly generated tokens — decoding output[0]
    # directly would echo the prompt/instruction back into the UI.
    prompt_len = inputs["input_ids"].shape[-1]
    result = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return result


# Gradio interface wiring.
iface = gr.Interface(
    fn=analyze_floorplan,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="FloorPlan Vision AI",
    description="Upload a floor plan image to get a detailed architectural and interior analysis.",
)

if __name__ == "__main__":
    iface.launch()