"""Gradio demo: analyze a floor-plan image with the FloorPlanVisionAI adaptor.

Loads a 4-bit quantized vision-language model once at import time, then serves
a single-image -> text description interface.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import TextStreamer
from unsloth import FastVisionModel

# Load the pre-trained model and tokenizer once at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = FastVisionModel.from_pretrained(
    "sabaridsnfuji/FloorPlanVisionAIAdaptor",
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)
FastVisionModel.for_inference(model)
# NOTE(review): moving a load_in_4bit (bitsandbytes) model with .to() is
# usually a no-op or discouraged — confirm this is needed on multi-GPU hosts.
model.to(device)

# System instruction prepended to every request.
instruction = """You are an expert in architecture and interior design. Analyze the floor plan image and describe accurately the key features, room count, layout, and any other important details you observe."""


def analyze_floorplan(image):
    """Describe a floor-plan image using the vision-language model.

    Args:
        image: A PIL image from the Gradio input, or None if nothing was
            uploaded.

    Returns:
        The model's textual analysis, or an error message when no image is
        provided.
    """
    if image is None:
        return "Invalid image provided. Please upload a valid image."

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ]}
    ]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)

    # Streams new tokens to stdout as they are generated (console feedback);
    # the Gradio response is the decoded full result below.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # NOTE: temperature/min_p only take effect if the model's generation
    # config enables sampling (do_sample=True); otherwise transformers
    # ignores them with a warning.
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=1024,
            use_cache=True,
            temperature=1.0,
            min_p=0.1,
        )

    # Fix: decode only the newly generated tokens — decoding output[0]
    # directly would echo the prompt/instruction back into the UI.
    prompt_len = inputs["input_ids"].shape[-1]
    result = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return result


# Gradio interface wiring.
iface = gr.Interface(
    fn=analyze_floorplan,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="FloorPlan Vision AI",
    description="Upload a floor plan image to get a detailed architectural and interior analysis.",
)

if __name__ == "__main__":
    iface.launch()