# UVIS - Gradio App with Upload, URL & Video Support
"""
This script launches UVIS (Unified Visual Intelligence System) as a Gradio web app.
It supports image, video, and URL-based media inputs for detection, segmentation,
and depth estimation. Outputs include a scene blueprint, structured JSON, and
downloadable results.
"""
import time
import logging

import gradio as gr
from PIL import Image
import cv2
import timeout_decorator
import spaces
from huggingface_hub import hf_hub_download

from registry import get_model
from core.describe_scene import describe_scene
from core.process import process_image
from core.input_handler import resolve_input, validate_video, validate_image
from utils.helpers import format_error, generate_session_id

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
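
# `log_runtime` is called at the end of `handle` but is neither defined nor
# imported in the original script; the minimal sketch below is an assumption
# about its intended behavior (logging elapsed wall-clock time).
def log_runtime(start_time):
    """Log elapsed wall-clock time since `start_time` (assumed helper)."""
    elapsed = time.time() - start_time
    logger.info(f"Total runtime: {elapsed:.2f} seconds")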
# Model mappings
DETECTION_MODEL_MAP = {
    "YOLOv5-Nano": "yolov5n-seg",
    "YOLOv5-Small": "yolov5s-seg",
    "YOLOv8-Small": "yolov8s",
    "YOLOv8-Large": "yolov8l",
    "RT-DETR": "rtdetr"  # Reserved for future support
}

SEGMENTATION_MODEL_MAP = {
    "SegFormer-B0": "nvidia/segformer-b0-finetuned-ade-512-512",
    "SegFormer-B5": "nvidia/segformer-b5-finetuned-ade-512-512",
    "DeepLabV3-ResNet50": "deeplabv3_resnet50"
}

DEPTH_MODEL_MAP = {
    "MiDaS v21 Small 256": "midas_v21_small_256",
    "MiDaS v21 384": "midas_v21_384",
    "DPT Hybrid 384": "dpt_hybrid_384",
    "DPT Swin2 Large 384": "dpt_swin2_large_384",
    "DPT BEiT Large 512": "dpt_beit_large_512"
}
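
# The friendly names above map to registry identifiers. Based on the warm-up
# calls in `preload_models` below, the assumed lookup pattern is:
#   model = get_model("detection", DETECTION_MODEL_MAP["YOLOv5-Nano"], device="cpu")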
# Resource limits
MAX_IMAGE_MB = 5
MAX_IMAGE_RES = (1920, 1080)
MAX_VIDEO_MB = 50
MAX_VIDEO_DURATION = 30  # seconds
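
# These limits are presumably enforced by `validate_image` / `validate_video`
# in core.input_handler (not shown here); a hypothetical image-size check
# against them might look like:
#   w, h = image.size
#   ok = w <= MAX_IMAGE_RES[0] and h <= MAX_IMAGE_RES[1]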
@spaces.GPU
def preload_models():
    """
    Warm-up hook for ZeroGPU. At least one function must be decorated with
    @spaces.GPU for ZeroGPU to activate; this one loads the default models
    into memory.
    """
    logger.info("Warming up models for ZeroGPU...")
    get_model("detection", "yolov5n-seg", device="cpu")
    get_model("segmentation", "deeplabv3_resnet50", device="cpu")
    get_model("depth", "midas_v21_small_256", device="cpu")
# Main handler
def handle(mode, media_upload, url, run_det, det_model, det_confidence,
           run_seg, seg_model, run_depth, depth_model, blend):
    """
    Master handler that resolves the input and runs the requested analyses.
    Returns the outputs for the Gradio interface. Since the interface exposes
    a single set of outputs, the first valid media item is processed and returned.
    """
    session_id = generate_session_id()
    logger.info(f"Session ID: {session_id} | Handler activated with mode: {mode}")
    start_time = time.time()

    media = resolve_input(mode, media_upload, url)
    if not media:
        return None, format_error("No valid input provided. Please check your upload or URL."), None

    for single_media in media:
        if isinstance(single_media, str):  # Video file: validate, then use the first frame
            valid, err = validate_video(single_media)
            if not valid:
                return None, format_error(err), None
            cap = cv2.VideoCapture(single_media)
            ret, frame = cap.read()
            cap.release()
            if not ret:
                return None, format_error("Failed to read video frame."), None
            single_media = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if isinstance(single_media, Image.Image):
            valid, err = validate_image(single_media)
            if not valid:
                return None, format_error(err), None
            try:
                return process_image(single_media, run_det, det_model, det_confidence,
                                     run_seg, seg_model, run_depth, depth_model, blend)
            except timeout_decorator.TimeoutError:
                logger.error("Image processing timed out.")
                return None, format_error("Processing timed out. Try a smaller image or simpler model."), None

    logger.warning("Unsupported media type resolved.")
    log_runtime(start_time)
    return None, format_error("Invalid input. Please check your upload or URL."), None
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Unified Visual Intelligence System (UVIS)")

    with gr.Row():
        # Left panel
        with gr.Column(scale=2):
            # Input mode toggle
            mode = gr.Radio(["Upload", "URL"], value="Upload", label="Input Mode")

            # File upload: accepts multiple images or one video (the first valid item is processed)
            media_upload = gr.File(
                label="Upload Images (1–5) or 1 Video",
                file_types=["image", ".mp4", ".mov", ".avi"],
                file_count="multiple"
            )

            # URL input
            url = gr.Textbox(label="URL (Image/Video)", visible=False)

            # Toggle input visibility based on the selected mode
            def toggle_inputs(selected_mode):
                return [
                    gr.update(visible=(selected_mode == "Upload")),  # media_upload
                    gr.update(visible=(selected_mode == "URL"))      # url
                ]

            mode.change(toggle_inputs, inputs=mode, outputs=[media_upload, url])
            # Visibility logic for the per-task settings panels
            def toggle_visibility(checked):
                return gr.update(visible=checked)
            run_det = gr.Checkbox(label="Object Detection")
            run_seg = gr.Checkbox(label="Semantic Segmentation")
            run_depth = gr.Checkbox(label="Depth Estimation")

            with gr.Row():
                with gr.Column(visible=False) as OD_Settings:
                    with gr.Accordion("Object Detection Settings", open=True):
                        det_model = gr.Dropdown(choices=list(DETECTION_MODEL_MAP), label="Detection Model")
                        det_confidence = gr.Slider(0.1, 1.0, 0.5, label="Detection Confidence Threshold")

                with gr.Column(visible=False) as SS_Settings:
                    with gr.Accordion("Semantic Segmentation Settings", open=True):
                        seg_model = gr.Dropdown(choices=list(SEGMENTATION_MODEL_MAP), label="Segmentation Model")

                with gr.Column(visible=False) as DE_Settings:
                    with gr.Accordion("Depth Estimation Settings", open=True):
                        depth_model = gr.Dropdown(choices=list(DEPTH_MODEL_MAP), label="Depth Model")

            # Attach visibility logic
            run_det.change(fn=toggle_visibility, inputs=[run_det], outputs=[OD_Settings])
            run_seg.change(fn=toggle_visibility, inputs=[run_seg], outputs=[SS_Settings])
            run_depth.change(fn=toggle_visibility, inputs=[run_depth], outputs=[DE_Settings])

            blend = gr.Slider(0.0, 1.0, 0.5, label="Overlay Blend")

            # Run button
            run = gr.Button("Run Analysis")
        # Right panel
        with gr.Column(scale=1):
            # single_img_preview = gr.Image(label="Preview (Image)", visible=False)
            # gallery_preview = gr.Gallery(label="Preview (Gallery)", columns=3, height="auto", visible=False)
            # video_preview = gr.Video(label="Preview (Video)", visible=False)
            img_out = gr.Image(label="Scene Blueprint")
            json_out = gr.JSON(label="Scene JSON")
            zip_out = gr.File(label="Download Results")

            # Output tabs (alternative layout, currently disabled):
            # with gr.Tab("Scene JSON"):
            #     json_out = gr.JSON()
            # with gr.Tab("Scene Blueprint"):
            #     img_out = gr.Image()
            # with gr.Tab("Download"):
            #     zip_out = gr.File()
    # Button click event
    run.click(
        handle,
        inputs=[mode, media_upload, url, run_det, det_model, det_confidence,
                run_seg, seg_model, run_depth, depth_model, blend],
        outputs=[img_out, json_out, zip_out]
    )

    # Footer section
    gr.Markdown("---")
    gr.Markdown(
        """
        <div style='text-align: center; font-size: 14px;'>
        Built by <b>Durga Deepak Valluri</b><br>
        <a href="https://github.com/DurgaDeepakValluri" target="_blank">GitHub</a> |
        <a href="https://deecoded.io" target="_blank">Website</a> |
        <a href="https://www.linkedin.com/in/durga-deepak-valluri" target="_blank">LinkedIn</a>
        </div>
        """
    )

# Launch the Gradio app
demo.launch()
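
# Note: Spaces executes this script directly, so no `__main__` guard is needed.
# If requests should be serialized (e.g. to limit concurrent GPU work), a common
# alternative is `demo.queue().launch()` (assumption; not in the original script).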