# Page-header residue captured with the source (not part of the program):
# "Spaces: Sleeping"
import gradio as gr | |
import pandas as pd | |
# Leaderboard schema. Every row of STATIC_DATA lists its cells in this order.
COLUMNS = [
    "Method",
    "#Param.",
    "Input Type",
    "Control Type",
    "Model Type",
    "Mean Traj. β",
    "Acc. β",
]

# Hard-coded leaderboard rows, grouped below by their "Model Type" cell.
# String cells such as "β" and "XXX" are placeholders for values that are not
# available (see the notes in the About tab); the last two columns are
# otherwise numeric.
STATIC_DATA = [
    # VLM
    ["w/o WM", "72B", "RGB", "β", "VLM", 6.24, 50.27],
    # Image Gen.
    ["PathDreamer [36]", "0.69B", "RGB-D; Sem; Pano", "Viewpoint", "Image Gen.", 5.28, 56.99],
    ["SE3DS [11]", "1.1B", "RGB-D; Pano", "Viewpoint", "Image Gen.", 5.29, 57.53],
    # Video Gen.
    ["NWM [25]", "1B", "RGB", "Trajectory", "Video Gen.", 5.68, 57.35],
    ["SVD [6]", "1.5B", "RGB", "Image", "Video Gen.", 5.29, 57.71],
    ["LTX-Video [5]", "2B", "RGB", "Text", "Video Gen.", 5.37, 56.08],
    ["Hunyuan [4]", "13B", "RGB", "Text", "Video Gen.", 5.21, 57.71],
    ["Wan2.1 [23]", "14B", "RGB", "Text", "Video Gen.", 5.24, 58.26],
    ["Cosmos [1]", "2B", "RGB", "Text", "Video Gen.", 5.898, 52.27],
    ["Runway", "β", "β", "Text", "Video Gen.", "β", "β"],
    # Video Gen. Post-Train
    ["SVDβ [6]", "1.5B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.02, 60.98],
    ["LTXβ [5]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.49, 57.53],
    ["WAN2.1β [23]", "14B", "RGB; Pano", "Action", "Video Gen. Post-Train", "XXX", "XXX"],
    ["Cosmosβ [1]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.08, 60.25],
]
def create_leaderboard(data=None, columns=None, sort_column="Acc. β"):
    """Build the leaderboard table ordered by best score first.

    Args:
        data: Row-major table contents (one list per row). Defaults to the
            module-level ``STATIC_DATA``.
        columns: Column names matching each row. Defaults to the module-level
            ``COLUMNS``.
        sort_column: Name of the column to rank by, highest value first.

    Returns:
        A new ``pandas.DataFrame`` containing the original cell values
        (placeholder strings included), ordered by the descending numeric
        value of ``sort_column`` and re-indexed from 0. Rows whose sort cell
        is not numeric are placed last; ties and non-numeric rows keep their
        input order.

    Raises:
        KeyError: If ``sort_column`` is not one of ``columns``.
    """
    # Resolve defaults at call time so callers may supply their own table.
    if data is None:
        data = STATIC_DATA
    if columns is None:
        columns = COLUMNS

    table = pd.DataFrame(data, columns=columns)

    # Placeholder strings become NaN; only this temporary Series is coerced,
    # so the returned table still shows the original cell contents.
    scores = pd.to_numeric(table[sort_column], errors="coerce")

    # na_position="last" sinks non-numeric rows without a magic sentinel value
    # (a sentinel such as -1 would outrank genuinely lower scores), and the
    # stable sort keeps tied rows in their input order.
    ranked = scores.sort_values(ascending=False, na_position="last", kind="stable")

    # The sorted index holds labels, so select by label (.loc), not position.
    return table.loc[ranked.index].reset_index(drop=True)
# Top-level UI definition: a Gradio Blocks app with three tabs (interactive
# demo, leaderboard, about). Everything shown is static content.
# NOTE(review): the nesting below was reconstructed from the statement order
# because the original indentation was not preserved; in particular, whether
# gr.Model3D sits inside the inner gr.Row or beneath it should be confirmed
# against the deployed layout.
with gr.Blocks(title="World-in-World: Building a Closed-Loop World Interface to Evaluate World Models", theme=gr.themes.Soft()) as demo:
    # Page heading rendered above the tabs.
    gr.HTML("<h1 style='text-align: center; margin-bottom: 1rem'>π World-in-World: Building a Closed-Loop World Interface to Evaluate World Models</h1>")
    with gr.Tabs():
        with gr.TabItem("π§βπ« Interactive Demo"):
            # Three side-by-side columns; `scale` sets their relative widths
            # (2 : 4 : 3) and `min_width` their minimum pixel widths.
            with gr.Row():
                # Left Zone: Agent's View
                with gr.Column(scale=2, min_width=350):
                    gr.HTML("<h2 style='text-align: center;'>Agent's View</h2>")
                    # Mimicking the blue instruction box from the image
                    gr.HTML("""
                    <div style='background-color: #e6f3ff; border: 1px solid #b3d9ff; border-radius: 8px; padding: 15px; font-family: sans-serif;'>
                    <div style='display: flex; align-items: center; margin-bottom: 10px;'>
                    <span style='font-size: 24px; margin-right: 10px;'>π§ </span>
                    <h3 style='margin: 0; color: #333;'>Instruction:</h3>
                    </div>
                    <p style='margin: 0; color: #555;'>Navigate to the Toaster in the room and be as close as possible to it.</p>
                    </div>
                    """)
                    # Mimicking the grey planning box from the image
                    # The ordered list starts at 4 to match the "Step 4-7" heading.
                    gr.HTML("""
                    <div style='background-color: #f5f5f5; border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; margin-top: 20px; font-family: sans-serif;'>
                    <div style='display: flex; align-items: center; margin-bottom: 10px;'>
                    <span style='font-size: 24px; margin-right: 10px;'>π¦Ύ</span>
                    <h3 style='margin: 0; color: #333;'>Environment Step 4-7:</h3>
                    </div>
                    <h4 style='margin-top: 10px; margin-bottom: 5px; color: #444;'>Planning:</h4>
                    <ol start="4" style='padding-left: 20px; margin: 0; color: #555;'>
                    <li>Move leftward by 0.25.</li>
                    <li>Move leftward by 0.25.</li>
                    <li>Move forward by 0.25.</li>
                    <li>Move forward by 0.25.</li>
                    </ol>
                    </div>
                    """)
                # Middle Zone: Closed-Loop Environmental Feedback
                with gr.Column(scale=4, min_width=500):
                    gr.HTML("<h2 style='text-align: center; color: #db83b5;'>Closed-Loop Environmental Feedback</h2>")
                    # All media below are read-only and loaded from fixed
                    # absolute paths under /home/user/app/demo_source_data.
                    # NOTE(review): the first-person video path points at
                    # X7HyMhZNoso/E145 while the other assets use
                    # 5ZKStnWn8Zo/E014 — confirm this mix is intended.
                    with gr.Row():
                        gr.Video("/home/user/app/demo_source_data/AR/FTwan21_lora/X7HyMhZNoso/E145/A001/world_model_gen/bbox_gen_video_1.mp4", label="First Person View", interactive=False)
                        gr.Image("/home/user/app/demo_source_data/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A000/real_obs_bbox.png", label="Bird's Eye View", type="pil", interactive=False)
                    gr.Model3D("/home/user/app/demo_source_data/scenes_glb/5ZKStnWn8Zo.glb", label="3D Scene", interactive=False)
                # Right Zone: World Model's Generation
                with gr.Column(scale=3, min_width=400):
                    gr.HTML("<h2 style='text-align: center;'>World Model's Generation</h2>")
                    # Using the new video path provided by the user
                    gr.Video("/home/user/app/demo_source_data/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A005/world_model_gen/obj_centered_gen_video_1.mp4", label="Generated View", interactive=False)
        with gr.TabItem("π Leaderboard"):
            # Read-only table populated once, at build time, from
            # create_leaderboard(). The last two columns are declared as
            # "number" although some rows carry placeholder strings there.
            leaderboard_table = gr.DataFrame(
                value=create_leaderboard(),
                headers=COLUMNS,
                datatype=["str", "str", "str", "str", "str", "number", "number"],
                interactive=False,
                wrap=True
            )
        with gr.TabItem("π About"):
            # Static explanatory text: model categories, metric meanings and
            # the legend for placeholder symbols used in the leaderboard.
            gr.Markdown("""
            # World-in-World: Building a Closed-Loop World Interface to Evaluate World Models
            This leaderboard showcases performance metrics across different types of AI models in world modeling tasks:
            ## Model Categories
            - **VLM**: Vision-Language Models
            - **Image Gen.**: Image Generation Models
            - **Video Gen.**: Video Generation Models
            - **Video Gen. Post-Train**: Post-training specialized Video Generation Models
            ## Metrics Explained
            - **Acc. β**: Accuracy score (higher values indicate better performance)
            - **Mean Traj. β**: Mean trajectory error (lower values indicate better performance)
            ## Notes
            - β indicates post-training specialized models
            - XXX indicates results pending/unavailable
            - β indicates not applicable or not available
            *Results represent performance on world modeling evaluation benchmarks and may vary across different evaluation settings.*
            """)
# Start the Gradio server only when this file is executed as a script, so
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()