File size: 7,287 Bytes
4e5eb13
 
 
452c890
34660db
452c890
 
 
 
 
 
 
 
 
 
 
 
 
 
34660db
 
452c890
34660db
948cba3
 
452c890
 
 
 
 
 
 
4e5eb13
948cba3
 
 
 
a652572
09d7cf1
a652572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f36044d
 
a652572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09d7cf1
948cba3
 
 
 
 
 
0123b5a
948cba3
 
 
34660db
948cba3
34660db
948cba3
34660db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948cba3
 
34660db
 
a652572
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import pandas as pd

# Leaderboard column headers, in display order. Each row of STATIC_DATA
# supplies exactly one value per header, in this same order.
COLUMNS = [
    "Method",
    "#Param.",
    "Input Type",
    "Control Type",
    "Model Type",
    "Mean Traj. ↓",
    "Acc. ↑",
]

# Placeholder cell values used where a row has no concrete value to show.
_DASH = "–"
_PENDING = "XXX"

# Static leaderboard rows, grouped by the value of their "Model Type" column.
# Column order: Method, #Param., Input Type, Control Type, Model Type,
# Mean Traj. ↓, Acc. ↑
STATIC_DATA = [
    # Model Type: VLM
    ["w/o WM", "72B", "RGB", _DASH, "VLM", 6.24, 50.27],
    # Model Type: Image Gen.
    ["PathDreamer [36]", "0.69B", "RGB-D; Sem; Pano", "Viewpoint", "Image Gen.", 5.28, 56.99],
    ["SE3DS [11]", "1.1B", "RGB-D; Pano", "Viewpoint", "Image Gen.", 5.29, 57.53],
    # Model Type: Video Gen.
    ["NWM [25]", "1B", "RGB", "Trajectory", "Video Gen.", 5.68, 57.35],
    ["SVD [6]", "1.5B", "RGB", "Image", "Video Gen.", 5.29, 57.71],
    ["LTX-Video [5]", "2B", "RGB", "Text", "Video Gen.", 5.37, 56.08],
    ["Hunyuan [4]", "13B", "RGB", "Text", "Video Gen.", 5.21, 57.71],
    ["Wan2.1 [23]", "14B", "RGB", "Text", "Video Gen.", 5.24, 58.26],
    ["Cosmos [1]", "2B", "RGB", "Text", "Video Gen.", 5.898, 52.27],
    ["Runway", _DASH, _DASH, "Text", "Video Gen.", _DASH, _DASH],
    # Model Type: Video Gen. Post-Train (method names carry a dagger)
    ["SVD† [6]", "1.5B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.02, 60.98],
    ["LTX† [5]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.49, 57.53],
    ["WAN2.1† [23]", "14B", "RGB; Pano", "Action", "Video Gen. Post-Train", _PENDING, _PENDING],
    ["Cosmos† [1]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.08, 60.25],
]

def create_leaderboard(data=None, columns=None, sort_by="Acc. ↑"):
    """Build the leaderboard table, best score first.

    Rows are ordered by the numeric value of the ``sort_by`` column in
    descending order. Cells that are not numeric (for example the "–" and
    "XXX" placeholders) are placed after all numeric rows. Rows with equal
    scores, and the non-numeric rows among themselves, keep their original
    relative order. The returned cells are the original values, so the
    placeholders are shown unchanged.

    Args:
        data: Row lists to display. Defaults to the module-level
            ``STATIC_DATA``.
        columns: Column headers matching each row. Defaults to the
            module-level ``COLUMNS``.
        sort_by: Name of the column to rank by.

    Returns:
        A new ``pandas.DataFrame`` with a fresh 0..n-1 index.
    """
    rows = STATIC_DATA if data is None else data
    headers = COLUMNS if columns is None else columns
    df = pd.DataFrame(rows, columns=headers)

    # Numeric view of the ranking column; anything unparsable becomes NaN.
    # The key lives in its own Series so the displayed values are untouched.
    sort_key = pd.to_numeric(df[sort_by], errors="coerce").astype("float64")

    # Sorting the negated key ascending with a stable algorithm yields a
    # descending order in which ties keep their input order; NaN rows go last.
    ranked_labels = (-sort_key).sort_values(kind="mergesort", na_position="last").index

    # Select by label (not position) so the lookup matches the index that
    # the sorted key carries.
    return df.loc[ranked_labels].reset_index(drop=True)

# Build the whole UI: a page heading followed by three tabs
# (interactive demo, leaderboard, about).
# The emoji in the heading and in the tab labels are written as literal
# UTF-8 characters; they had previously been stored as mis-decoded bytes.
with gr.Blocks(title="World-in-World: Building a Closed-Loop World Interface to Evaluate World Models", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1 style='text-align: center; margin-bottom: 1rem'>🏆 World-in-World: Building a Closed-Loop World Interface to Evaluate World Models</h1>")

    with gr.Tabs():
        with gr.TabItem("🧑‍🏫 Interactive Demo"):
            # NOTE(review): every media path below is an absolute path under
            # /home/user/app; confirm the files exist at that location in the
            # deployment environment.
            with gr.Row():
                # Left Zone: Instructions
                with gr.Column(scale=2, min_width=350):
                    # Mimicking the blue instruction box from the image
                    gr.HTML("""
                        <div style='background-color: #e6f3ff; border: 1px solid #b3d9ff; border-radius: 8px; padding: 15px; font-family: sans-serif;'>
                            <div style='display: flex; align-items: center; margin-bottom: 10px;'>
                                <span style='font-size: 24px; margin-right: 10px;'>🧠</span>
                                <h3 style='margin: 0; color: #333;'>Instruction:</h3>
                            </div>
                            <p style='margin: 0; color: #555;'>Navigate to the Toaster in the room and be as close as possible to it.</p>
                        </div>
                    """)
                    # Mimicking the grey planning box from the image
                    gr.HTML("""
                        <div style='background-color: #f5f5f5; border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; margin-top: 20px; font-family: sans-serif;'>
                            <div style='display: flex; align-items: center; margin-bottom: 10px;'>
                                <span style='font-size: 24px; margin-right: 10px;'>🦾</span>
                                <h3 style='margin: 0; color: #333;'>Environment Step 4-7:</h3>
                            </div>
                            <h4 style='margin-top: 10px; margin-bottom: 5px; color: #444;'>Planning:</h4>
                            <ol start="4" style='padding-left: 20px; margin: 0; color: #555;'>
                                <li>Move leftward by 0.25.</li>
                                <li>Move leftward by 0.25.</li>
                                <li>Move forward by 0.25.</li>
                                <li>Move forward by 0.25.</li>
                            </ol>
                        </div>
                    """)

                # Middle Zone: Closed-Loop Environmental Feedback
                with gr.Column(scale=4, min_width=400):
                    gr.HTML("<h2 style='text-align: center; color: #db83b5;'>Closed-Loop Environmental Feedback</h2>")
                    with gr.Row():
                        # The single video on the left of this zone
                        gr.Video("/home/user/app/demo_source_data/AR/FTwan21_lora/X7HyMhZNoso/E145/A001/world_model_gen/bbox_gen_video_1.mp4", label="First Person View", interactive=False)
                        # The column with the other two views on the right
                        with gr.Column():
                            gr.Image("/home/user/app/demo_source_data/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A000/real_obs_bbox.png", label="Bird's Eye View", type="pil", interactive=False)
                            gr.Model3D("/home/user/app/demo_source_data/scenes_glb/5ZKStnWn8Zo.glb", label="3D Scene", interactive=False)

                # Right Zone: World Model's Generation
                with gr.Column(scale=3, min_width=300):
                    gr.HTML("<h2 style='text-align: center;'>World Model's Generation</h2>")
                    gr.Video("/home/user/app/demo_source_data/AR/FTwan21_lora/X7HyMhZNoso/E145/A001/world_model_gen/gen_video_1.mp4", label="Generated View", interactive=False)


        with gr.TabItem("📊 Leaderboard"):
            # Read-only table; the rows come pre-sorted from create_leaderboard().
            # NOTE(review): the last two columns are declared "number" but the
            # data also holds placeholder strings ("–", "XXX") - confirm they
            # render as intended.
            leaderboard_table = gr.DataFrame(
                value=create_leaderboard(),
                headers=COLUMNS,
                datatype=["str", "str", "str", "str", "str", "number", "number"],
                interactive=False,
                wrap=True
            )

        with gr.TabItem("📝 About"):
            gr.Markdown("""
            # World-in-World: Building a Closed-Loop World Interface to Evaluate World Models
            
            This leaderboard showcases performance metrics across different types of AI models in world modeling tasks:
            
            ## Model Categories
            - **VLM**: Vision-Language Models
            - **Image Gen.**: Image Generation Models  
            - **Video Gen.**: Video Generation Models
            - **Video Gen. Post-Train**: Post-training specialized Video Generation Models
            
            ## Metrics Explained
            - **Acc. ↑**: Accuracy score (higher values indicate better performance)
            - **Mean Traj. ↓**: Mean trajectory error (lower values indicate better performance)
            
            ## Notes
            - † indicates post-training specialized models
            - XXX indicates results pending/unavailable
            - – indicates not applicable or not available
            
            *Results represent performance on world modeling evaluation benchmarks and may vary across different evaluation settings.*
            """)

# Launch the `demo` Blocks app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()