File size: 5,359 Bytes
4e5eb13
 
 
452c890
34660db
452c890
 
 
 
 
 
 
 
 
 
 
 
 
 
34660db
 
452c890
34660db
948cba3
 
452c890
 
 
 
 
 
 
4e5eb13
948cba3
 
 
 
09d7cf1
f36044d
09d7cf1
f36044d
09d7cf1
 
f36044d
 
 
 
 
 
 
 
 
 
 
09d7cf1
f36044d
 
 
b471609
 
 
09d7cf1
948cba3
 
 
 
 
 
0123b5a
948cba3
 
 
34660db
948cba3
34660db
948cba3
34660db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
948cba3
 
34660db
 
09d7cf1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import pandas as pd

# Column headers of the leaderboard table, in display order. Every row of
# STATIC_DATA lists its values in exactly this order.
COLUMNS = [
    "Method",
    "#Param.",
    "Input Type",
    "Control Type",
    "Model Type",
    "Mean Traj. ↓",
    "Acc. ↑",
]

# Hard-coded leaderboard rows, grouped here by their "Model Type" value.
# The last two entries of a row ("Mean Traj. ↓", "Acc. ↑") are floats when a
# result is present and a placeholder string ("–" or "XXX") otherwise.
STATIC_DATA = [
    # Model Type: VLM
    ["w/o WM", "72B", "RGB", "–", "VLM", 6.24, 50.27],

    # Model Type: Image Gen.
    ["PathDreamer [36]", "0.69B", "RGB-D; Sem; Pano", "Viewpoint", "Image Gen.", 5.28, 56.99],
    ["SE3DS [11]", "1.1B", "RGB-D; Pano", "Viewpoint", "Image Gen.", 5.29, 57.53],

    # Model Type: Video Gen.
    ["NWM [25]", "1B", "RGB", "Trajectory", "Video Gen.", 5.68, 57.35],
    ["SVD [6]", "1.5B", "RGB", "Image", "Video Gen.", 5.29, 57.71],
    ["LTX-Video [5]", "2B", "RGB", "Text", "Video Gen.", 5.37, 56.08],
    ["Hunyuan [4]", "13B", "RGB", "Text", "Video Gen.", 5.21, 57.71],
    ["Wan2.1 [23]", "14B", "RGB", "Text", "Video Gen.", 5.24, 58.26],
    ["Cosmos [1]", "2B", "RGB", "Text", "Video Gen.", 5.898, 52.27],
    ["Runway", "–", "–", "Text", "Video Gen.", "–", "–"],

    # Model Type: Video Gen. Post-Train (method names carry a "†" marker)
    ["SVD† [6]", "1.5B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.02, 60.98],
    ["LTX† [5]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.49, 57.53],
    ["WAN2.1† [23]", "14B", "RGB; Pano", "Action", "Video Gen. Post-Train", "XXX", "XXX"],
    ["Cosmos† [1]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.08, 60.25],
]

def create_leaderboard(data=None, columns=None):
    """Build the leaderboard table, ordered by accuracy from best to worst.

    Rows whose "Acc. ↑" entry is not numeric (placeholders such as "–" or
    "XXX") are placed after all numeric rows. Rows with equal accuracy, and
    the placeholder rows among themselves, keep their input order. The
    returned frame holds the original cell values, placeholders included;
    the numeric conversion is used for ordering only.

    Args:
        data: Row data accepted by ``pd.DataFrame``. Defaults to the
            module-level ``STATIC_DATA``.
        columns: Column names for ``data``; must include "Acc. ↑".
            Defaults to the module-level ``COLUMNS``.

    Returns:
        A ``pd.DataFrame`` with a fresh 0..n-1 index in display order.

    Raises:
        KeyError: If ``columns`` does not contain "Acc. ↑".
    """
    if data is None:
        data = STATIC_DATA
    if columns is None:
        columns = COLUMNS

    df = pd.DataFrame(data, columns=columns)

    # Non-numeric placeholders become NaN so they can be pushed to the bottom
    # without inventing a sentinel score.
    accuracy = pd.to_numeric(df["Acc. ↑"], errors="coerce")

    # A stable sort keeps tied rows in their input order; the default
    # algorithm gives no such guarantee.
    ranked = accuracy.sort_values(ascending=False, na_position="last", kind="stable")

    # The sorted index holds labels, so select with .loc rather than .iloc.
    return df.loc[ranked.index].reset_index(drop=True)

# Top-level UI definition. Three tabs are built, in this order:
#   1. a demo panel with an instruction, a plan, and media/3D viewers,
#   2. the leaderboard table produced by create_leaderboard(),
#   3. an "About" page explaining the categories, metrics and placeholders.
# The emoji in the heading and tab labels were previously stored as mojibake
# (UTF-8 bytes decoded with a single-byte code page) and are restored here.
with gr.Blocks(title="World-in-World: Building a Closed-Loop World Interface to Evaluate World Models", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1 style='text-align: center; margin-bottom: 1rem'>🏆 World-in-World: Building a Closed-Loop World Interface to Evaluate World Models</h1>")

    with gr.Tabs():
        # Demo tab, placed before the leaderboard tab.
        with gr.TabItem("🧑‍🏫 Instruction & Environmental Feedback"):
            with gr.Row():
                # Left column: the task instruction and the planned actions.
                with gr.Column(scale=1, min_width=300):  # Ensuring proper alignment
                    gr.HTML("<h3 style='text-align: center;'>Instruction:</h3>")
                    gr.Markdown("Navigate to the Toaster in the room and be as close as possible to it.")
                    gr.Markdown("""
                    **Environment Step 4-7:**

                    **Planning:**
                    1. Move leftward by 0.25.
                    2. Move leftward by 0.25.
                    3. Move forward by 0.25.
                    4. Move forward by 0.25.
                    """)

                # Right column: a video, an image and a 3D scene viewer.
                with gr.Column(scale=2, min_width=600):
                    gr.HTML("<h3 style='text-align: center;'>Closed-Loop Environmental Feedback</h3>")
                    # Fixed widths keep the video and image the same size.
                    # NOTE(review): the asset paths below are absolute under
                    # /home/user/app — presumably the deployment's working
                    # directory; confirm before running elsewhere.
                    # NOTE(review): the video is labelled "Left Image" — confirm
                    # that this label is intended.
                    gr.Video("/home/user/app/demo_source_data/AR/FTwan21_lora/X7HyMhZNoso/E145/A001/world_model_gen/bbox_gen_video_1.mp4", label="Left Image", width=480)
                    gr.Image("/home/user/app/demo_source_data/AR/FTwan21_lora/X7HyMhZNoso/E145/A001/real_obs.png", label="Bird Eye View", type="pil", width=480)
                    gr.HTML("<h4 style='text-align: center;'>3D Scene:</h4>")
                    with gr.Column(scale=1):
                        gr.Model3D("/home/user/app/demo_source_data/scenes_glb/5ZKStnWn8Zo.glb", label="3D Scene")

        # Read-only table; the last two columns are declared numeric even
        # though some rows hold placeholder strings there.
        with gr.TabItem("📊 Leaderboard"):
            leaderboard_table = gr.DataFrame(
                value=create_leaderboard(),
                headers=COLUMNS,
                datatype=["str", "str", "str", "str", "str", "number", "number"],
                interactive=False,
                wrap=True
            )

        with gr.TabItem("📝 About"):
            gr.Markdown("""
            # World-in-World: Building a Closed-Loop World Interface to Evaluate World Models
            
            This leaderboard showcases performance metrics across different types of AI models in world modeling tasks:
            
            ## Model Categories
            - **VLM**: Vision-Language Models
            - **Image Gen.**: Image Generation Models  
            - **Video Gen.**: Video Generation Models
            - **Video Gen. Post-Train**: Post-training specialized Video Generation Models
            
            ## Metrics Explained
            - **Acc. ↑**: Accuracy score (higher values indicate better performance)
            - **Mean Traj. ↓**: Mean trajectory error (lower values indicate better performance)
            
            ## Notes
            - † indicates post-training specialized models
            - XXX indicates results pending/unavailable
            - – indicates not applicable or not available
            
            *Results represent performance on world modeling evaluation benchmarks and may vary across different evaluation settings.*
            """)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()