galb-dai committed on
Commit
5662cd4
·
1 Parent(s): e5e5305
Files changed (1) hide show
  1. src/about.py +10 -0
src/about.py CHANGED
@@ -44,6 +44,8 @@ WHAT_IS_F1_HTML_TOP = f"""
44
  """
45
 
46
  # Bottom is split so we can insert real Gradio media (images/video) from app.py.
 
 
47
  WHAT_IS_F1_HTML_BOTTOM_A = """
48
  <div class="f1-container">
49
  <section>
@@ -61,11 +63,13 @@ WHAT_IS_F1_HTML_BOTTOM_A = """
61
  <!-- bag_modifications figure inserted via gr.Image in app.py -->
62
  """
63
 
 
64
  WHAT_IS_F1_HTML_BOTTOM_B = """
65
  <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
66
  <!-- Video inserted via gr.Video in app.py -->
67
  """
68
 
 
69
  WHAT_IS_F1_HTML_AFTER_VIDEO = """
70
  <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem &mdash; <code>Maximal-Cluster-Graph</code> &mdash; <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
71
  </section>
@@ -81,17 +85,23 @@ WHAT_IS_F1_HTML_AFTER_VIDEO = """
81
  </ul>
82
  <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
83
  <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
 
84
 
 
 
 
85
  <h2 class="f1-h2">Model Accuracy</h2>
86
  <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
87
  <!-- warmup_performance figure inserted via gr.Image in app.py -->
88
  """
89
 
 
90
  WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
91
  <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
92
  <!-- tier1_performance figure inserted via gr.Image in app.py -->
93
  """
94
 
 
95
  WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
96
  <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
97
  </section>
 
44
  """
45
 
46
  # Bottom is split so we can insert real Gradio media (images/video) from app.py.
47
+
48
+ # Up to before the first figure (bag_modifications.png)
49
  WHAT_IS_F1_HTML_BOTTOM_A = """
50
  <div class="f1-container">
51
  <section>
 
63
  <!-- bag_modifications figure inserted via gr.Image in app.py -->
64
  """
65
 
66
+ # After the first figure, before the video
67
  WHAT_IS_F1_HTML_BOTTOM_B = """
68
  <p class="mb-4 f1-p">An algorithm can then traverse this tree of bags, solving the problem piece by piece using dynamic programming. This process involves designing a “state” that summarises all necessary information about the partial solution within a bag, and then defining how this state transforms as vertices are introduced, forgotten, or bags are merged.</p>
69
  <!-- Video inserted via gr.Video in app.py -->
70
  """
71
 
72
+ # Text immediately after the video; opens Evaluation section header/content (up to, but not including, the "Model Accuracy" subsection)
73
  WHAT_IS_F1_HTML_AFTER_VIDEO = """
74
  <p class="f1-p">The deceptive simplicity of the problem statements belies the <strong>extraordinary difficulty</strong> of discovering the correct dynamic programming solution. This process is riddled with subtle combinatorial and logical pitfalls, demanding a profound understanding of the problem’s underlying structure. For a detailed walkthrough of the fifteen interdependent reasoning steps required to solve a single hard problem &mdash; <code>Maximal-Cluster-Graph</code> &mdash; <a href="https://arxiv.org/pdf/2507.13337#appendix.A" target="_blank" rel="noopener noreferrer" class="f1-a">see the appendix of our paper</a>.</p>
75
  </section>
 
85
  </ul>
86
  <p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
87
  <p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems.</p>
88
+ """
89
 
90
+ # *** THIS WAS MISSING BEFORE ***
91
+ # Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
92
+ WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG = """
93
  <h2 class="f1-h2">Model Accuracy</h2>
94
  <p class="mb-4 f1-p">On the <strong>FormulaOne-Warmup</strong> problems, frontier models perform reasonably well. This confirms they have a foundational capability for these types of algorithmic tasks.</p>
95
  <!-- warmup_performance figure inserted via gr.Image in app.py -->
96
  """
97
 
98
+ # Between Warmup and Tier 1 figures
99
  WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
100
  <p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
101
  <!-- tier1_performance figure inserted via gr.Image in app.py -->
102
  """
103
 
104
+ # Tail after Tier 1 figure (closes evaluation section + container)
105
  WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
106
  <p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
107
  </section>