Benchmark Results Report

""" # Process each Task/Step data for i, step_data in enumerate(data): task_id = step_data.get("task_id", "N/A") step_id = step_data.get("step_id", "N/A") intent = step_data.get("intent", "N/A") start_url = step_data.get("start_url", "N/A") gt_checklist = step_data.get("gt_checklist", "N/A") generated_checklist = step_data.get("generated_checklist", None) trajectory = step_data.get("trajectory", "N/A") text_observation = step_data.get("text_observation", "N/A") source_name = step_data.get("source_name", "") # Wrap each Task/Step in a container with a unique ID (hidden initially) html_content += f"""

Task ID: {html.escape(str(task_id))} | Step ID: {html.escape(str(step_id))} {f'({html.escape(source_name)})' if source_name else ''}

Intent:

{html.escape(intent)}

Start URL: {html.escape(start_url)}

Ground Truth Checklist:

{html.escape(gt_checklist)}

""" if checklist_generation and generated_checklist is not None: html_content += f"""

Generated Checklist (Click to expand/collapse)

{html.escape(str(generated_checklist))}

""" html_content += f"""

Trajectory (Click to expand/collapse)

{html.escape(trajectory)}

Text Observation (Click to expand/collapse)

{html.escape(text_observation)}

""" # Chosen Responses if 'chosen' in step_data and step_data['chosen']: html_content += '

Chosen Responses:

' for choice_block in step_data['chosen']: thought = choice_block.get('thought', 'N/A') action = choice_block.get('action', 'N/A') responses = choice_block.get('response', []) scores = choice_block.get('score', []) # Add Thought and Action information html_content += f"""

Thought:

{html.escape(thought)}

Action:

{html.escape(action)}

""" # Loop through responses and create toggles for idx, (response, score) in enumerate(zip(responses, scores)): html_content += f"""

Judge Response {idx + 1}: {html.escape(str(score))}

{html.escape(str(response))}

""" html_content += '

' # End chosen-section # Rejected Responses if 'rejected' in step_data and step_data['rejected']: html_content += '

Rejected Responses:

' for rejection_block in step_data['rejected']: thought = rejection_block.get('thought', 'N/A') action = rejection_block.get('action', 'N/A') responses = rejection_block.get('response', []) scores = rejection_block.get('score', []) # Add Thought and Action information html_content += f"""

Thought:

{html.escape(thought)}

Action:

{html.escape(action)}

""" # Loop through responses and create toggles for idx, (response, score) in enumerate(zip(responses, scores)): html_content += f"""

Judge Response {idx + 1}: {html.escape(str(score))}

{html.escape(str(response))}

""" html_content += '

' # End rejected-section html_content += """

""" # Finalize HTML and add JavaScript html_content += """