import base64
import html
import io
import json

# Pillow is only needed by image_to_base64_url(); guarding the import keeps the
# JSON and report helpers below usable where Pillow is not installed.
try:
    from PIL import Image
except ImportError:
    Image = None


def image_to_base64_url(image: "str | Image.Image") -> str:
    """Return *image* encoded as a ``data:image/png;base64,...`` URL.

    Args:
        image: Either a filesystem path, whose bytes are embedded verbatim,
            or a PIL image, which is re-encoded as PNG. PIL images in mode
            ``RGBA`` or ``LA`` are converted to ``RGB`` before encoding.

    Returns:
        The data URL as a string.

    Raises:
        ValueError: If *image* is neither a ``str`` nor a PIL image.
    """
    if isinstance(image, str):
        # NOTE(review): the bytes are labelled image/png whatever the real
        # format of the file is -- confirm callers only pass PNG paths.
        with open(image, "rb") as f:
            payload = f.read()
    elif Image is not None and isinstance(image, Image.Image):
        if image.mode in ("RGBA", "LA"):
            image = image.convert("RGB")
        with io.BytesIO() as buffer:
            image.save(buffer, format="PNG")
            payload = buffer.getvalue()
    else:
        raise ValueError(f"Invalid image type: {type(image)}")
    return "data:image/png;base64," + base64.b64encode(payload).decode("utf-8")


def load_json(file_path: str) -> dict:
    """Read the JSON document stored at *file_path* and return it."""
    # UTF-8 is requested explicitly so the result does not depend on the
    # platform's locale encoding (create_html_report reads JSON the same way).
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(data: dict, file_path: str):
    """Write *data* to *file_path* as JSON indented by four spaces."""
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


def str_to_bool(s: str) -> bool:
    """Convert a textual flag to ``bool`` (case-insensitive).

    ``true``/``1``/``yes``/``y`` map to ``True`` and
    ``false``/``0``/``no``/``n`` map to ``False``.

    Raises:
        ValueError: If *s* is not one of the recognised spellings.
    """
    normalized = s.lower()
    if normalized in ("true", "1", "yes", "y"):
        return True
    if normalized in ("false", "0", "no", "n"):
        return False
    raise ValueError(f"Invalid boolean string: {s}")


def _escape(value) -> str:
    """Return *value* converted to text and HTML-escaped.

    Values loaded from JSON may be ``None``, numbers, lists or dicts;
    ``html.escape`` only accepts ``str``, so everything is stringified first.
    """
    return html.escape(str(value))


def _render_response_section(title: str, blocks) -> list[str]:
    """Return the report fragments for one "Chosen"/"Rejected" section."""
    parts = [f"\n\n{title}\n\n"]
    for block in blocks:
        thought = block.get("thought", "N/A")
        action = block.get("action", "N/A")
        responses = block.get("response", [])
        scores = block.get("score", [])

        parts.append(
            f"\n\nThought:\n\n{_escape(thought)}"
            f"\n\nAction:\n\n{_escape(action)}\n"
        )
        # zip() stops at the shorter list, so a response without a matching
        # score (or the reverse) is not rendered.
        for number, (response, score) in enumerate(zip(responses, scores), start=1):
            parts.append(
                f"\nJudge Response {number}: {_escape(score)}\n{_escape(response)}\n"
            )
    parts.append("\n")  # End of the section
    return parts


def _render_step(step_data: dict, checklist_generation: bool) -> list[str]:
    """Return the report fragments for a single task/step record."""
    task_id = step_data.get("task_id", "N/A")
    step_id = step_data.get("step_id", "N/A")
    intent = step_data.get("intent", "N/A")
    start_url = step_data.get("start_url", "N/A")
    gt_checklist = step_data.get("gt_checklist", "N/A")
    generated_checklist = step_data.get("generated_checklist", None)
    trajectory = step_data.get("trajectory", "N/A")
    text_observation = step_data.get("text_observation", "N/A")
    source_name = step_data.get("source_name", "")

    source_suffix = f"({_escape(source_name)})" if source_name else ""
    parts = [
        f"\n\nTask ID: {_escape(task_id)} | Step ID: {_escape(step_id)} {source_suffix}"
        f"\n\nIntent:\n\n{_escape(intent)}"
        f"\n\nStart URL: {_escape(start_url)}"
        f"\n\nGround Truth Checklist:\n\n{_escape(gt_checklist)}\n"
    ]

    if checklist_generation and generated_checklist is not None:
        parts.append(
            "\nGenerated Checklist (Click to expand/collapse)"
            f"\n{_escape(generated_checklist)}\n"
        )

    parts.append(
        f"\nTrajectory (Click to expand/collapse)\n{_escape(trajectory)}"
        f"\nText Observation (Click to expand/collapse)\n{_escape(text_observation)}\n\n"
    )

    # A section is emitted only when its key is present and non-empty.
    for key, title in (("chosen", "Chosen Responses:"), ("rejected", "Rejected Responses:")):
        blocks = step_data.get(key)
        if blocks:
            parts.extend(_render_response_section(title, blocks))

    parts.append("\n")  # End of this task/step
    return parts


def create_html_report(json_path, html_path, checklist_generation=False):
    """
    Reads the given JSON result file and generates a filterable HTML report.

    The JSON document is iterated as a sequence of step records (dicts). Every
    value taken from a record is converted to text and HTML-escaped before it
    is written, so ``null``, numeric, list or dict fields are rendered instead
    of aborting the report.

    Failures while reading/parsing the JSON file or while writing the output
    are reported with ``print`` and are not raised.

    Args:
        json_path (str): Path to the input JSON file.
        html_path (str): Path to the output HTML file.
        checklist_generation (bool): When true, a record's
            ``generated_checklist`` (if not ``None``) is included as well.

    Returns:
        None
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: JSON file not found - {json_path}")
        return
    except json.JSONDecodeError:
        print(f"Error: JSON file parsing error - {json_path}")
        return
    except Exception as e:
        print(f"Unexpected error during data loading: {e}")
        return

    # NOTE(review): the template fragments in this function and its helpers
    # contain only text -- no tags, styles or script -- although the original
    # comments speak of containers, expand/collapse toggles and JavaScript.
    # Confirm the markup was not stripped from this file.
    parts = [" Benchmark Results Report\n\nBenchmark Results Report\n\n"]

    # Process each Task/Step record
    for step_data in data:
        parts.extend(_render_step(step_data, checklist_generation))

    # Finalize the document
    parts.append("\n")
    html_content = "".join(parts)

    # Save the HTML file
    try:
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"Completed: HTML report created at {html_path}")
    except OSError:
        print(f"Error: Failed to write HTML file - {html_path}")
    except Exception as e:
        print(f"Unexpected error during HTML file saving: {e}")


# --- Example Usage ---
# input_json_file = 'path/to/your/results.json'
# output_html_file = 'trajectory_report.html'
# create_html_report(input_json_file, output_html_file)