import json import os import re from datetime import datetime from typing import Tuple import pandas as pd from bs4 import BeautifulSoup def format_datetime(dt_str: str) -> str: """ Format a datetime string for display. :param dt_str: String representing a datetime in ISO format :return: Formatted datetime string """ return dt_str.replace("T", " ").split("+")[0] def read_json_line_by_line(file_path, commit_hash=None): """ Read a JSON file line by line, parsing each line as a separate JSON object. Optionally filter by commit_hash if provided. :param file_path: Path to the JSON file :param commit_hash: Optional commit hash to filter data :return: List of parsed JSON objects """ data = [] with open(file_path, "r") as f: for line in f: try: item = json.loads(line.strip()) # Filter by commit_hash if provided if commit_hash is None or item.get("commit_hash") == commit_hash: data.append(item) except json.JSONDecodeError: print(f"Skipping invalid JSON in {file_path}: {line}") return data def calculate_change(new: float, old: float, metric_name: str) -> Tuple[float, str]: """Calculate percentage change and return with appropriate emoji.""" pct_change = new - old if abs(pct_change) < 1: emoji = "↔️" elif pct_change > 0: emoji = "🟢" if "wer" not in metric_name.lower() else "❌" else: emoji = "❌" if "wer" not in metric_name.lower() else "🟢" return (pct_change, emoji) def has_changes(config, prev_dict, curr_dict): """Check if any metrics have changed.""" curr = curr_dict[config] prev = prev_dict[config] metrics = ["speed", "tokens_per_second", "average_wer", "qoi"] for key in metrics: if key in curr and key in prev: curr_val = curr[key] prev_val = prev[key] if abs(curr_val - prev_val) >= 1: # 1% threshold return True return False def format_metrics_table(config, prev_dict, curr_dict, improved, regressed): """Format metrics into a table string and track improvements/regressions.""" curr = curr_dict[config] prev = prev_dict[config] # improved = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0} # regressed = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0} metrics = [ ("Speed", "speed"), ("Tok/s", "tokens_per_second"), ("WER", "average_wer"), ("QoI", "qoi"), ] table = "```\nMetric Previous Current Change\n--------------------------------\n" for metric_name, key in metrics: if key in curr and key in prev: curr_val = curr[key] prev_val = prev[key] pct_change, _ = calculate_change(curr_val, prev_val, metric_name) if abs(pct_change) >= 1: # Only show metrics with changes table += f"{metric_name:<9} {prev_val:<11.2f} {curr_val:<10.2f} {pct_change:.2f}\n" # Track improvements/regressions if pct_change > 0: if "wer" not in metric_name.lower(): improved[key] += 1 else: regressed[key] += 1 else: if "wer" not in metric_name.lower(): regressed[key] += 1 else: improved[key] += 1 table += "```" return table def extract_status_and_os(cell_value): """ Extract status and OS versions from a cell, handling both HTML and plain text. Returns list of tuples: [(status, os_version), ...] """ results = [] cell_value = str(cell_value) # First, handle the case where there's no HTML tags if cell_value == "Not Supported": return results # Split the cell into parts (first element and subsequent

elements) parts = cell_value.split("

") for part in parts: part = part.strip("

") if not part: continue # Check if part contains warning symbol if "⚠️" in part: # Parse HTML to extract OS version from anchor tag soup = BeautifulSoup(part, "html.parser") # Find text after href that contains OS version text = soup.get_text() os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", text) if os_match: os_version = os_match.group(0) results.append(("⚠️", os_version)) else: # For success cases, OS version is directly in the text os_match = re.search(r"(iOS|iPadOS|macOS)\s+[\d.]+", part) if os_match: os_version = os_match.group(0) results.append(("✅", os_version)) return results def escape_string(s: str) -> str: """Escape a string to be used as a value in JSON.""" return ( s.replace("\\", "\\\\") .replace('"', '\\"') .replace("\n", "\\n") .replace("\r", "\\r") ) def analyze_support_changes(prev_csv, curr_csv): """Analyze support changes between CSV files.""" # Read CSV files prev_df = pd.read_csv(prev_csv) prev_df.set_index(prev_df.columns[0], inplace=True) curr_df = pd.read_csv(curr_csv) curr_df.set_index(curr_df.columns[0], inplace=True) # Get device lists (excluding first column which is the index) prev_devices = sorted(prev_df.columns[1:]) curr_devices = sorted(curr_df.columns[1:]) # Calculate device ratio device_ratio = len(curr_devices) / len(prev_devices) if prev_devices else 1 needs_alert = device_ratio < 0.9 # Alert if less than 90% of previous devices # Convert to dictionary for easier comparison prev_status = {} curr_status = {} # Process previous data for idx in range(len(prev_df)): model = prev_df.index[idx] for col_idx in range(1, len(prev_df.columns)): cell_value = prev_df.iloc[idx, col_idx] device = prev_df.columns[col_idx] statuses = extract_status_and_os(cell_value) for status, os_version in statuses: prev_status[(model, device, os_version)] = status # Process current data and track new configurations new_configs = [] for idx in range(len(curr_df)): model = curr_df.index[idx] for col_idx in range(1, len(curr_df.columns)): cell_value = curr_df.iloc[idx, col_idx] device = curr_df.columns[col_idx] statuses = extract_status_and_os(cell_value) for status, os_version in statuses: curr_status[(model, device, os_version)] = status # Check if this is a new configuration if (model, device, os_version) not in prev_status: new_configs.append((model, device, os_version)) # Find changes fixed_errors = [] new_errors = [] # Check all configurations that exist in both datasets common_configs = set(prev_status.keys()) & set(curr_status.keys()) for config in common_configs: model, device, os_version = config if prev_status[config] == "⚠️" and curr_status[config] == "✅": fixed_errors.append((model, device, os_version)) elif prev_status[config] == "✅" and curr_status[config] == "⚠️": new_errors.append((model, device, os_version)) return fixed_errors, new_errors, new_configs, needs_alert def generate_report(): # Load version data first to get commit hashes with open("report_data/version.json", "r") as f: version_data = json.load(f) # Get the last two commit hashes from releases array releases = version_data.get("releases", []) if len(releases) >= 2: curr_commit_hash = releases[-1] # latest commit prev_commit_hash = releases[-2] # previous commit else: curr_commit_hash = releases[-1] if releases else "" prev_commit_hash = "" # Load and filter performance data by commit hash prev_perf_data = read_json_line_by_line("dashboard_data/performance_data.json", commit_hash=prev_commit_hash) curr_perf_data = read_json_line_by_line("report_data/performance_data.json", commit_hash=curr_commit_hash) prev_dict = {(d["model"], d["device"], d["os"]): d for d in prev_perf_data} curr_dict = {(d["model"], d["device"], d["os"]): d for d in curr_perf_data} common_configs = set(curr_dict.keys()) & set(prev_dict.keys()) # Load version data with open("dashboard_data/version.json", "r") as f: prev_version = json.load(f) with open("report_data/version.json", "r") as f: curr_version = json.load(f) prev_releases = set(prev_version.get("releases", [])) curr_releases = set(curr_version.get("releases", [])) new_releases = curr_releases - prev_releases removed_releases = prev_releases - curr_releases # Track metrics improved_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0} regressed_metrics = {"speed": 0, "tokens_per_second": 0, "average_wer": 0, "qoi": 0} new_data_points = len(set(curr_dict.keys()) - set(prev_dict.keys())) # Analyze support changes fixed_errors, new_errors, new_configs, needs_alert = analyze_support_changes( "report_data/support_data.csv", "dashboard_data/support_data.csv" ) # Create Slack blocks current_time = datetime.now().strftime("%B %-d, %Y %H:%M:%S") prev_release_tag, curr_release_tag = ( prev_version["versions"][-1] if prev_version["versions"] else "N/A", curr_version["versions"][-1], ) slack_blocks = { "blocks": [ { "type": "header", "text": { "type": "plain_text", "text": "🔔 WhisperKit Dataset Update Report 🔔", "emoji": True, }, }, { "type": "context", "elements": [{"text": f"*{current_time}*", "type": "mrkdwn"}], }, {"type": "divider"}, { "type": "section", "text": {"type": "mrkdwn", "text": "ℹ️ *CURRENT VERSION INFO* ℹ️"}, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Last Modified:* `{format_datetime(curr_version['last_modified'])}`", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Dataset SHA:* `{curr_version['sha']}`", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Current Releases:* {', '.join(f'`{r}`' for r in curr_version['releases'])}", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Current Release Tag:* `{curr_release_tag}`", }, }, {"type": "divider"}, { "type": "section", "text": { "type": "mrkdwn", "text": "🔄 *SUMMARY OF PERFORMANCE UPDATES* 🔄", }, }, ] } # Add release information slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Added Releases:* {', '.join(sorted(new_releases)) if new_releases else 'None'}", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Removed Releases:* {', '.join(sorted(removed_releases)) if removed_releases else 'None'}", }, }, ] ) if prev_release_tag != curr_release_tag: slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"• *Release Tag Change:* `{prev_release_tag}` → `{curr_release_tag}`", }, } ) slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "\n", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": f"• *New Data Points:* `{new_data_points}` new configurations", }, }, { "type": "section", "text": { "type": "mrkdwn", "text": "\n", }, }, ] ) # Create performance text as a single mrkdwn string if common_configs: performance_text = "💡 *Performance Updates* 💡\n\n" # Group by model for better organization models = sorted(set(model for model, _, _ in common_configs)) for model in models: model_configs = sorted([cfg for cfg in common_configs if cfg[0] == model]) for config in model_configs: device_info = f"*{model}* ({config[2]})" if not has_changes(config, prev_dict, curr_dict): # If no changes, just add the model with a checkmark performance_text += f"{device_info} ✅\n\n" else: # If there are changes, show the metrics performance_text += f"{device_info}\n" table = format_metrics_table(config, prev_dict, curr_dict, improved_metrics, regressed_metrics) performance_text += table performance_text += "\n\n" # Add metrics summary for metric_name, key in [ ("Speed", "speed"), ("Tok/s", "tokens_per_second"), ("WER", "average_wer"), ("QoI", "qoi"), ]: slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"• *{metric_name}:* `{improved_metrics[key]}` improved, `{regressed_metrics[key]}` regressed", }, } ) # Add support changes section if fixed_errors or new_errors or new_configs: slack_blocks["blocks"].extend( [ {"type": "divider"}, { "type": "section", "text": {"type": "mrkdwn", "text": "📱 *DEVICE SUPPORT CHANGES* 📱"}, }, ] ) if fixed_errors: slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "*Successful Configurations That Override Previous Failures*", }, } ] ) for model, device, os_version in sorted(fixed_errors): slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"• {model} on {device} ({os_version})", }, } ) if new_errors: slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "*Failed Configurations That Override Previous Successes*", }, } ] ) for model, device, os_version in sorted(new_errors): slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"• {model} on {device} ({os_version})", }, } ) if new_configs: slack_blocks["blocks"].extend( [ { "type": "section", "text": { "type": "mrkdwn", "text": "*Newly Tested Configurations*", }, } ] ) for model, device, os_version in sorted(new_configs): slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": f"• {model} on {device} ({os_version})", }, } ) # Add alert if significant decrease in device count if needs_alert: slack_blocks["blocks"].append( { "type": "section", "text": { "type": "mrkdwn", "text": "⚠️ *ALERT:* Current device count is less than 90% of previous version's device count, test on more devices before updating the benchmark website!", }, } ) # Write to GITHUB_OUTPUT github_output = os.getenv("GITHUB_OUTPUT") if github_output: with open(github_output, "a") as f: f.write("slack_message_payload<