"""Benchmark a locally served chat agent on Wordle.

The script downloads a list of five-letter words, picks a random solution
for each game, repeatedly asks the agent (through a Gradio endpoint) for a
guess, scores the guess with standard Wordle feedback, and appends one JSON
record per game to a timestamped ``.jsonl`` file under ``results/``.
"""

import json
import os
import random
import re
import time
from datetime import datetime

import requests
from datasets import load_dataset  # not used in this file; kept as in the original
from gradio_client import Client

# Fetch the official Wordle guess list from GitHub
WORD_LIST_URL = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"

# First standalone five-letter run of lowercase ASCII letters in a reply.
_FIVE_LETTER_WORD = re.compile(r"\b[a-z]{5}\b")


def load_word_list(url=WORD_LIST_URL, timeout=30):
    """Download the word list and return its valid five-letter words.

    Args:
        url: Location of a newline-separated word list.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the program indefinitely.

    Returns:
        A list of lowercase, purely alphabetic, five-letter words in the
        order they appear in the downloaded file.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    words = (line.strip().lower() for line in resp.text.splitlines())
    return [w for w in words if len(w) == 5 and w.isalpha()]


# Loaded once at import time. The list form is needed by random.choice();
# the frozenset gives O(1) membership checks when validating guesses.
WORD_LIST = load_word_list()
_WORD_SET = frozenset(WORD_LIST)


def get_last_assistant_content(resp):
    """Return the most recent usable assistant reply from a chat response.

    ``resp`` may be a list of turn dicts or a tuple whose first element is
    such a list. Turns are scanned from newest to oldest; for each turn with
    ``role == "assistant"`` the following are tried in order:

    1. a non-empty ``content`` value that is not a dict,
    2. ``function_response["result"]["output"]``,
    3. ``content["parts"][0]["text"]`` when ``content`` is a dict.

    Returns:
        The first value found, or ``""`` when nothing usable exists.
    """
    if isinstance(resp, tuple):
        resp = resp[0] if resp else None
    if not isinstance(resp, list):
        return ""

    for turn in reversed(resp):
        if not isinstance(turn, dict) or turn.get("role") != "assistant":
            continue

        content = turn.get("content")
        # Dict content is handled below; returning it here would make the
        # "parts" branch unreachable and hand a dict to text-only callers.
        if content and not isinstance(content, dict):
            return content

        # "or {}" guards against keys that are present but set to None.
        function_response = turn.get("function_response") or {}
        result = function_response.get("result") or {}
        output = result.get("output") if isinstance(result, dict) else None
        if output:
            return output

        if isinstance(content, dict):
            parts = content.get("parts") or []
            first = parts[0] if parts else None
            if isinstance(first, dict) and first.get("text"):
                return first["text"]

    return ""


def compute_feedback(guess, solution):
    """Score ``guess`` against ``solution`` using Wordle rules.

    Returns:
        A five-character string where ``G`` marks a letter in the correct
        position, ``Y`` a letter present elsewhere in the solution, and
        ``B`` a letter that is absent (or whose occurrences are already
        accounted for by other marks).
    """
    feedback = ["B"] * 5
    remaining = list(solution)

    # Pass 1: exact matches. Matched solution letters are blanked so they
    # cannot also produce a yellow for a duplicate letter in the guess.
    for i, letter in enumerate(guess):
        if letter == remaining[i]:
            feedback[i] = "G"
            remaining[i] = None

    # Pass 2: misplaced letters. Each yellow consumes one unmatched
    # occurrence in the solution.
    for i, letter in enumerate(guess):
        if feedback[i] == "B" and letter in remaining:
            feedback[i] = "Y"
            remaining[remaining.index(letter)] = None

    return "".join(feedback)


def sanitize_guess(raw):
    """Extract a candidate guess from the agent's raw text reply.

    The text is lowercased and the first standalone five-letter word is
    returned. If there is none, every non-letter character is removed and
    the last five remaining letters are returned (possibly fewer than five,
    or an empty string).
    """
    raw = raw.lower()
    match = _FIVE_LETTER_WORD.search(raw)
    if match:
        return match.group(0)
    letters_only = re.sub(r"[^a-z]", "", raw)
    return letters_only[-5:]


def _build_prompt(guesses):
    """Build the prompt sent to the agent from the (guess, feedback) history."""
    history = "\n".join(f"Guess: {g}, Feedback: {fb}" for g, fb in guesses)
    return (
        "Wordle game. Guess the 5-letter word.\n"
        + (history + "\n" if history else "")
        + "Respond with a single 5-letter guess and with ONLY YOUR GUESS. NO FILLER OR PUNCTUATION."
        "\n"
        + "Use the feedback format: G=green, Y=yellow, B=black.\n"
        + "(Green: letter in correct position, Yellow: letter in wrong position, Black: letter not in word)\n"
        + "Use tools and agents to help you guess the word.\n"
    )


def benchmark_wordle(num_games=10, max_guesses=6, max_invalid_retries=5):
    """Play ``num_games`` Wordle games against the agent and log the results.

    Args:
        num_games: Number of games to play.
        max_guesses: Maximum number of valid guesses allowed per game.
        max_invalid_retries: How many consecutive invalid replies are retried
            (without consuming a guess) before the game is abandoned as
            unsolved. Pass ``None`` to retry without limit.

    Returns:
        A list with one dict per game containing ``solution``, ``guesses``
        (list of ``(guess, feedback)`` pairs), ``solved``, ``turns`` and
        ``time`` (elapsed seconds). Each record is also appended as a JSON
        line to a timestamped file in ``results/``.
    """
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_path = os.path.join(
        "results", f"wordle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl"
    )
    results = []

    for gi in range(num_games):
        solution = random.choice(WORD_LIST)
        print(f"Game {gi+1}/{num_games}, solution: {solution}")
        guesses = []
        attempts = 0
        invalid_streak = 0
        start_time = time.time()

        while attempts < max_guesses:
            prompt = _build_prompt(guesses)
            resp = client.predict(
                messages=[{"role": "user", "content": prompt}],
                api_name="/run",
            )
            raw = get_last_assistant_content(resp)
            # A non-text reply cannot contain a guess; treat it as invalid.
            guess = sanitize_guess(raw) if isinstance(raw, str) else ""

            # If guess invalid, retry without counting -- but only a bounded
            # number of times, otherwise an agent that never produces a valid
            # word would keep this loop running forever.
            if guess not in _WORD_SET:
                invalid_streak += 1
                if (
                    max_invalid_retries is not None
                    and invalid_streak > max_invalid_retries
                ):
                    print(
                        f"Warning: '{guess}' invalid; abandoning game after "
                        f"{max_invalid_retries} retries."
                    )
                    break
                print(f"Warning: '{guess}' invalid; retrying without using a turn.")
                continue
            invalid_streak = 0

            feedback = compute_feedback(guess, solution)
            guesses.append((guess, feedback))
            attempts += 1
            print(f"Attempt {attempts}: {guess} -> {feedback}")
            if feedback == "GGGGG":
                break

        results.append({
            "solution": solution,
            "guesses": guesses,
            "solved": bool(guesses and guesses[-1][1] == "GGGGG"),
            "turns": len(guesses),
            "time": time.time() - start_time,
        })
        # Append after every game so partial results survive a crash.
        with open(out_path, "a", encoding="utf-8") as out_file:
            out_file.write(json.dumps(results[-1]) + "\n")

    print(f"Benchmark complete, results saved to {out_path}")
    return results


if __name__ == "__main__":
    benchmark_wordle(num_games=1)