# hashiruAI / bench / benchmarking_wordle.py
# Kunal Pai - "Add benchmarking script for Wordle game" (commit e09bf50)
# File size when captured: 3.96 kB
from gradio_client import Client
from datasets import load_dataset
import requests
import json
import time
import random
import os
import re
from datetime import datetime
# URL of the raw word list published in the tabatkins/wordle-list GitHub
# repository; load_word_list() downloads it to build the guess list.
WORD_LIST_URL = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"
def load_word_list(timeout=10):
    """Download the word list and return its usable five-letter entries.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the module import.

    Returns:
        A list of lowercase, purely alphabetic, five-character words in the
        order they appear in the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    resp = requests.get(WORD_LIST_URL, timeout=timeout)
    resp.raise_for_status()
    # Normalise lazily; only entries that survive the filter are kept.
    words = (line.strip().lower() for line in resp.text.splitlines())
    return [w for w in words if len(w) == 5 and w.isalpha()]
# Downloaded once at import time, so importing this module requires network
# access and raises if the request fails. Used below both to pick solutions
# and to validate guesses.
WORD_LIST = load_word_list()
def get_last_assistant_content(resp):
    """Return the text of the most recent assistant turn that carries any.

    Turns are scanned from newest to oldest. For each assistant turn the
    following sources are tried in order:

    1. ``content`` when it is a non-empty string;
    2. ``content["parts"][0]["text"]`` when ``content`` is a dict;
    3. ``function_response["result"]["output"]``.

    Args:
        resp: The value returned by the chat endpoint. Either a list of turn
            dicts, or a tuple whose first element is such a list.

    Returns:
        The extracted text, or ``""`` when ``resp`` has an unexpected shape
        or no assistant turn yields any text.
    """
    if isinstance(resp, tuple):
        # Guard against an empty tuple instead of raising IndexError.
        resp = resp[0] if resp else None
    if not isinstance(resp, list):
        return ""

    for turn in reversed(resp):
        if not isinstance(turn, dict) or turn.get("role") != "assistant":
            continue

        content = turn.get("content")
        if isinstance(content, str) and content:
            return content

        # Structured content must be unpacked *before* it could be returned
        # as-is; callers expect a string, not a dict.
        if isinstance(content, dict):
            parts = content.get("parts") or []
            first = parts[0] if parts else None
            if isinstance(first, dict) and first.get("text"):
                return first["text"]

        # ``or {}`` also covers keys that are present but set to None.
        fn_response = turn.get("function_response") or {}
        if isinstance(fn_response, dict):
            result = fn_response.get("result") or {}
            if isinstance(result, dict) and result.get("output"):
                return result["output"]

    return ""
def compute_feedback(guess, solution):
    """Score a guess against the solution using Wordle colouring rules.

    Args:
        guess: The guessed word (expected to be five letters).
        solution: The hidden word (expected to be five letters).

    Returns:
        A five-character string of ``G`` (right letter, right position),
        ``Y`` (letter occurs elsewhere and is not yet accounted for) and
        ``B`` (no remaining occurrence), one character per position.
    """
    marks = ["B"] * 5
    # Letters of the solution that have not been matched yet; a matched slot
    # is blanked out so a repeated guess letter cannot claim it twice.
    unmatched = list(solution)

    # First pass: exact positional hits take priority over yellows.
    for pos, letter in enumerate(guess):
        if letter == unmatched[pos]:
            marks[pos] = "G"
            unmatched[pos] = None

    # Second pass: each remaining guess letter may consume one leftover
    # occurrence anywhere in the solution.
    for pos, letter in enumerate(guess):
        if marks[pos] != "B":
            continue
        if letter not in unmatched:
            continue
        marks[pos] = "Y"
        unmatched[unmatched.index(letter)] = None

    return "".join(marks)
def sanitize_guess(raw):
    """Extract a candidate five-letter guess from free-form model output.

    The text is lower-cased. The first standalone run of exactly five
    letters ``a``-``z`` wins; when there is none, every character outside
    ``a``-``z`` is dropped and the last five remaining letters are returned
    (which may be fewer than five, or empty).

    Args:
        raw: The model's reply as a string.

    Returns:
        The extracted lowercase guess.
    """
    text = raw.lower()
    standalone = re.search(r"\b[a-z]{5}\b", text)
    if standalone is None:
        # No clean five-letter token: fall back to the tail of the letters.
        letters_only = re.sub(r"[^a-z]", "", text)
        return letters_only[-5:]
    return standalone.group(0)
def _build_prompt(guesses):
    """Compose the per-turn prompt, replaying earlier guesses and feedback."""
    history = "\n".join(f"Guess: {g}, Feedback: {f}" for g, f in guesses)
    return (
        "Wordle game. Guess the 5-letter word.\n"
        + (history + "\n" if history else "")
        + "Respond with a single 5-letter guess and with ONLY YOUR GUESS. NO FILLER OR PUNCTUATION.\n"
        + "Use the feedback format: G=green, Y=yellow, B=black.\n"
        + "(Green: letter in correct position, Yellow: letter in wrong position, Black: letter not in word)\n"
        + "Use tools and agents to help you guess the word.\n"
    )


def benchmark_wordle(num_games=10, max_guesses=6, max_invalid_retries=10):
    """Play Wordle games against the chat endpoint and log one result per game.

    For every game a random solution is drawn from ``WORD_LIST``; the model is
    prompted through the ``/run`` endpoint until it solves the word or uses
    up its guesses. Each finished game is appended as one JSON line to a
    timestamped file under ``results/``.

    Args:
        num_games: Number of games to play.
        max_guesses: Valid guesses allowed per game.
        max_invalid_retries: Invalid responses (wrong length or not in the
            word list) tolerated per game without costing a turn. Once this
            is exceeded the game ends unsolved, so a model that never
            produces a valid word cannot stall the benchmark. ``None``
            disables the cap.

    Returns:
        A list with one dict per game containing ``solution``, ``guesses``
        (``(guess, feedback)`` pairs), ``solved``, ``turns``, ``time``
        (seconds) and ``invalid_guesses``.
    """
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_path = os.path.join("results", f"wordle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")

    # Built once: set membership is O(1), a list scan per turn is not.
    valid_words = set(WORD_LIST)

    results = []
    for gi in range(num_games):
        solution = random.choice(WORD_LIST)
        print(f"Game {gi+1}/{num_games}, solution: {solution}")
        guesses = []
        invalid_guesses = 0
        start_time = time.time()

        while len(guesses) < max_guesses:
            resp = client.predict(
                messages=[{"role": "user", "content": _build_prompt(guesses)}],
                api_name="/run",
            )
            guess = sanitize_guess(get_last_assistant_content(resp))

            # Invalid guesses do not consume a turn, but they are bounded.
            if len(guess) != 5 or guess not in valid_words:
                invalid_guesses += 1
                if max_invalid_retries is not None and invalid_guesses > max_invalid_retries:
                    print(f"Giving up on this game after {invalid_guesses} invalid guesses.")
                    break
                print(f"Warning: '{guess}' invalid; retrying without using a turn.")
                continue

            feedback = compute_feedback(guess, solution)
            guesses.append((guess, feedback))
            print(f"Attempt {len(guesses)}: {guess} -> {feedback}")
            if feedback == "GGGGG":
                break

        results.append({
            "solution": solution,
            "guesses": guesses,
            "solved": bool(guesses and guesses[-1][1] == "GGGGG"),
            "turns": len(guesses),
            "time": time.time() - start_time,
            "invalid_guesses": invalid_guesses,
        })
        # Append per game so completed results survive a crash mid-run.
        with open(out_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(results[-1]) + "\n")

    print(f"Benchmark complete, results saved to {out_path}")
    return results
# Script entry point: run a single game as a quick smoke benchmark.
if __name__ == "__main__":
    benchmark_wordle(num_games=1)