Spaces:

HASHIRUAgentX
/

hashiruAI

Running

File size: 3,957 Bytes

e09bf50

from gradio_client import Client
from datasets import load_dataset
import requests
import json
import time
import random
import os
import re
from datetime import datetime

# URL of the Wordle word list on GitHub (plain text, read one word per line);
# load_word_list() downloads and filters it.
WORD_LIST_URL = "https://raw.githubusercontent.com/tabatkins/wordle-list/main/words"

def load_word_list(timeout=10):
    """Download the word list from WORD_LIST_URL and return the usable words.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A list of lower-cased words, in download order, keeping only entries
        that are exactly five alphabetic characters long.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    resp = requests.get(WORD_LIST_URL, timeout=timeout)
    resp.raise_for_status()
    candidates = (line.strip().lower() for line in resp.text.splitlines())
    return [word for word in candidates if len(word) == 5 and word.isalpha()]

# Downloaded once at import time, so importing this module performs an HTTP
# request and fails if the word list cannot be fetched.
WORD_LIST = load_word_list()


def get_last_assistant_content(resp):
    """Return the text of the most recent assistant turn in a chat response.

    Args:
        resp: Either a list of turn dicts or a tuple whose first element is
            such a list. Any other shape yields an empty string.

    Returns:
        The payload of the latest assistant turn that carries one, or ``""``
        when no assistant turn has usable content. For each assistant turn,
        scanning from newest to oldest, the lookup order is:

        1. ``content["parts"][0]["text"]`` when ``content`` is a dict,
        2. ``content`` itself when it is any other truthy value,
        3. ``function_response["result"]["output"]``.
    """
    if isinstance(resp, tuple):
        # An empty tuple has no message list to inspect.
        resp = resp[0] if resp else None
    if not isinstance(resp, list):
        return ""

    for turn in reversed(resp):
        if not isinstance(turn, dict) or turn.get("role") != "assistant":
            continue

        content = turn.get("content")
        if isinstance(content, dict):
            # Structured content must be unwrapped *before* the generic
            # truthiness check; otherwise the dict itself would be returned
            # and callers expecting text would break.
            parts = content.get("parts") or []
            first = parts[0] if parts else None
            if isinstance(first, dict) and first.get("text"):
                return first["text"]
        elif content:
            return content

        # ``or {}`` also covers keys that are present but set to None.
        function_response = turn.get("function_response") or {}
        result = function_response.get("result") if isinstance(function_response, dict) else None
        output = result.get("output") if isinstance(result, dict) else None
        if output:
            return output

    return ""


def compute_feedback(guess, solution):
    """Score a guess against the solution, Wordle style.

    Returns a five-character string with one mark per position:
    "G" for a letter in the right place, "Y" for a letter that occurs
    elsewhere in the solution, and "B" otherwise. Each solution letter can
    back at most one mark, so repeated letters in the guess are only
    credited as often as they appear in the solution.
    """
    marks = ["B"] * 5
    remaining = list(solution)

    # First pass: exact matches. A matched solution letter is consumed so the
    # second pass cannot award it again.
    for pos, letter in enumerate(guess):
        if letter == remaining[pos]:
            marks[pos] = "G"
            remaining[pos] = None

    # Second pass: misplaced letters, drawn from whatever is still unconsumed.
    for pos, letter in enumerate(guess):
        if marks[pos] != "B" or letter not in remaining:
            continue
        marks[pos] = "Y"
        remaining[remaining.index(letter)] = None

    return "".join(marks)


def sanitize_guess(raw):
    """Extract a candidate guess from free-form model output.

    The text is lower-cased, then the first standalone run of exactly five
    letters a-z is returned. If there is none, everything except a-z is
    dropped and the last five remaining letters (or fewer, if the text is
    short) are returned instead.
    """
    lowered = raw.lower()

    standalone = re.search(r"\b[a-z]{5}\b", lowered)
    if standalone is not None:
        return standalone.group(0)

    # Fallback: squeeze out every non-letter and take the tail.
    letters_only = "".join(ch for ch in lowered if "a" <= ch <= "z")
    return letters_only[-5:]


def _build_wordle_prompt(guesses):
    """Return the prompt for the next turn, embedding the guess/feedback history so far."""
    history = "\n".join(f"Guess: {g}, Feedback: {f}" for g, f in guesses)
    return (
        "Wordle game. Guess the 5-letter word.\n"
        + (history + "\n" if history else "")
        + "Respond with a single 5-letter guess and with ONLY YOUR GUESS. NO FILLER OR PUNCTUATION."
        + "\n"
        + "Use the feedback format: G=green, Y=yellow, B=black.\n"
        + "(Green: letter in correct position, Yellow: letter in wrong position, Black: letter not in word)\n"
        + "Use tools and agents to help you guess the word.\n"
    )


def benchmark_wordle(num_games=10, max_guesses=6, max_invalid_retries=20):
    """Play Wordle games against the agent served at 127.0.0.1:7860 and log the results.

    For every game a random solution is drawn from WORD_LIST, the agent is
    prompted through the "/run" endpoint until it solves the word or uses up
    its guesses, and one JSON line describing the game is appended to a
    timestamped file under ``results/``.

    Args:
        num_games: Number of games to play.
        max_guesses: Valid guesses allowed per game.
        max_invalid_retries: Invalid responses (not a five-letter word from
            WORD_LIST) tolerated per game. Invalid responses never consume a
            guess, so without a cap an agent that keeps answering badly would
            loop forever; once the cap is reached the game is abandoned and
            recorded as unsolved. Pass ``None`` for unlimited retries.

    Returns:
        A list with one dict per game containing ``solution``, ``guesses``
        (``(guess, feedback)`` pairs), ``solved``, ``turns``, ``time``
        (seconds) and ``invalid_responses``.
    """
    client = Client("http://127.0.0.1:7860/")
    os.makedirs("results", exist_ok=True)
    out_path = os.path.join("results", f"wordle_benchmark_{datetime.now():%Y%m%d_%H%M%S}.jsonl")
    # Built once: validating every guess against the list would be a linear scan.
    valid_words = set(WORD_LIST)
    results = []

    for gi in range(num_games):
        solution = random.choice(WORD_LIST)
        print(f"Game {gi+1}/{num_games}, solution: {solution}")
        guesses = []
        attempts = 0
        invalid_responses = 0
        start_time = time.time()

        while attempts < max_guesses:
            prompt = _build_wordle_prompt(guesses)
            resp = client.predict(messages=[{"role": "user", "content": prompt}], api_name="/run")
            raw = get_last_assistant_content(resp)
            # A non-text payload cannot contain a guess; treat it as invalid
            # rather than letting the sanitizer fail on it.
            guess = sanitize_guess(raw) if isinstance(raw, str) else ""

            # If guess invalid, retry without counting (up to the retry cap)
            if len(guess) != 5 or guess not in valid_words:
                invalid_responses += 1
                if max_invalid_retries is not None and invalid_responses >= max_invalid_retries:
                    print(f"Warning: '{guess}' invalid; giving up after {invalid_responses} invalid responses.")
                    break
                print(f"Warning: '{guess}' invalid; retrying without using a turn.")
                continue

            feedback = compute_feedback(guess, solution)
            guesses.append((guess, feedback))
            attempts += 1
            print(f"Attempt {attempts}: {guess} -> {feedback}")
            if feedback == "GGGGG":
                break

        results.append({
            "solution": solution,
            "guesses": guesses,
            "solved": bool(guesses and guesses[-1][1] == "GGGGG"),
            "turns": len(guesses),
            "time": time.time() - start_time,
            "invalid_responses": invalid_responses,
        })
        # Append after every game so finished games survive a later crash.
        with open(out_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(results[-1]) + "\n")

    print(f"Benchmark complete, results saved to {out_path}")
    return results


# Script entry point: when executed directly, run a single benchmark game.
if __name__ == "__main__":
    benchmark_wordle(num_games=1)