github-actions[bot] committed
Commit 7e103cf · 1 Parent(s): d78bdc9

Sync with https://github.com/mozilla-ai/any-agent-demo

README.md CHANGED
@@ -11,10 +11,3 @@ pinned: false
 short_description: Find a surf spot near you
 license: apache-2.0
 ---
-
-# Welcome to Streamlit!
-
-Edit `/src/app.py` to customize this app to your heart's desire. :heart:
-
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
app.py CHANGED
@@ -1,8 +1,9 @@
-from components.sidebar import ssf_sidebar
-from constants import DEFAULT_TOOLS
-import streamlit as st
 import asyncio
+
 import nest_asyncio
+import streamlit as st
+from components.sidebar import ssf_sidebar
+from constants import DEFAULT_TOOLS
 from services.agent import (
     configure_agent,
     display_evaluation_results,
@@ -13,14 +14,11 @@ from services.agent import (
 
 nest_asyncio.apply()
 
-# Set page config
 st.set_page_config(page_title="Surf Spot Finder", page_icon="🏄", layout="wide")
 
-# Allow a user to resize the sidebar to take up most of the screen to make editing eval cases easier
 st.markdown(
     """
     <style>
-    /* When sidebar is expanded, adjust main content */
     section[data-testid="stSidebar"][aria-expanded="true"] {
         max-width: 99% !important;
     }
@@ -35,18 +33,16 @@ with st.sidebar:
     run_button = st.button("Run Agent 🤖", disabled=not is_valid, type="primary")
 
 
-# Main content
 async def main():
-    # Handle agent execution button click
     if run_button:
         agent, agent_config = await configure_agent(user_inputs)
         agent_trace = await run_agent(agent, agent_config)
 
         await display_output(agent_trace)
 
-        evaluation_result = await evaluate_agent(agent_config, agent_trace)
-
-        await display_evaluation_results(evaluation_result)
+        if user_inputs.run_evaluation:
+            evaluation_results = await evaluate_agent(agent_config, agent_trace)
+            await display_evaluation_results(evaluation_results)
     else:
         st.title("🏄 Surf Spot Finder")
         st.markdown(
@@ -56,7 +52,6 @@ async def main():
             "👈 Configure your search parameters in the sidebar and click Run to start!"
         )
 
-        # Display tools in a more organized way
         st.markdown("### 🛠️ Available Tools")
 
         st.markdown("""
@@ -92,7 +87,6 @@ async def main():
            with st.expander(f"🌐 {tool.__name__}"):
                st.markdown(tool.__doc__ or "No description available")
 
-        # add a check that all tools were listed
        if len(weather_tools) + len(location_tools) + len(web_tools) != len(
            DEFAULT_TOOLS
        ):
@@ -100,7 +94,6 @@ async def main():
            "Some tools are not listed. Please check the code for more details."
        )
 
-        # Add Custom Evaluation explanation section
        st.markdown("### 📊 Custom Evaluation")
        st.markdown("""
        The Surf Spot Finder includes a powerful evaluation system that allows you to customize how the agent's performance is assessed.
components/agent_status.py CHANGED
@@ -1,47 +1,82 @@
-from any_agent import AnyAgent
-from opentelemetry.sdk.trace.export import SimpleSpanProcessor
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Callable
-
-from opentelemetry.sdk.trace.export import (
-    SpanExporter,
-    SpanExportResult,
-)
-
-from any_agent import AgentFramework
-
-from any_agent.tracing import TracingProcessor
-from any_agent.tracing.trace import AgentSpan
-
-if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import ReadableSpan
-
-
-class StreamlitExporter(SpanExporter):
-    """Build an `AgentTrace` and export to the different outputs."""
-
-    def __init__(  # noqa: D107
-        self, agent_framework: AgentFramework, callback: Callable
-    ):
-        self.agent_framework = agent_framework
-        self.processor: TracingProcessor | None = TracingProcessor.create(
-            agent_framework
-        )
-        self.callback = callback
-
-    def export(self, spans: Sequence["ReadableSpan"]) -> SpanExportResult:  # noqa: D102
-        if not self.processor:
-            return SpanExportResult.SUCCESS
-
-        for readable_span in spans:
-            # Check if this span belongs to our run
-            span = AgentSpan.from_readable_span(readable_span)
-            self.callback(span)
-
-        return SpanExportResult.SUCCESS
-
-
-def export_logs(agent: AnyAgent, callback: Callable) -> None:
-    exporter = StreamlitExporter(agent.framework, callback)
-    span_processor = SimpleSpanProcessor(exporter)
-    agent._tracer_provider.add_span_processor(span_processor)
+from collections.abc import Callable
+from typing import Any
+
+from any_agent.callbacks import Callback, Context
+from any_agent.tracing.attributes import GenAI
+
+
+class StreamlitStatusCallback(Callback):
+    """Callback to update Streamlit status with agent progress."""
+
+    def __init__(self, status_callback: Callable[[str], None]):
+        self.status_callback = status_callback
+
+    def after_llm_call(self, context: Context, *args, **kwargs) -> Context:
+        """Update status after LLM calls."""
+        span = context.current_span
+        input_value = span.attributes.get(GenAI.INPUT_MESSAGES, "")
+        output_value = span.attributes.get(GenAI.OUTPUT, "")
+
+        self._update_status(span.name, input_value, output_value)
+        return context
+
+    def after_tool_execution(self, context: Context, *args, **kwargs) -> Context:
+        """Update status after tool executions."""
+        span = context.current_span
+        input_value = span.attributes.get(GenAI.TOOL_ARGS, "")
+        output_value = span.attributes.get(GenAI.OUTPUT, "")
+
+        self._update_status(span.name, input_value, output_value)
+        return context
+
+    def _update_status(self, step_name: str, input_value: str, output_value: str):
+        """Update the Streamlit status with formatted information."""
+        if input_value:
+            try:
+                import json
+
+                parsed_input = json.loads(input_value)
+                if isinstance(parsed_input, list) and len(parsed_input) > 0:
+                    input_value = str(parsed_input[-1])
+            except Exception:
+                pass
+
+        if output_value:
+            try:
+                import json
+
+                parsed_output = json.loads(output_value)
+                if isinstance(parsed_output, list) and len(parsed_output) > 0:
+                    output_value = str(parsed_output[-1])
+            except Exception:
+                pass
+
+        max_length = 800
+        if len(input_value) > max_length:
+            input_value = f"[Truncated]...{input_value[-max_length:]}"
+        if len(output_value) > max_length:
+            output_value = f"[Truncated]...{output_value[-max_length:]}"
+
+        if input_value or output_value:
+            message = f"Step: {step_name}\n"
+            if input_value:
+                message += f"Input: {input_value}\n"
+            if output_value:
+                message += f"Output: {output_value}"
+        else:
+            message = f"Step: {step_name}"
+
+        self.status_callback(message)
+
+
+def export_logs(agent: Any, callback: Callable[[str], None]) -> None:
+    """Add a Streamlit status callback to the agent.
+
+    This function adds a custom callback to the agent that will update
+    the Streamlit status with progress information during agent execution.
+    """
+    status_callback = StreamlitStatusCallback(callback)
+
+    if agent.config.callbacks is None:
+        agent.config.callbacks = []
+    agent.config.callbacks.append(status_callback)
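
Note: a minimal sketch of how this callback is wired up from the Streamlit side. It mirrors the `export_logs(agent, update_status)` usage in `services/agent.py` further down in this commit; the `run_with_status` wrapper and its arguments are illustrative, not part of the commit itself.

```python
import streamlit as st

from components.agent_status import export_logs


async def run_with_status(agent, query: str):
    """Run an any-agent instance while streaming step updates to st.status."""
    with st.status("Agent is running...", expanded=False, state="running") as status:

        def update_status(message: str) -> None:
            # Each message built by StreamlitStatusCallback (one per LLM call or
            # tool execution) becomes the label of the running status widget.
            status.update(label=message, expanded=False, state="running")

        # Appends a StreamlitStatusCallback to agent.config.callbacks.
        export_logs(agent, update_status)
        agent_trace = await agent.run_async(query)
        status.update(label="Finished!", expanded=False, state="complete")
    return agent_trace
```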
components/inputs.py CHANGED
@@ -1,17 +1,19 @@
-from datetime import datetime, timedelta
+import copy
 import json
+from datetime import datetime, timedelta
+
+import pandas as pd
 import requests
 import streamlit as st
-from any_agent import AgentFramework
-from any_agent.tracing.trace import _is_tracing_supported
-from any_agent.evaluation import EvaluationCase
-from any_agent.evaluation.schemas import CheckpointCriteria
-import pandas as pd
-from constants import DEFAULT_EVALUATION_CASE, MODEL_OPTIONS
-import copy
-
+from constants import (
+    DEFAULT_EVALUATION_CRITERIA,
+    DEFAULT_EVALUATION_MODEL,
+    MODEL_OPTIONS,
+)
 from pydantic import BaseModel, ConfigDict
 
+from any_agent import AgentFramework
+
 
 class UserInputs(BaseModel):
     model_config = ConfigDict(extra="forbid")
@@ -20,7 +22,8 @@ class UserInputs(BaseModel):
     max_driving_hours: int
     date: datetime
     framework: str
-    evaluation_case: EvaluationCase
+    evaluation_model: str
+    evaluation_criteria: list[dict[str, str]]
     run_evaluation: bool
 
 
@@ -35,6 +38,7 @@ def get_area(area_name: str) -> dict:
 
     Returns:
         dict: The area found.
+
     """
     response = requests.get(
         f"https://nominatim.openstreetmap.org/search?q={area_name}&format=jsonv2",
@@ -42,8 +46,7 @@
         timeout=5,
     )
     response.raise_for_status()
-    response_json = json.loads(response.content.decode())
-    return response_json
+    return json.loads(response.content.decode())
 
 
 def get_user_inputs() -> UserInputs:
@@ -65,7 +68,6 @@ def get_user_inputs() -> UserInputs:
         "Select a date in the future", value=datetime.now() + timedelta(days=1)
     )
     with col_time:
-        # default to 9am
         time = st.selectbox(
             "Select a time",
             [datetime.strptime(f"{i:02d}:00", "%H:%M").time() for i in range(24)],
@@ -73,9 +75,7 @@
         )
     date = datetime.combine(date, time)
 
-    supported_frameworks = [
-        framework for framework in AgentFramework if _is_tracing_supported(framework)
-    ]
+    supported_frameworks = [framework for framework in AgentFramework]
 
     framework = st.selectbox(
         "Select the agent framework to use",
@@ -91,7 +91,6 @@ def get_user_inputs() -> UserInputs:
         format_func=lambda x: "/".join(x.split("/")[-3:]),
     )
 
-    # Add evaluation case section
     with st.expander("Custom Evaluation"):
         evaluation_model_id = st.selectbox(
             "Select the model to use for LLM-as-a-Judge evaluation",
@@ -99,47 +98,35 @@ def get_user_inputs() -> UserInputs:
             index=2,
             format_func=lambda x: "/".join(x.split("/")[-3:]),
         )
-        evaluation_case = copy.deepcopy(DEFAULT_EVALUATION_CASE)
-        evaluation_case.llm_judge = evaluation_model_id
-        # make this an editable json section
-        # convert the checkpoints to a df series so that it can be edited
-        checkpoints = evaluation_case.checkpoints
-        checkpoints_df = pd.DataFrame(
-            [checkpoint.model_dump() for checkpoint in checkpoints]
-        )
-        checkpoints_df = st.data_editor(
-            checkpoints_df,
+
+        evaluation_criteria = copy.deepcopy(DEFAULT_EVALUATION_CRITERIA)
+
+        criteria_df = pd.DataFrame(evaluation_criteria)
+        criteria_df = st.data_editor(
+            criteria_df,
             column_config={
-                "points": st.column_config.NumberColumn(label="Points"),
                 "criteria": st.column_config.TextColumn(label="Criteria"),
             },
             hide_index=True,
             num_rows="dynamic",
        )
-        # for each checkpoint, convert it back to a CheckpointCriteria object
-        new_ckpts = []
 
-        # don't let a user add more than 20 checkpoints
-        if len(checkpoints_df) > 20:
-            st.error(
-                "You can only add up to 20 checkpoints for the purpose of this demo."
-            )
-            checkpoints_df = checkpoints_df[:20]
+        new_criteria = []
+
+        if len(criteria_df) > 20:
+            st.error("You can only add up to 20 criteria for the purpose of this demo.")
+            criteria_df = criteria_df[:20]
 
-        for _, row in checkpoints_df.iterrows():
+        for _, row in criteria_df.iterrows():
            if row["criteria"] == "":
                continue
            try:
-                # Don't let people write essays for criteria in this demo
                if len(row["criteria"].split(" ")) > 100:
-                    raise ValueError("Criteria is too long")
-                new_crit = CheckpointCriteria(
-                    criteria=row["criteria"], points=row["points"]
-                )
-                new_ckpts.append(new_crit)
+                    msg = "Criteria is too long"
+                    raise ValueError(msg)
+                new_criteria.append({"criteria": row["criteria"]})
            except Exception as e:
-                st.error(f"Error creating checkpoint: {e}")
-        evaluation_case.checkpoints = new_ckpts
+                st.error(f"Error creating criterion: {e}")
 
    return UserInputs(
        model_id=model_id,
@@ -147,6 +134,7 @@ def get_user_inputs() -> UserInputs:
        max_driving_hours=max_driving_hours,
        date=date,
        framework=framework,
-        evaluation_case=evaluation_case,
+        evaluation_model=evaluation_model_id,
+        evaluation_criteria=new_criteria,
        run_evaluation=st.checkbox("Run Evaluation", value=True),
    )
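
Note: the criteria round-trip above (default criteria → editable DataFrame → list of dicts) can be sanity-checked in isolation. A small self-contained sketch, with a single made-up criterion standing in for `DEFAULT_EVALUATION_CRITERIA`:

```python
import pandas as pd

# Stand-in for constants.DEFAULT_EVALUATION_CRITERIA (same {"criteria": ...} shape).
default_criteria = [
    {"criteria": "Check if the agent considered at least three surf spot options"},
]

# In the app this DataFrame is passed through st.data_editor for editing.
criteria_df = pd.DataFrame(default_criteria)

# Convert the (possibly edited) rows back to the list-of-dicts shape stored on
# UserInputs, skipping empty rows exactly as get_user_inputs() does.
new_criteria = [
    {"criteria": row["criteria"]}
    for _, row in criteria_df.iterrows()
    if row["criteria"] != ""
]
assert new_criteria == default_criteria
```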
components/sidebar.py CHANGED
@@ -1,9 +1,9 @@
-from components.inputs import UserInputs, get_user_inputs
 import streamlit as st
 
+from components.inputs import UserInputs, get_user_inputs
+
 
 def ssf_sidebar() -> UserInputs:
     st.markdown("### Configuration")
     st.markdown("Built using [Any-Agent](https://github.com/mozilla-ai/any-agent)")
-    user_inputs = get_user_inputs()
-    return user_inputs
+    return get_user_inputs()
config.py ADDED
@@ -0,0 +1,43 @@
+import os
+import tempfile
+from datetime import datetime, timedelta
+from typing import Annotated
+
+import geocoder
+from pydantic import AfterValidator, BaseModel, ConfigDict, FutureDatetime, PositiveInt
+from rich.prompt import Prompt
+
+from any_agent import AgentFramework
+from any_agent.config import AgentConfig
+from any_agent.logging import logger
+
+INPUT_PROMPT_TEMPLATE = """
+According to the forecast, what will be the best spot to surf around {LOCATION},
+in a {MAX_DRIVING_HOURS} hour driving radius,
+at {DATE}?"
+""".strip()
+
+
+def validate_prompt(value) -> str:
+    for placeholder in ("{LOCATION}", "{MAX_DRIVING_HOURS}", "{DATE}"):
+        if placeholder not in value:
+            raise ValueError(f"prompt must contain {placeholder}")
+    return value
+
+
+class Config(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+    location: str
+    max_driving_hours: PositiveInt
+    date: FutureDatetime
+    input_prompt_template: Annotated[str, AfterValidator(validate_prompt)] = (
+        INPUT_PROMPT_TEMPLATE
+    )
+
+    framework: AgentFramework
+
+    main_agent: AgentConfig
+
+    evaluation_model: str | None = None
+    evaluation_criteria: list[dict[str, str]] | None = None
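
Note: a hedged sketch of building a `Config` the way `configure_agent()` in `services/agent.py` does. The location, model ids, and the `AgentFramework.OPENAI` member are assumed example values, not taken from the commit.

```python
from datetime import datetime, timedelta

from any_agent import AgentFramework
from any_agent.config import AgentConfig

from config import Config

config = Config(
    location="San Sebastian",                 # hypothetical user input
    max_driving_hours=2,
    date=datetime.now() + timedelta(days=1),  # must satisfy FutureDatetime
    framework=AgentFramework.OPENAI,          # assumed enum member
    main_agent=AgentConfig(model_id="openai/gpt-4.1-mini"),
    evaluation_model="openai/gpt-4.1-nano",
    evaluation_criteria=[
        {"criteria": "Check if the agent considered at least three surf spot options"},
    ],
)

# The AfterValidator on input_prompt_template rejects templates that drop a
# placeholder, e.g. omitting {MAX_DRIVING_HOURS} raises a ValueError.
query = config.input_prompt_template.format(
    LOCATION=config.location,
    MAX_DRIVING_HOURS=config.max_driving_hours,
    DATE=config.date,
)
```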
constants.py CHANGED
@@ -1,65 +1,47 @@
 import os
 
-from any_agent.evaluation import EvaluationCase
-from surf_spot_finder.tools import (
+from tools import (
     get_area_lat_lon,
     get_wave_forecast,
     get_wind_forecast,
 )
+
 from any_agent.logging import logger
-from any_agent.tools.web_browsing import search_web, visit_webpage, search_tavily
+from any_agent.tools.web_browsing import search_tavily, search_web, visit_webpage
 
 MODEL_OPTIONS = [
-    # "huggingface/novita/deepseek-ai/DeepSeek-V3",
-    # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
     "openai/gpt-4.1-nano",
     "openai/gpt-4.1-mini",
     "openai/gpt-4o",
     "gemini/gemini-2.0-flash-lite",
     "gemini/gemini-2.0-flash",
-    # "huggingface/Qwen/Qwen3-32B",  # right now throwing an internal error, but novita qwen isn't supporting tool calling
 ]
 
-# Novita was the only HF based provider that worked.
-
-# Hugginface API Provider Error:
-# Must alternate between assistant/user, which meant that the 'tool' role made it puke
-
-
-DEFAULT_EVALUATION_CASE = EvaluationCase(
-    llm_judge=MODEL_OPTIONS[0],
-    checkpoints=[
-        {
-            "criteria": "Check if the agent considered at least three surf spot options",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the agent used any web search tools to explore which surf spots should be considered",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the final answer includes one of the surf spots evaluated by tools",
-            "points": 1,
-        },
-        {
-            "criteria": "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
-            "points": 1,
-        },
-    ],
-)
-
+DEFAULT_EVALUATION_MODEL = MODEL_OPTIONS[0]
+
+DEFAULT_EVALUATION_CRITERIA = [
+    {
+        "criteria": "Check if the agent considered at least three surf spot options",
+    },
+    {
+        "criteria": "Check if the agent gathered wind forecasts for each surf spot being evaluated.",
+    },
+    {
+        "criteria": "Check if the agent gathered wave forecasts for each surf spot being evaluated.",
+    },
+    {
+        "criteria": "Check if the agent used any web search tools to explore which surf spots should be considered",
+    },
+    {
+        "criteria": "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
+    },
+    {
+        "criteria": "Check if the final answer includes one of the surf spots evaluated by tools",
+    },
+    {
+        "criteria": "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
+    },
+]
 
 DEFAULT_TOOLS = [
     get_wind_forecast,
requirements.txt CHANGED
@@ -1,5 +1,4 @@
-streamlit
-openai-agents>=0.0.14
-any-agent[all]==0.15.0
-surf-spot-finder @ git+https://github.com/mozilla-ai/surf-spot-finder
+any-agent[all]>=1.4.0
+geocoder
 nest_asyncio
+streamlit
services/agent.py CHANGED
@@ -1,63 +1,65 @@
 import json
+
+import streamlit as st
+from components.agent_status import export_logs
 from components.inputs import UserInputs
 from constants import DEFAULT_TOOLS
-from components.agent_status import export_logs
-import streamlit as st
-from surf_spot_finder.config import Config
-from any_agent import AgentConfig, AnyAgent, TracingConfig, AgentFramework
-from any_agent.tracing.trace import AgentTrace, AgentSpan
-from any_agent.tracing.otel_types import StatusCode
-from any_agent.evaluation import evaluate, TraceEvaluationResult
+from config import Config
 
+from any_agent import AgentConfig, AgentFramework, AnyAgent
+from any_agent.evaluation import LlmJudge
+from any_agent.evaluation.schemas import EvaluationOutput
+from any_agent.tracing.agent_trace import AgentTrace
+from any_agent.tracing.attributes import GenAI
+from any_agent.tracing.otel_types import StatusCode
+
 
-async def display_evaluation_results(result: TraceEvaluationResult):
-    if result.ground_truth_result is not None:
-        all_results = [*result.checkpoint_results, result.ground_truth_result]
-    else:
-        all_results = result.checkpoint_results
-
-    # Create columns for better layout
+async def display_evaluation_results(results: list[EvaluationOutput]):
     col1, col2 = st.columns(2)
 
     with col1:
         st.markdown("#### Criteria Results")
-        for checkpoint in all_results:
-            if checkpoint.passed:
-                st.success(f"✅ {checkpoint.criteria}")
+        for i, result in enumerate(results):
+            if result.passed:
+                st.success(f"✅ Criterion {i + 1}")
             else:
-                st.error(f"❌ {checkpoint.criteria}")
+                st.error(f"❌ Criterion {i + 1}")
+            st.write(f"**Reasoning:** {result.reasoning}")
 
     with col2:
         st.markdown("#### Overall Score")
-        total_points = sum([result.points for result in all_results])
-        if total_points == 0:
-            msg = "Total points is 0, cannot calculate score."
-            raise ValueError(msg)
-        passed_points = sum([result.points for result in all_results if result.passed])
-
-        # Create a nice score display
-        st.markdown(f"### {passed_points}/{total_points}")
-        percentage = (passed_points / total_points) * 100
+        total_criteria = len(results)
+        passed_criteria = sum(1 for result in results if result.passed)
+
+        st.markdown(f"### {passed_criteria}/{total_criteria}")
+        percentage = (
+            (passed_criteria / total_criteria) * 100 if total_criteria > 0 else 0
+        )
         st.progress(percentage / 100)
         st.markdown(f"**{percentage:.1f}%**")
 
 
 async def evaluate_agent(
     config: Config, agent_trace: AgentTrace
-) -> TraceEvaluationResult:
-    assert (
-        len(config.evaluation_cases) == 1
-    ), "Only one evaluation case is supported in the demo"
+) -> list[EvaluationOutput]:
     st.markdown("### 📊 Evaluation Results")
 
     with st.spinner("Evaluating results..."):
-        case = config.evaluation_cases[0]
-        result: TraceEvaluationResult = evaluate(
-            evaluation_case=case,
-            trace=agent_trace,
-            agent_framework=config.framework,
-        )
-        return result
+        results = []
+
+        judge = LlmJudge(model_id=config.evaluation_model, framework=config.framework)
+
+        for i, criterion in enumerate(config.evaluation_criteria):
+            context = f"Agent Trace:\n{agent_trace.model_dump_json(indent=2)}"
+
+            result = await judge.run_async(
+                context=context, question=criterion["criteria"]
+            )
+            results.append(result)
+
+            st.write(f"Evaluated criterion {i + 1}/{len(config.evaluation_criteria)}")
+
+        return results
 
 
 async def configure_agent(user_inputs: UserInputs) -> tuple[AnyAgent, Config]:
@@ -87,47 +89,52 @@ async def configure_agent(user_inputs: UserInputs) -> tuple[AnyAgent, Config]:
         date=user_inputs.date,
         framework=user_inputs.framework,
         main_agent=agent_config,
-        managed_agents=[],
-        evaluation_cases=[user_inputs.evaluation_case],
+        evaluation_model=user_inputs.evaluation_model,
+        evaluation_criteria=user_inputs.evaluation_criteria,
     )
 
     agent = await AnyAgent.create_async(
         agent_framework=config.framework,
         agent_config=config.main_agent,
-        managed_agents=config.managed_agents,
-        tracing=TracingConfig(console=True, cost_info=True),
     )
     return agent, config
 
 
 async def display_output(agent_trace: AgentTrace):
-    # Display the agent trace in a more organized way
     with st.expander("### 🧩 Agent Trace"):
         for span in agent_trace.spans:
-            # Header with name and status
             col1, col2 = st.columns([4, 1])
             with col1:
                 st.markdown(f"**{span.name}**")
                 if span.attributes:
-                    # st.json(span.attributes, expanded=False)
-                    if "input.value" in span.attributes:
+                    if GenAI.INPUT_MESSAGES in span.attributes:
                         try:
-                            input_value = json.loads(span.attributes["input.value"])
+                            input_value = json.loads(
+                                span.attributes[GenAI.INPUT_MESSAGES]
+                            )
                             if isinstance(input_value, list) and len(input_value) > 0:
                                 st.write(f"Input: {input_value[-1]}")
                             else:
                                 st.write(f"Input: {input_value}")
-                        except Exception:  # noqa: E722
-                            st.write(f"Input: {span.attributes['input.value']}")
-                    if "output.value" in span.attributes:
+                        except Exception:
+                            st.write(f"Input: {span.attributes[GenAI.INPUT_MESSAGES]}")
+
+                    if GenAI.TOOL_ARGS in span.attributes:
                         try:
-                            output_value = json.loads(span.attributes["output.value"])
+                            tool_args = json.loads(span.attributes[GenAI.TOOL_ARGS])
+                            st.write(f"Tool Args: {tool_args}")
+                        except Exception:
+                            st.write(f"Tool Args: {span.attributes[GenAI.TOOL_ARGS]}")
+
+                    if GenAI.OUTPUT in span.attributes:
+                        try:
+                            output_value = json.loads(span.attributes[GenAI.OUTPUT])
                             if isinstance(output_value, list) and len(output_value) > 0:
                                 st.write(f"Output: {output_value[-1]}")
                             else:
                                 st.write(f"Output: {output_value}")
-                        except Exception:  # noqa: E722
-                            st.write(f"Output: {span.attributes['output.value']}")
+                        except Exception:
+                            st.write(f"Output: {span.attributes[GenAI.OUTPUT]}")
             with col2:
                 status_color = (
                     "green" if span.status.status_code == StatusCode.OK else "red"
@@ -145,7 +152,7 @@ async def display_output(agent_trace: AgentTrace):
     with cost_col:
         st.info(f"💰 Estimated Cost: ${agent_trace.cost.total_cost:.6f}")
     with tokens_col:
-        st.info(f"📦 Total Tokens: {agent_trace.usage.total_tokens:,}")
+        st.info(f"📦 Total Tokens: {agent_trace.tokens.total_tokens:,}")
     st.markdown("#### Final Output")
     st.info(agent_trace.final_output)
 
@@ -179,49 +186,10 @@ async def run_agent(agent, config) -> AgentTrace:
 
     with st.status("Agent is running...", expanded=False, state="running") as status:
 
-        def update_span(span: AgentSpan):
-            # Process input value
-            input_value = span.attributes.get("input.value", "")
-            if input_value:
-                try:
-                    parsed_input = json.loads(input_value)
-                    if isinstance(parsed_input, list) and len(parsed_input) > 0:
-                        input_value = str(parsed_input[-1])
-                except Exception:
-                    pass
-
-            # Process output value
-            output_value = span.attributes.get("output.value", "")
-            if output_value:
-                try:
-                    parsed_output = json.loads(output_value)
-                    if isinstance(parsed_output, list) and len(parsed_output) > 0:
-                        output_value = str(parsed_output[-1])
-                except Exception:
-                    pass
-
-            # Truncate long values
-            max_length = 800
-            if len(input_value) > max_length:
-                input_value = f"[Truncated]...{input_value[-max_length:]}"
-            if len(output_value) > max_length:
-                output_value = f"[Truncated]...{output_value[-max_length:]}"
-
-            # Create a cleaner message format
-            if input_value or output_value:
-                message = f"Step: {span.name}\n"
-                if input_value:
-                    message += f"Input: {input_value}\n"
-                if output_value:
-                    message += f"Output: {output_value}"
-            else:
-                message = f"Step: {span.name}\n{span}"
-
+        def update_status(message: str):
             status.update(label=message, expanded=False, state="running")
 
-        export_logs(agent, update_span)
+        export_logs(agent, update_status)
         agent_trace: AgentTrace = await agent.run_async(query, **kwargs)
         status.update(label="Finished!", expanded=False, state="complete")
-
-        agent.exit()
     return agent_trace
tools/__init__.py ADDED
@@ -0,0 +1,9 @@
+from .openmeteo import get_wave_forecast, get_wind_forecast
+from .openstreetmap import driving_hours_to_meters, get_area_lat_lon
+
+__all__ = [
+    "driving_hours_to_meters",
+    "get_area_lat_lon",
+    "get_wave_forecast",
+    "get_wind_forecast",
+]
tools/openmeteo.py ADDED
@@ -0,0 +1,117 @@
+import json
+from datetime import datetime, timedelta
+
+import requests
+
+
+def _extract_hourly_data(data: dict) -> list[dict]:
+    hourly_data = data["hourly"]
+    result = [
+        {k: v for k, v in zip(hourly_data.keys(), values, strict=False)}
+        for values in zip(*hourly_data.values(), strict=False)
+    ]
+    return result
+
+
+def _filter_by_date(
+    date: datetime, hourly_data: list[dict], timedelta: timedelta = timedelta(hours=1)
+):
+    start_date = date - timedelta
+    end_date = date + timedelta
+    return [
+        item
+        for item in hourly_data
+        if start_date <= datetime.fromisoformat(item["time"]) <= end_date
+    ]
+
+
+def get_wave_forecast(lat: float, lon: float, date: str) -> list[dict]:
+    """Get wave forecast for given location.
+
+    Forecast will include:
+
+    - wave_direction (degrees)
+    - wave_height (meters)
+    - wave_period (seconds)
+    - sea_level_height_msl (meters)
+
+    Args:
+        lat: Latitude of the location.
+        lon: Longitude of the location.
+        date: Date to filter by in any valid ISO 8601 format.
+
+    Returns:
+        Hourly data for wave forecast.
+        Example output:
+
+        ```json
+        [
+            {'time': '2025-03-19T09:00', 'winddirection_10m': 140, 'windspeed_10m': 24.5}, {'time': '2025-03-19T10:00', 'winddirection_10m': 140, 'windspeed_10m': 27.1},
+            {'time': '2025-03-19T10:00', 'winddirection_10m': 140, 'windspeed_10m': 27.1}, {'time': '2025-03-19T11:00', 'winddirection_10m': 141, 'windspeed_10m': 29.2}
+        ]
+        ```
+
+    """
+    url = "https://marine-api.open-meteo.com/v1/marine"
+    params = {
+        "latitude": lat,
+        "longitude": lon,
+        "hourly": [
+            "wave_direction",
+            "wave_height",
+            "wave_period",
+            "sea_level_height_msl",
+        ],
+    }
+    response = requests.get(url, params=params)
+    response.raise_for_status()
+    data = json.loads(response.content.decode())
+    hourly_data = _extract_hourly_data(data)
+    if date is not None:
+        date = datetime.fromisoformat(date)
+        hourly_data = _filter_by_date(date, hourly_data)
+    if len(hourly_data) == 0:
+        raise ValueError("No data found for the given date")
+    return hourly_data
+
+
+def get_wind_forecast(lat: float, lon: float, date: str) -> list[dict]:
+    """Get wind forecast for given location.
+
+    Forecast will include:
+
+    - wind_direction (degrees)
+    - wind_speed (meters per second)
+
+    Args:
+        lat: Latitude of the location.
+        lon: Longitude of the location.
+        date: Date to filter by in any valid ISO 8601 format.
+
+    Returns:
+        Hourly data for wind forecast.
+        Example output:
+
+        ```json
+        [
+            {"time": "2025-03-18T22:00", "wind_direction": 196, "wind_speed": 9.6},
+            {"time": "2025-03-18T23:00", "wind_direction": 183, "wind_speed": 7.9},
+        ]
+        ```
+
+    """
+    url = "https://api.open-meteo.com/v1/forecast"
+    params = {
+        "latitude": lat,
+        "longitude": lon,
+        "hourly": ["winddirection_10m", "windspeed_10m"],
+    }
+    response = requests.get(url, params=params)
+    response.raise_for_status()
+    data = json.loads(response.content.decode())
+    hourly_data = _extract_hourly_data(data)
+    date = datetime.fromisoformat(date)
+    hourly_data = _filter_by_date(date, hourly_data)
+    if len(hourly_data) == 0:
+        raise ValueError("No data found for the given date")
+    return hourly_data
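
Note: a short usage sketch for the two forecast tools. Both hit the public Open-Meteo APIs; the spot name and the keys shown in the comments are illustrative.

```python
from datetime import datetime, timedelta

from tools.openmeteo import get_wave_forecast, get_wind_forecast
from tools.openstreetmap import get_area_lat_lon

# Nominatim returns lat/lon as strings, so cast before calling the forecasts.
lat, lon = get_area_lat_lon("Mundaka")  # hypothetical surf spot
tomorrow = (datetime.now() + timedelta(days=1)).isoformat()

waves = get_wave_forecast(lat=float(lat), lon=float(lon), date=tomorrow)
wind = get_wind_forecast(lat=float(lat), lon=float(lon), date=tomorrow)

# Each entry covers one hour within ±1 hour of the requested date.
print(waves[0])  # e.g. {"time": ..., "wave_direction": ..., "wave_height": ..., ...}
print(wind[0])   # e.g. {"time": ..., "winddirection_10m": ..., "windspeed_10m": ...}
```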
tools/openstreetmap.py ADDED
@@ -0,0 +1,62 @@
+import json
+
+import requests
+
+
+def get_area_lat_lon(area_name: str) -> tuple[float, float]:
+    """Get the latitude and longitude of an area from Nominatim.
+
+    Uses the [Nominatim API](https://nominatim.org/release-docs/develop/api/Search/).
+
+    Args:
+        area_name: The name of the area.
+
+    Returns:
+        The area found.
+
+    """
+    response = requests.get(
+        f"https://nominatim.openstreetmap.org/search?q={area_name}&format=jsonv2",
+        headers={"User-Agent": "Mozilla/5.0"},
+    )
+    response.raise_for_status()
+    area = json.loads(response.content.decode())
+    return area[0]["lat"], area[0]["lon"]
+
+
+def driving_hours_to_meters(driving_hours: int) -> int:
+    """Convert driving hours to meters assuming a 70 km/h average speed.
+
+    Args:
+        driving_hours: The driving hours.
+
+    Returns:
+        The distance in meters.
+
+    """
+    return driving_hours * 70 * 1000
+
+
+def get_lat_lon_center(bounds: dict) -> tuple[float, float]:
+    """Get the latitude and longitude of the center of a bounding box.
+
+    Args:
+        bounds: The bounding box.
+
+        ```json
+        {
+            "minlat": float,
+            "minlon": float,
+            "maxlat": float,
+            "maxlon": float,
+        }
+        ```
+
+    Returns:
+        The latitude and longitude of the center.
+
+    """
+    return (
+        (bounds["minlat"] + bounds["maxlat"]) / 2,
+        (bounds["minlon"] + bounds["maxlon"]) / 2,
+    )
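
Note: the two pure helpers can be checked without any network access; the bounding box below is made up.

```python
from tools.openstreetmap import driving_hours_to_meters, get_lat_lon_center

# 2 driving hours at the assumed 70 km/h average speed -> a 140 km radius.
assert driving_hours_to_meters(2) == 140_000

bounds = {"minlat": 43.0, "minlon": -2.0, "maxlat": 43.5, "maxlon": -1.5}
assert get_lat_lon_center(bounds) == (43.25, -1.75)
```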